导入需要的库和数据

import numpy as np  
import pandas as pd  
import torch  
import seaborn as sns  
import matplotlib.pyplot as plt  
from matplotlib import ticker  
from torch import nn  
from torch.utils.data import TensorDataset,DataLoader  
 
# Load the Kaggle training and test tables from their local CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "D:\\dataset\\home-data-for-ml-course\\train.csv",
        "D:\\dataset\\home-data-for-ml-course\\test.csv",
    )
)

pd.concat拼接数据集

# Stack the training and test feature columns into one frame so both get the
# same preprocessing. The first column is skipped in both tables, and the last
# training column (the label) is excluded as well.
# ignore_index=True renumbers the rows 0..n-1; without it the two source
# indexes are kept and the combined frame has duplicate row labels.
all_features = pd.concat(
    (train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]),
    ignore_index=True,
)

数据预处理

# Preprocessing: rescale every non-string column to zero mean and unit
# standard deviation.
def _standardize(column):
    """Return *column* shifted by its mean and divided by its standard deviation."""
    return (column - column.mean()) / (column.std())


numeric_index = all_features.dtypes[all_features.dtypes != "object"].index
# After standardization the column mean is 0, so filling missing values with 0
# is equivalent to filling them with the mean.
all_features[numeric_index] = (
    all_features[numeric_index].apply(_standardize).fillna(0)
)

print(all_features.shape)
 
 
# Replace the categorical columns with one-hot indicator columns.
# dummy_na=True treats a missing value as a category of its own.
all_features = pd.get_dummies(all_features, dummy_na=True)
# get_dummies produces True/False indicators; cast the whole frame to float32
# so the indicators become 1.0/0.0.
all_features = all_features.astype(np.float32)

# Show a summary of the frame. DataFrame.info() prints the summary itself and
# returns None, so it is called directly rather than wrapped in print(), which
# would emit an extra "None" line.
all_features.info()
  
# The first n_train rows of the combined frame are the training rows; the
# remainder are the test rows. Split them again and convert to tensors.
n_train = len(train_data)
train_features = torch.tensor(
    all_features.iloc[:n_train].to_numpy(), dtype=torch.float32
)
test_features = torch.tensor(
    all_features.iloc[n_train:].to_numpy(), dtype=torch.float32
)
# Labels are shaped as a single column so they match the network output.
train_labels = torch.tensor(
    train_data["SalePrice"].to_numpy().reshape(-1, 1), dtype=torch.float32
)

设置损失函数

# Mean-squared-error criterion.
loss = nn.MSELoss()
# Number of input units: one per column of the preprocessed feature matrix.
in_features = train_features.size(1)


def get_net():
    """Build a fresh linear regression model.

    Returns:
        An ``nn.Sequential`` holding a single ``nn.Linear`` layer that maps
        ``in_features`` inputs to one output.
    """
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)
  
def log_rmse(net,features,labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    House prices are better judged by relative error, (y - y_hat) / y, than by
    absolute error; comparing logarithms measures that.

    Args:
        net: Callable model; ``net(features)`` must return predictions with
            the same shape as ``labels``.
        features: Input tensor passed to ``net``.
        labels: Target tensor; values must be positive for the log to be
            finite.

    Returns:
        The log-RMSE as a Python float.
    """
    # This is evaluation only, so skip autograd bookkeeping for the forward
    # pass instead of building a graph that is immediately discarded.
    with torch.no_grad():
        # Clamp predictions to at least 1 so the logarithm is defined and
        # non-negative.
        clipped_preds = torch.clamp(net(features), min=1.0)
        mean_squared_log_error = nn.functional.mse_loss(
            torch.log(clipped_preds), torch.log(labels)
        )
    return torch.sqrt(mean_squared_log_error).item()

定义训练函数

def train(net, train_features, train_labels, test_features, test_labels, epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam on mini-batches and record log-RMSE after each epoch.

    Args:
        net: Model to optimize in place.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs; only used when ``test_labels`` is
            not None.
        test_labels: Held-out targets, or None to skip held-out evaluation.
        epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: Mini-batch size for the shuffled data loader.

    Returns:
        A ``(train_ls, test_ls)`` pair of lists with one log-RMSE value per
        epoch. ``test_ls`` is empty when ``test_labels`` is None.
    """
    dataset = TensorDataset(train_features, train_labels)
    train_iter = DataLoader(dataset, batch_size, shuffle=True)
    optimizer = torch.optim.Adam(
        net.parameters(), lr=learning_rate, weight_decay=weight_decay
    )
    has_held_out = test_labels is not None

    train_ls = []
    test_ls = []
    for _ in range(epochs):
        for batch_X, batch_y in train_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(batch_X), batch_y)
            batch_loss.backward()
            optimizer.step()

        # Score the model on the full sets once the epoch is finished.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if has_held_out:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

定义k折验证数据的选取

def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into the training and validation parts of fold ``i``.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows.
    The last fold also takes the leftover ``X.shape[0] % k`` rows, so every
    sample lands in exactly one fold instead of being silently dropped.

    Args:
        k: Number of folds; must be greater than 1.
        i: Zero-based index of the fold used for validation.
        X: 2-D feature tensor, one row per sample.
        y: Label tensor with the same first dimension as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``, where the training tensors
        are the concatenation of every fold except fold ``i``.

    Raises:
        ValueError: If ``i`` is not in ``range(k)``.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError(f"fold index i must be in range({k}), got {i}")

    n_samples = X.shape[0]
    fold_size = n_samples // k
    X_valid, y_valid = None, None
    X_parts, y_parts = [], []
    for j in range(k):
        # The final fold runs to the end so the remainder rows are kept.
        stop = n_samples if j == k - 1 else (j + 1) * fold_size
        idx = slice(j * fold_size, stop)
        if j == i:
            X_valid, y_valid = X[idx, :], y[idx]
        else:
            X_parts.append(X[idx, :])
            y_parts.append(y[idx])
    # Concatenate once at the end rather than growing a tensor inside the loop.
    return torch.cat(X_parts, 0), torch.cat(y_parts, 0), X_valid, y_valid

定义图表显示函数

def show_plot(num_epochs,train_ls,valid_ls):
    """Draw the training and validation RMSE curves on a log-scaled y axis.

    Args:
        num_epochs: Number of epochs; the x axis runs from 1 to this value.
        train_ls: List of per-epoch training RMSE values.
        valid_ls: List of per-epoch validation RMSE values.
    """
    epoch_axis = list(range(1, num_epochs + 1))
    # Long-format table: the epoch axis appears twice, once per curve, and
    # the 'Type' column records which curve each row belongs to.
    source_tags = ['Train'] * num_epochs + ['Valid'] * num_epochs
    df = pd.DataFrame({
        'Epoch': epoch_axis + epoch_axis,
        'RMSE': train_ls + valid_ls,
        'Type': source_tags,
    })

    # Seaborn theme.
    sns.set_style("whitegrid")

    # Line chart with one line per 'Type' value.
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=df, x='Epoch', y='RMSE', hue='Type', marker='o')
    axes = plt.gca()

    # Logarithmic y axis.
    axes.set_yscale('log')

    # Labels, title and legend.
    axes.set_xlabel('Epoch')
    axes.set_ylabel('RMSE')
    axes.set_title('Training and Validation RMSE')
    axes.legend(title='Dataset')

    # Replace the default y ticks and print them with two decimals.
    y_axis = plt.gca().yaxis
    y_axis.set_major_locator(ticker.MaxNLocator(integer=False, prune='lower'))
    y_axis = plt.gca().yaxis
    y_axis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x:.2f}'))

    # Show the figure.
    plt.show()

定义k折验证

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,batch_size):
    """Run k-fold cross-validation and return the mean final-epoch log-RMSE.

    A new network is trained for each fold. The learning curves of the first
    fold are plotted, and each fold's final scores are printed.

    Args:
        k: Number of folds.
        X_train: Full training feature tensor.
        y_train: Full training label tensor.
        num_epochs: Epochs to train per fold.
        learning_rate: Learning rate passed to ``train``.
        weight_decay: Weight decay passed to ``train``.
        batch_size: Mini-batch size passed to ``train``.

    Returns:
        ``(mean_train, mean_valid)``: the final-epoch training and validation
        log-RMSE averaged over the ``k`` folds.
    """
    final_train_scores = []
    final_valid_scores = []
    for i in range(k):
        fold_tensors = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(
            net, *fold_tensors, num_epochs, learning_rate, weight_decay, batch_size
        )
        last_train, last_valid = train_ls[-1], valid_ls[-1]
        final_train_scores.append(last_train)
        final_valid_scores.append(last_valid)
        # Only the first fold's curves are plotted.
        if i == 0:
            show_plot(num_epochs, train_ls, valid_ls)
        print(f'折{i + 1},训练log rmse{float(last_train):f}, '
              f'验证log rmse{float(last_valid):f}')
    return sum(final_train_scores) / k, sum(final_valid_scores) / k

开始训练

# Hyperparameters for the cross-validation run.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64

train_l, valid_l = k_fold(
    k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size
)
# Report the scores averaged over all folds.
print(
    f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
    f'平均验证log rmse: {float(valid_l):f}'
)
 

训练结果

(2919, 79)
<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Columns: 330 entries, MSSubClass to SaleCondition_nan
dtypes: float32(330)
memory usage: 3.7 MB
None
折1,训练log rmse0.170216, 验证log rmse0.157181
折2,训练log rmse0.162235, 验证log rmse0.188669
折3,训练log rmse0.164142, 验证log rmse0.168466
折4,训练log rmse0.168107, 验证log rmse0.154406
折5,训练log rmse0.162575, 验证log rmse0.182377
5-折验证: 平均训练log rmse: 0.165455, 平均验证log rmse: 0.170220

out

提交测试结果

  
# Train a fresh model on the full training set. No held-out data is passed,
# so the second returned list is ignored.
net = get_net()
train_ls, _ = train(
    net, train_features, train_labels, None, None,
    num_epochs, lr, weight_decay, batch_size,
)
# Predict the test set; detach from autograd before converting to NumPy.
preds = net(test_features).detach().numpy()
# Flatten the predictions to one value per test row.
test_data['SalePrice'] = pd.Series(preds.reshape(-1))
# Write the Id / SalePrice pairs to the submission file.
submission = test_data[['Id', 'SalePrice']]
submission.to_csv('submission.csv', index=False)