导入需要的库和数据
import numpy as np
import pandas as pd
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import ticker
from torch import nn
from torch.utils.data import TensorDataset,DataLoader
# Load the Kaggle "home-data-for-ml-course" training and test tables.
# The paths must not contain spaces around the separators, otherwise the
# files cannot be found.
train_data = pd.read_csv("D:\\dataset\\home-data-for-ml-course\\train.csv")
test_data = pd.read_csv("D:\\dataset\\home-data-for-ml-course\\test.csv")
# Stack the feature columns of both tables so they can be preprocessed
# together: the first column of each table is skipped, and so is the last
# column of the training table (the label, SalePrice).
# NOTE: ignore_index is not passed, so the original row labels are kept and
# repeat across the two tables; later code therefore splits by position.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
数据预处理
# Preprocessing: rescale every non-object (numeric) column to zero mean and
# unit standard deviation, then replace the remaining missing values with 0
# (which is the column mean after standardisation).
numeric_index = all_features.dtypes[all_features.dtypes != "object"].index
all_features[numeric_index] = (
    all_features[numeric_index]
    .apply(lambda col: (col - col.mean()) / col.std())
    .fillna(0)
)
print(all_features.shape)
# One-hot encode the remaining (categorical) columns; dummy_na=True gives
# missing values an indicator column of their own.
all_features = pd.get_dummies(all_features, dummy_na=True)
# get_dummies may produce boolean columns; cast the whole table to float32
# so every entry is a 0/1 or standardised number.
all_features = all_features.astype("float32")
# Show a summary of the table. info() prints the summary itself and returns
# None, so the surrounding print() also emits a trailing "None".
print(all_features.info())
# The first n_train rows came from the training table, the rest from the
# test table; split them back apart by position.
n_train = len(train_data)
train_features = torch.tensor(
    all_features.iloc[:n_train].to_numpy(), dtype=torch.float32
)
test_features = torch.tensor(
    all_features.iloc[n_train:].to_numpy(), dtype=torch.float32
)
# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(
    train_data["SalePrice"].to_numpy().reshape(-1, 1), dtype=torch.float32
)
设置损失函数
# Mean squared error, averaged over all elements (the default reduction).
loss = nn.MSELoss(reduction="mean")
# Width of the feature matrix, i.e. the input size of the model.
in_features = train_features.size(1)
def get_net():
    """Build a new model: one linear layer from ``in_features`` to 1 output.

    Returns:
        An ``nn.Sequential`` wrapping a single freshly initialised
        ``nn.Linear`` layer.
    """
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)
# House prices should be judged by relative error, so the metric compares
# the logarithms of predictions and labels instead of the raw values.
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable that maps ``features`` to a tensor of predictions.
        features: Input tensor passed to ``net``.
        labels: Tensor of target values with the same shape as the
            predictions; values must be positive for the logarithm to be
            finite.

    Returns:
        The metric as a plain Python float.
    """
    # This is evaluation only: disable autograd so the pass over the whole
    # dataset does not build a computation graph.
    with torch.no_grad():
        # Clamp predictions to at least 1 so their logarithm is never
        # negative infinity or NaN.
        clipped_preds = torch.clamp(net(features), min=1.0)
        # MSE is computed directly so the metric stays a log-RMSE even if
        # the training loss is replaced by something else.
        mse = nn.functional.mse_loss(
            torch.log(clipped_preds), torch.log(labels)
        )
        return torch.sqrt(mse).item()
定义训练函数
def train(net, train_features, train_labels, test_features, test_labels,
          epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam on shuffled mini-batches and record log-RMSE per epoch.

    Args:
        net: Model to optimise in place.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs; only used when ``test_labels`` is
            not None.
        test_labels: Held-out targets, or None to skip held-out evaluation.
        epochs: Number of passes over the training data.
        learning_rate: Learning rate handed to Adam.
        weight_decay: Weight decay handed to Adam.
        batch_size: Mini-batch size for the data loader.

    Returns:
        A pair ``(train_ls, test_ls)`` of lists holding one log-RMSE value
        per epoch; ``test_ls`` stays empty when ``test_labels`` is None.
    """
    history = {"train": [], "test": []}
    loader = DataLoader(
        TensorDataset(train_features, train_labels), batch_size, shuffle=True
    )
    optimizer = torch.optim.Adam(
        net.parameters(), lr=learning_rate, weight_decay=weight_decay
    )
    evaluate_held_out = test_labels is not None
    for _ in range(epochs):
        # One optimisation step per mini-batch, using the module-level loss.
        for inputs, targets in loader:
            optimizer.zero_grad()
            batch_loss = loss(net(inputs), targets)
            batch_loss.backward()
            optimizer.step()
        # After each epoch, score the model on the full training set and,
        # if provided, on the held-out set.
        history["train"].append(log_rmse(net, train_features, train_labels))
        if evaluate_held_out:
            history["test"].append(log_rmse(net, test_features, test_labels))
    return history["train"], history["test"]
定义k折验证数据的选取
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into training and validation parts for fold ``i`` of ``k``.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows.
    The last fold also takes any leftover rows, so no sample is dropped when
    the row count is not a multiple of ``k``.

    Args:
        k: Number of folds; must be greater than 1.
        i: Zero-based index of the fold used for validation.
        X: Feature tensor, indexed by row along the first dimension.
        y: Label tensor with the same number of rows as ``X``.

    Returns:
        A tuple ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        ValueError: If ``k`` is not greater than 1 or ``i`` is outside
            ``range(k)``.
    """
    if k <= 1:
        raise ValueError(f"k must be greater than 1, got {k}")
    if not 0 <= i < k:
        raise ValueError(f"fold index i must be in range({k}), got {i}")
    n_rows = X.shape[0]
    fold_size = n_rows // k
    start = i * fold_size
    # The final fold runs to the end so remainder rows are not lost.
    stop = n_rows if i == k - 1 else start + fold_size
    X_valid, y_valid = X[start:stop], y[start:stop]
    # Everything before and after the validation fold is training data.
    X_train = torch.cat([X[:start], X[stop:]], 0)
    y_train = torch.cat([y[:start], y[stop:]], 0)
    return X_train, y_train, X_valid, y_valid
定义图表显示函数
def show_plot(num_epochs, train_ls, valid_ls):
    """Plot training and validation RMSE per epoch on a log-scaled y axis.

    Args:
        num_epochs: Number of epochs; both lists are expected to hold exactly
            this many values.
        train_ls: List of per-epoch training RMSE values.
        valid_ls: List of per-epoch validation RMSE values.
    """
    epochs = list(range(1, num_epochs + 1))
    # Long-format table: one row per (epoch, dataset) pair so seaborn can
    # colour the two curves by the 'Type' column.
    df = pd.DataFrame({
        'Epoch': epochs * 2,  # epoch numbers repeated for train and valid
        'RMSE': train_ls + valid_ls,  # both curves concatenated
        'Type': ['Train'] * num_epochs + ['Valid'] * num_epochs,  # origin
    })
    # Seaborn theme.
    sns.set_style("whitegrid")
    # Draw both curves as line plots with point markers.
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=df, x='Epoch', y='RMSE', hue='Type', marker='o')
    # Logarithmic y axis.
    plt.yscale('log')
    # Title, axis labels and legend.
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.title('Training and Validation RMSE')
    plt.legend(title='Dataset')
    y_axis = plt.gca().yaxis
    y_axis.set_major_locator(
        ticker.MaxNLocator(integer=False, prune='lower'))
    # Tick labels with two decimals; the format spec must not contain
    # stray spaces or the f-string is invalid.
    y_axis.set_major_formatter(
        ticker.FuncFormatter(lambda x, _: f'{x:.2f}'))
    # Display the figure.
    plt.show()
定义k折验证
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run k-fold cross-validation and return the mean final log-RMSE values.

    For each fold a fresh model from ``get_net()`` is trained on the other
    folds and evaluated on the held-out fold. The learning curves of the
    first fold are plotted, and the final metrics of every fold are printed.

    Args:
        k: Number of folds.
        X_train: Full training feature tensor.
        y_train: Full training label tensor.
        num_epochs: Epochs to train each fold's model.
        learning_rate: Learning rate passed to ``train``.
        weight_decay: Weight decay passed to ``train``.
        batch_size: Mini-batch size passed to ``train``.

    Returns:
        A pair ``(mean_train, mean_valid)``: the last-epoch training and
        validation log-RMSE averaged over the ``k`` folds.
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        # data is (X_train_fold, y_train_fold, X_valid_fold, y_valid_fold).
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        # Only the metric after the final epoch counts towards the average.
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            show_plot(num_epochs, train_ls, valid_ls)
        # Format specs must be exactly ':f' (no stray spaces) to be valid.
        print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k
开始训练
# Hyperparameters: number of folds, epochs, Adam learning rate, weight decay
# and mini-batch size.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
# Cross-validate on the training data and report the averaged metrics.
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
# Format specs must be exactly ':f' (no stray spaces) to be valid.
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')
训练结果
( 2919, 79 )
< class 'pandas.core.frame.DataFrame' >
Index: 2919 entries, 0 to 1458
Columns: 330 entries, MSSubClass to SaleCondition_nan
dtypes: float32 ( 330 )
memory usage: 3.7 MB
None
折1,训练log rmse0.170216, 验证log rmse0.157181
折2,训练log rmse0.162235, 验证log rmse0.188669
折3,训练log rmse0.164142, 验证log rmse0.168466
折4,训练log rmse0.168107, 验证log rmse0.154406
折5,训练log rmse0.162575, 验证log rmse0.182377
5-折验证: 平均训练log rmse: 0.165455, 平均验证log rmse: 0.170220
提交测试结果
# Train a fresh model on the whole training set; no held-out data is passed,
# so the second returned list is ignored.
net = get_net()
train_ls, _ = train(
    net, train_features, train_labels, None, None,
    num_epochs, lr, weight_decay, batch_size,
)
# Predict on the test features and detach from autograd before converting
# to a NumPy array.
preds = net(test_features).detach().numpy()
# Flatten the (n, 1) predictions into one column of the test table.
test_data['SalePrice'] = pd.Series(preds.ravel())
# Keep only the Id and predicted SalePrice columns and write them to
# submission.csv without the row index.
submission = test_data[['Id', 'SalePrice']]
submission.to_csv('submission.csv', index=False)