# Baseline for the Kaggle "House Prices" competition: load the CSVs,
# median-impute missing numeric values, fit a random forest on the numeric
# columns, print a hold-out RMSE, and write submission.csv.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 1. Load the data.
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# 2. Minimal preprocessing: only fill missing values.
# The medians are computed once on the training set and reused for the test
# set so both frames are imputed with the same statistics. Labels in
# train_medians that are not test columns (e.g. SalePrice) are ignored.
train_medians = train_data.median(numeric_only=True)
train_data.fillna(train_medians, inplace=True)
test_data.fillna(train_medians, inplace=True)

# 3. Select the numeric features.
# NOTE(review): if 'Id' is stored as an integer column it ends up in the
# feature set as well; consider dropping it - confirm against the data.
numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
X = train_data[numeric_cols].drop('SalePrice', axis=1)
y = train_data['SalePrice']

# 4. Split into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train the model.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 6. Evaluate on the validation set.
# RMSE is taken as the square root of the MSE instead of passing
# squared=False, which newer scikit-learn releases no longer accept.
val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds) ** 0.5
print(f'验证集 RMSE: {rmse}')

# 7. Predict on the test set.
# Index with the exact feature columns used for training: numeric_cols still
# contains 'SalePrice', which is not expected in the test frame, so selecting
# with it would raise KeyError.
test_X = test_data[X.columns]
test_preds = model.predict(test_X)

# 8. Save the submission file.
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_preds})
submission.to_csv('submission.csv', index=False)

# Reference sources:
- kaggle房价预测-回归模型
- 基于python的kaggle练习(二)——员工离职预测
- 利用python机器学习库进行Kaggle皮马印第安人糖尿病预测分析
- 【python】kaggle项目之纽约出租车行程时间预测
- python实现房价预测(一)