一、SVD的实际价值
1.计算效率提升
特征从n维降至k维(k<n)
减少模型参数数量
加快训练和预测速度
2.模型泛化能力
去除噪声和冗余信息
可能提高模型在测试集上的表现
减少过拟合风险
二、实际书写思路及其代码
针对心脏病数据集我们进行了相应的SVD分解并比较了其在准确率上的差异。
# 先运行之前预处理好的代码 import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import warnings from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, classification_report, confusion_matrix warnings.filterwarnings("ignore") # 设置中文字体 plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False df = pd.read_csv(r'D:\Python60DaysChallenge-main\heart.csv') X = df.iloc[:, :-1] y = df.iloc[:, -1] # 切分数据 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) print(f"训练集形状: {X_train.shape}") print(f"测试集形状: {X_test.shape}") print(f"原始特征数: {X_train.shape[1]}\n") # ===================== 1. 原始数据(不标准化) ===================== print("=== 1. 原始数据模型(不标准化)===") model_original = LogisticRegression(random_state=42, max_iter=1000) model_original.fit(X_train, y_train) y_pred_original = model_original.predict(X_test) accuracy_original = accuracy_score(y_test, y_pred_original) print(f"准确率: {accuracy_original:.4f}") # 原始数据分类报告 print("分类报告:") print(classification_report(y_test, y_pred_original)) print() # ===================== 2. SVD降维 + 标准化 ===================== print("\n=== 2. 
SVD降维 + 标准化模型 ===") # 标准化数据 from sklearn.preprocessing import StandardScaler scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # 对标准化后的训练集进行SVD分解 U_train, sigma_train, Vt_train = np.linalg.svd(X_train_scaled, full_matrices=False) print(f"奇异值数量: {len(sigma_train)}") # 通过累计方差贡献率选择k(只使用训练集信息) cumulative_variance_ratio = np.cumsum(sigma_train**2) / np.sum(sigma_train**2) k = np.argmax(cumulative_variance_ratio >= 0.95) + 1 print(f"选择k={k},累计方差贡献率: {cumulative_variance_ratio[k-1]:.4f}") print(f"特征从 {X_train.shape[1]} 维降至 {k} 维") # 绘制累计方差贡献率图 plt.figure(figsize=(10, 6)) plt.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, 'b-', linewidth=2) plt.axhline(y=0.95, color='r', linestyle='--', label='95% 方差线') plt.axvline(x=k, color='g', linestyle='--', label=f'k={k}') plt.xlabel('奇异值数量') plt.ylabel('累计方差贡献率') plt.title('累计方差贡献率曲线') plt.legend() plt.grid(True) plt.show() # 降维 - 正确使用Vt_k Vt_k = Vt_train[:k, :] # 形状: (k, n_features) print(f"Vt_k形状: {Vt_k.shape}") # 对训练集和测试集进行降维 X_train_reduced = X_train_scaled @ Vt_k.T # 等价于: np.dot(X_train_scaled, Vt_k.T) X_test_reduced = X_test_scaled @ Vt_k.T print(f"降维后训练集形状: {X_train_reduced.shape}") print(f"降维后测试集形状: {X_test_reduced.shape}") # 验证降维的维度 print(f"验证: 训练集从 {X_train_scaled.shape} 降维到 {X_train_reduced.shape}") print(f"验证: 测试集从 {X_test_scaled.shape} 降维到 {X_test_reduced.shape}") # 训练SVD降维后的模型 model_svd = LogisticRegression(random_state=42, max_iter=1000) model_svd.fit(X_train_reduced, y_train) y_pred_svd = model_svd.predict(X_test_reduced) accuracy_svd = accuracy_score(y_test, y_pred_svd) print(f"准确率: {accuracy_svd:.4f}") # SVD降维分类报告 print("分类报告:") print(classification_report(y_test, y_pred_svd)) print() # ===================== 性能对比 ===================== print("\n" + "="*50) print("模型性能对比总结:") print("="*50) print(f"1. 原始数据模型准确率: {accuracy_original:.4f}") print(f"2. 
SVD降维模型准确率: {accuracy_svd:.4f}") print(f"准确率变化: {accuracy_svd - accuracy_original:+.4f}") if accuracy_svd > accuracy_original: print(f"SVD降维提高了准确率 {(accuracy_svd - accuracy_original)*100:.2f}%") elif accuracy_svd < accuracy_original: print(f"SVD降维降低了准确率 {(accuracy_original - accuracy_svd)*100:.2f}%") else: print("SVD降维对准确率无影响") # ===================== 混淆矩阵可视化 ===================== fig, axes = plt.subplots(1, 2, figsize=(14, 5)) # 原始数据混淆矩阵 cm_original = confusion_matrix(y_test, y_pred_original) sns.heatmap(cm_original, annot=True, fmt='d', cmap='Blues', ax=axes[0]) axes[0].set_title(f'原始数据模型\n准确率: {accuracy_original:.4f}') axes[0].set_xlabel('预测标签') axes[0].set_ylabel('真实标签') # SVD降维混淆矩阵 cm_svd = confusion_matrix(y_test, y_pred_svd) sns.heatmap(cm_svd, annot=True, fmt='d', cmap='Blues', ax=axes[1]) axes[1].set_title(f'SVD降维模型\n准确率: {accuracy_svd:.4f}') axes[1].set_xlabel('预测标签') axes[1].set_ylabel('真实标签') plt.tight_layout() plt.show() # ===================== 计算近似误差 ===================== print("\n" + "="*50) print("SVD降维质量评估:") print("="*50) # 重建训练集 Sigma_k = np.diag(sigma_train[:k]) X_train_reconstructed = U_train[:, :k] @ Sigma_k @ Vt_k # 计算重构误差 frobenius_error = np.linalg.norm(X_train_scaled - X_train_reconstructed, 'fro') frobenius_original = np.linalg.norm(X_train_scaled, 'fro') relative_error = frobenius_error / frobenius_original print(f"重构误差 (Frobenius范数): {frobenius_error:.6f}") print(f"相对重构误差: {relative_error:.6f}") print(f"信息保留率: {(1 - relative_error)*100:.2f}%") # ===================== 不同k值性能分析 ===================== print("\n" + "="*50) print("不同k值下的模型性能:") print("="*50) accuracies_by_k = [] k_range = range(1, min(15, len(sigma_train)) + 1) for k_test in k_range: Vt_k_test = Vt_train[:k_test, :] X_train_reduced_test = X_train_scaled @ Vt_k_test.T X_test_reduced_test = X_test_scaled @ Vt_k_test.T model_test = LogisticRegression(random_state=42, max_iter=1000) model_test.fit(X_train_reduced_test, y_train) y_pred_test = 
model_test.predict(X_test_reduced_test) accuracy_test = accuracy_score(y_test, y_pred_test) accuracies_by_k.append(accuracy_test) variance_ratio = cumulative_variance_ratio[k_test-1] if k_test == k: print(f"k={k_test:2d} (自动选择) | 准确率: {accuracy_test:.4f} | 方差贡献率: {variance_ratio:.4f}") else: print(f"k={k_test:2d} | 准确率: {accuracy_test:.4f} | 方差贡献率: {variance_ratio:.4f}") # 找到最佳k值 best_idx = np.argmax(accuracies_by_k) best_k = k_range[best_idx] best_accuracy = accuracies_by_k[best_idx] print(f"\n最佳k值: k={best_k}, 准确率: {best_accuracy:.4f}") print(f"与自动选择的k={k}相比: 准确率{best_accuracy-accuracies_by_k[k-1]:+.4f}") # 绘制不同k值下的准确率 plt.figure(figsize=(10, 6)) plt.plot(k_range, accuracies_by_k, 'o-', linewidth=2, markersize=8, color='blue') plt.axvline(x=k, color='r', linestyle='--', label=f'自动选择 k={k}', linewidth=2) plt.axvline(x=best_k, color='green', linestyle='--', label=f'最佳 k={best_k}', linewidth=2) plt.axhline(y=accuracy_original, color='orange', linestyle='--', label=f'原始数据准确率: {accuracy_original:.4f}', linewidth=2) plt.xlabel('k值(保留的奇异值数量)', fontsize=12) plt.ylabel('测试集准确率', fontsize=12) plt.title('不同k值下的SVD模型准确率', fontsize=14) plt.legend(loc='best') plt.grid(True, alpha=0.3) plt.tight_layout() plt.show()运行结果:
注意:
1.SVD不是简单地"去除"某些原始特征
SVD创建了新的主成分(线性组合)
每个主成分是所有原始特征的加权组合
去除的是主成分,不是原始特征
2.k=4时的情况
保留了4个主成分
这4个主成分解释了数据的大部分变异
丢弃了9个主成分(对应较小的奇异值)
@浙大疏锦行