# How to Use sklearn for Data Mining
## Introduction
In today's data-driven era, data mining has become a key technique for extracting valuable information from massive datasets. scikit-learn (sklearn for short), one of the most popular machine-learning libraries in the Python ecosystem, provides an efficient and easy-to-use toolkit for data-mining tasks. This article walks through a typical data-mining workflow with sklearn, covering the core stages of data preprocessing, feature engineering, and model training and evaluation.
---
## 1. Environment Setup and Data Loading
### 1.1 Installing sklearn
```bash
pip install scikit-learn pandas numpy matplotlib
```

### 1.2 Loading Data
sklearn supports several data input formats:

```python
from sklearn import datasets

# Load a built-in dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Load from a CSV file (with pandas)
import pandas as pd
df = pd.read_csv('data.csv')
X = df.drop('target', axis=1)
y = df['target']
```
## 2. Data Preprocessing
### 2.1 Handling Missing Values
Fill missing entries with a summary statistic, here the column mean:

```python
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
```
### 2.2 Feature Scaling
Standardize features to zero mean and unit variance:

```python
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
```
### 2.3 Encoding Categorical Features
One-hot encode categorical columns (`X_categorical` stands for the categorical subset of your data):

```python
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X_categorical)  # returns a sparse matrix by default
```
## 3. Feature Engineering
### 3.1 Feature Selection
Keep the k features most strongly related to the target, scored here with the ANOVA F-statistic:

```python
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
```
### 3.2 Dimensionality Reduction
```python
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)  # keep enough components to retain 95% of the variance
X_pca = pca.fit_transform(X)
```
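As a quick check (a minimal sketch using the `pca` object fitted above), the fitted attributes show how many components survived the 95% threshold and how much variance each one carries:

```python
# Number of components actually kept
print(pca.n_components_)
# Fraction of total variance explained by each retained component
print(pca.explained_variance_ratio_)
```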
### 3.3 Polynomial Features
PolynomialFeatures builds new features from products of the originals; with `interaction_only=True`, only interaction terms are generated (no squares or higher powers):

```python
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)
```
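To see exactly which columns were generated, the fitted transformer can list their names (assuming scikit-learn >= 1.0, where `get_feature_names_out` is available):

```python
# e.g. ['1', 'x0', 'x1', ..., 'x0 x1', ...] for degree-2 interaction terms
print(poly.get_feature_names_out())
```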
## 4. Model Training
### 4.1 Train/Test Split
Hold out part of the data for evaluation:

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
```
### 4.2 Classification
```python
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
```
### 4.3 Regression
```python
from sklearn.svm import SVR

# SVR expects a continuous target: train it on a regression dataset,
# not on the class labels from the split above
reg = SVR(kernel='rbf')
reg.fit(X_train, y_train)
```
### 4.4 Clustering
Clustering is unsupervised, so no labels are needed:

```python
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
```
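After fitting, the cluster assignments and centroids can be read straight off the estimator (a minimal sketch using the `kmeans` object above):

```python
# Cluster index assigned to each of the first ten samples
print(kmeans.labels_[:10])
# Coordinates of the three cluster centers
print(kmeans.cluster_centers_)
```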
## 5. Model Evaluation
### 5.1 Common Metrics
```python
# Classification metrics
from sklearn.metrics import accuracy_score, f1_score

y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Regression metrics (score the regressor's predictions, not the classifier's)
from sklearn.metrics import mean_squared_error

y_pred_reg = reg.predict(X_test)
print(f"MSE: {mean_squared_error(y_test, y_pred_reg):.2f}")
```
### 5.2 Cross-Validation
Cross-validation gives a more stable estimate than a single split:

```python
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.2f} (±{scores.std():.2f})")
```
## 6. Hyperparameter Tuning
GridSearchCV evaluates every parameter combination with cross-validation and keeps the best one:

```python
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200]}
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
```
## 7. Model Persistence
```python
import joblib

# Save the trained model
joblib.dump(clf, 'model.pkl')
# Load it back later
clf_loaded = joblib.load('model.pkl')
```
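A quick way to confirm the round trip worked (a minimal check reusing the objects above):

```python
import numpy as np

# The reloaded model should reproduce the original predictions exactly
assert np.array_equal(clf_loaded.predict(X_test), clf.predict(X_test))
```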
## 8. Building Pipelines
A Pipeline chains preprocessing and modeling into a single estimator, which keeps every step inside the cross-validation loop and prevents test-set leakage:

```python
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])
pipeline.fit(X_train, y_train)
```
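Because the pipeline is itself an estimator, it can be tuned end to end; step parameters are addressed as `<step name>__<parameter>` (a minimal sketch reusing the pipeline above):

```python
from sklearn.model_selection import GridSearchCV

# Tune both a preprocessing parameter and a model parameter together
param_grid = {
    'imputer__strategy': ['mean', 'median'],
    'classifier__n_estimators': [50, 100, 200],
}
search = GridSearchCV(pipeline, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)
```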
## 9. A Worked Example: Customer Churn
Start with exploratory visualization:

```python
import seaborn as sns

sns.pairplot(df, hue='churn')
```

Then combine column-wise preprocessing and a classifier into one model (`numerical_features` and `categorical_features` are the column lists of the churn dataset):

```python
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Build the preprocessing step
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)])

# Assemble the full model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])

# Train and evaluate
model.fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))
```
## 10. Practical Tips
- Class imbalance: use the `class_weight` parameter or oversample with SMOTE
- Custom business metrics: wrap them with `make_scorer`
- Speed: set `n_jobs=-1` to use all CPU cores
- Data larger than memory: switch from the batch algorithms to estimators with a `partial_fit` method, such as `SGDClassifier`
- Interpretability: tree-based models expose a `feature_importances_` attribute

The sketch below illustrates several of these options.
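This is a minimal sketch reusing the train/test split from earlier; `profit_score` is a hypothetical business metric invented for illustration, and SMOTE (from the separate imbalanced-learn package) is omitted:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer

# Class imbalance: reweight classes inversely to their frequency,
# and use all CPU cores for training
clf_balanced = RandomForestClassifier(class_weight='balanced', n_jobs=-1)
clf_balanced.fit(X_train, y_train)

# Interpretability: per-feature importance from the fitted forest
print(clf_balanced.feature_importances_)

# Custom metric: wrap any scoring function for use with cross_val_score
def profit_score(y_true, y_pred):
    # hypothetical business rule: reward hits, penalize misses at half weight
    return (y_true == y_pred).sum() - 0.5 * (y_true != y_pred).sum()

scorer = make_scorer(profit_score, greater_is_better=True)

# Out-of-core learning: feed data in chunks via partial_fit
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes=np.unique(y_train))
```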
## 11. Summary
With its consistent API design and rich set of algorithms, sklearn significantly lowers the technical barrier to data mining. Having mastered the core workflow covered in this article, readers can:
- Quickly build end-to-end data-mining pipelines
- Handle the full range of problems on structured data
- Meet complex requirements by composing modular components

Suggested further exploration:
- The official documentation
- joblib and related utilities (the old `sklearn.externals` module has been removed from recent releases)
- Integration with other libraries (such as XGBoost)

Note: the code examples in this article need their parameters adjusted to your actual data; for full projects, interactive development in a Jupyter Notebook is recommended.