這篇文章主要講解了“python怎么實現AdaBoost算法”,文中的講解內容簡單清晰,易于學習與理解,下面請大家跟著小編的思路慢慢深入,一起來研究和學習“python怎么實現AdaBoost算法”吧!
import math
from math import exp
from math import log

import numpy as np
import pandas as pd


def create_data():
    """Build a two-feature binary training set from the Iris data.

    Only the first 100 rows and the first two feature columns (sepal
    length, sepal width) are kept; every label equal to 0 is rewritten
    to -1.

    Returns:
        tuple: ``(X, y)`` with ``X`` of shape ``(100, 2)`` and ``y`` the
        matching label vector.
    """
    # scikit-learn is only needed for the demo data, so it is imported
    # lazily; the AdaBoost class itself depends on NumPy alone.
    from sklearn.datasets import load_iris

    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = [
        'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
    ]
    data = np.array(df.iloc[:100, [0, 1, -1]])
    data[data[:, -1] == 0, -1] = -1
    return data[:, :2], data[:, -1]


class AdaBoost:
    """AdaBoost built from single-feature threshold classifiers (stumps).

    Args:
        n_estimators: Maximum number of boosting rounds.
        learning_rate: Step between candidate thresholds when scanning a
            feature (it is a grid step, not a shrinkage factor).

    Raises:
        ValueError: If ``learning_rate`` is not strictly positive.
    """

    # Keeps the weighted error away from 0 and 1 so alpha stays finite.
    _EPS = 1e-10

    def __init__(self, n_estimators=50, learning_rate=1.0):
        if learning_rate <= 0:
            raise ValueError("learning_rate must be positive")
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Store the training data and reset all boosting state.

        Args:
            datasets: 2-D array-like of shape ``(M, N)``.
            labels: Array-like of ``M`` labels; the update formulas
                multiply by the label, so values are expected to be
                -1 / +1.

        Raises:
            ValueError: If ``datasets`` is not 2-D or has no rows.
        """
        self.X = np.asarray(datasets, dtype=float)
        self.Y = np.asarray(labels)
        if self.X.ndim != 2 or self.X.shape[0] == 0:
            raise ValueError("datasets must be a non-empty 2-D array")
        self.M, self.N = self.X.shape

        # Fitted weak classifiers as (axis, threshold, direction) tuples.
        self.clf_sets = []

        # Sample weights start out uniform.
        self.weights = np.full(self.M, 1.0 / self.M)

        # Coefficient alpha of each fitted weak classifier.
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold classifier for one feature column.

        Thresholds ``min + learning_rate * i`` (``i >= 1``) are scanned;
        for each one both orientations are scored by weighted error.

        Args:
            features: 1-D array-like with one feature value per sample.
            labels: Labels aligned with ``features``.
            weights: Sample weights aligned with ``features``.

        Returns:
            tuple: ``(best_v, direct, error, compare_array)``. When no
            threshold can be evaluated the result is
            ``(0.0, None, inf, None)``.
        """
        features = np.asarray(features, dtype=float)
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=float)

        error = float("inf")
        best_v = 0.0
        direct, compare_array = None, None
        if features.size == 0:
            return best_v, direct, error, compare_array

        features_min = features.min()
        features_max = features.max()
        n_step = (features_max - features_min +
                  self.learning_rate) // self.learning_rate

        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i
            # Thresholds equal to a sample value are evaluated too:
            # ``x > v`` is well defined there, and skipping them could
            # leave no candidate at all.
            positive = np.where(features > v, 1, -1)
            negative = -positive
            positive_error = weights[positive != labels].sum()
            negative_error = weights[negative != labels].sum()

            if positive_error < negative_error:
                weight_error = positive_error
                candidate = positive
                candidate_direct = 'positive'
            else:
                weight_error = negative_error
                candidate = negative
                candidate_direct = 'nagetive'

            # Direction is recorded together with the threshold so the
            # returned pair always describes the same classifier.
            if weight_error < error:
                error = weight_error
                compare_array = candidate
                best_v = v
                direct = candidate_direct
        return best_v, direct, error, compare_array

    def _alpha(self, error):
        """Return the coefficient ``0.5 * ln((1 - e) / e)``.

        The error is clipped to ``[_EPS, 1 - _EPS]`` so that a perfect
        classifier yields a large finite coefficient instead of a
        division by zero.
        """
        error = min(max(error, self._EPS), 1.0 - self._EPS)
        return 0.5 * np.log((1 - error) / error)

    def _Z(self, weights, a, clf):
        """Return the normalisation factor for the weight update."""
        weights = np.asarray(weights, dtype=float)
        return np.sum(weights * np.exp(-1 * a * self.Y * np.asarray(clf)))

    def _w(self, a, clf, Z):
        """Re-weight the samples after a round and store the result."""
        self.weights = self.weights * np.exp(
            -1 * a * self.Y * np.asarray(clf)) / Z

    def _f(self, alpha, clf_sets):
        """Placeholder for the linear combination of G(x); unused."""
        pass

    def G(self, x, v, direct):
        """Classify scalar ``x`` with threshold ``v`` and orientation.

        ``'positive'`` maps ``x > v`` to 1; any other value of ``direct``
        maps ``x > v`` to -1.
        """
        if direct == 'positive':
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Fit up to ``n_estimators`` threshold classifiers.

        Boosting stops early when no feature offers a usable threshold,
        or right after a classifier with zero weighted error is stored.

        Args:
            X: 2-D array-like of shape ``(M, N)``.
            y: Array-like of ``M`` labels (-1 / +1).

        Returns:
            AdaBoost: ``self``.
        """
        self.init_args(X, y)

        for _ in range(self.clf_num):
            axis = 0
            final_direct = 'null'
            best_clf_error, best_v, clf_result = float("inf"), None, None

            # Pick the feature whose best stump has the lowest error.
            for j in range(self.N):
                v, direct, error, compare_array = self._G(
                    self.X[:, j], self.Y, self.weights)
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j  # index of the chosen feature column
                if best_clf_error == 0:
                    break

            # No feature produced a candidate threshold (e.g. constant
            # columns): nothing to add, and later rounds cannot differ.
            if clf_result is None:
                break

            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            self.clf_sets.append((axis, best_v, final_direct))

            # A perfect stump is kept; further rounds would repeat it.
            if best_clf_error == 0:
                break

            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)
        return self

    def predict(self, feature):
        """Return the sign (1 or -1) of the weighted vote for one sample.

        A non-positive vote, including the empty ensemble, yields -1.
        """
        result = 0.0
        for a, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets):
            result += a * self.G(feature[axis], clf_v, direct)
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly."""
        right_count = 0
        for feature, label in zip(X_test, y_test):
            if self.predict(feature) == label:
                right_count += 1
        return right_count / len(X_test)


def main():
    """Train on a random 80/20 split of the Iris subset and print the score."""
    from sklearn.model_selection import train_test_split

    X, y = create_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = AdaBoost(n_estimators=3, learning_rate=0.5)
    clf.fit(X_train, y_train)
    print("評分:{}".format(clf.score(X_test, y_test)))


if __name__ == "__main__":
    main()
結果(train_test_split 每次都會隨機劃分訓練集和測試集,所以得分并不固定):有時1.0
有時0.75
有時0.6
有時0.4
注意,這個程序計算規范化因子的時候可能報錯:TypeError: 'NoneType' object is not subscriptable。
原因是:在某些數據劃分下,_G 找不到任何可用的閾值 v(候選閾值全部被跳過),返回的分類結果為 None;計算規范化因子時,參數 clf 就是 None,這時候再用 clf[i] 取值肯定是不行的,也就報了這個錯誤。作為對比,下面改用 sklearn 自帶的 AdaBoostClassifier:
import math
from math import exp
from math import log

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split


def create_data():
    """Build a two-feature binary training set from the Iris data.

    Only the first 100 rows and the first two feature columns (sepal
    length, sepal width) are kept; every label equal to 0 is rewritten
    to -1.

    Returns:
        tuple: ``(X, y)`` with ``X`` of shape ``(100, 2)`` and ``y`` the
        matching label vector.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = [
        'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
    ]
    data = np.array(df.iloc[:100, [0, 1, -1]])
    # Relabel class 0 as -1 in one vectorised assignment.
    data[data[:, -1] == 0, -1] = -1
    return data[:, :2], data[:, -1]


def main():
    """Train scikit-learn's AdaBoostClassifier on a random split and print the score."""
    X, y = create_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
    clf.fit(X_train, y_train)
    print("評分:{}".format(clf.score(X_test, y_test)))


if __name__ == "__main__":
    main()
感謝各位的閱讀,以上就是“python怎么實現AdaBoost算法”的內容了,經過本文的學習后,相信大家對python怎么實現AdaBoost算法這一問題有了更深刻的體會,具體使用情況還需要大家實踐驗證。這里是億速云,小編將為大家推送更多相關知識點的文章,歡迎關注!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。