Algorithm principles: Bagging and Boosting (AdaBoost), with formula derivation

Algorithm steps:
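The loop below follows the standard discrete AdaBoost formulation; in the code, epsilon_m, alpha, and the update of w implement these formulas directly (the code additionally multiplies alpha_m by a learning_rate factor, 1.0 by default):

\[ w_n^{(1)} = \frac{1}{N}, \qquad n = 1, \dots, N \]
\[ \epsilon_m = \frac{\sum_{n=1}^{N} w_n^{(m)} \, I\big(y_m(x_n) \neq t_n\big)}{\sum_{n=1}^{N} w_n^{(m)}} \]
\[ \alpha_m = \ln \frac{1 - \epsilon_m}{\epsilon_m} \]
\[ w_n^{(m+1)} = w_n^{(m)} \exp\big\{ \alpha_m \, I\big(y_m(x_n) \neq t_n\big) \big\} \]
\[ Y(x) = \operatorname{sign}\Big( \sum_{m=1}^{M} \alpha_m \, y_m(x) \Big), \qquad t_n \in \{-1, +1\} \]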

Python 3 implementation of the Boosting (AdaBoost) algorithm, following the derivation above

import numpy as np
import seaborn as sns

sns.set_style('white')  # plot styling only; not used by the algorithm below

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier


def AdaBoost(X: np.ndarray, t: np.ndarray, M=10, learning_rate=1.0) -> tuple:
    # t is the target vector (the y in most other notations)
    (estimator_list, t_predict_list, estimator_error_list,
     estimator_weight_list, sample_weight_list) = [], [], [], [], []

    # number of samples
    N = len(t)
    # initialize the sample weights uniformly: w_n^(1) = 1/N
    w = np.ones(N) / N
    sample_weight_list.append(w.copy())

    for m in range(M):
        # fit a base classifier y_m (a decision stump) on the weighted samples
        estimator = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
        estimator.fit(X, t, sample_weight=w)
        t_predict = estimator.predict(X)

        # indicator of misclassified samples: I(y_m(x_n) != t_n)
        M_m = (t_predict != t)

        # weighted error of the base classifier: epsilon_m
        epsilon_m = np.average(M_m, weights=w, axis=0)

        # weight of the base classifier: alpha_m = ln((1 - epsilon_m) / epsilon_m)
        alpha = learning_rate * np.log((1 - epsilon_m) / epsilon_m)

        # update the sample weights w: misclassified samples are scaled up by exp(alpha);
        # the mask (w > 0) | (alpha < 0) mirrors sklearn's SAMME update and keeps
        # zero-weight samples from being revived when alpha >= 0. No explicit
        # w /= w.sum() is needed, since np.average above renormalizes internally.
        w *= np.exp(alpha * M_m * ((w > 0) | (alpha < 0)))

        # keep the values from this iteration
        estimator_list.append(estimator)
        t_predict_list.append(t_predict.copy())
        estimator_error_list.append(epsilon_m.copy())
        estimator_weight_list.append(alpha.copy())
        sample_weight_list.append(w.copy())

    # convert the lists to np.array for convenience
    estimator_list = np.array(estimator_list)
    t_predict_list = np.array(t_predict_list)
    estimator_error_list = np.array(estimator_error_list)
    estimator_weight_list = np.array(estimator_weight_list)
    sample_weight_list = np.array(sample_weight_list)

    # prediction: Y(x) = sign(sum_m alpha_m * y_m(x)); note this assumes labels in {-1, +1}
    pre = []
    for n in range(N):
        alpha_ym = t_predict_list[:, n] * estimator_weight_list
        sign = np.sign(alpha_ym.sum())
        pre.append(sign)
    print('Accuracy=', (pre == t).sum() / N)
    return estimator_list, estimator_weight_list, sample_weight_list


# prepare the data
from sklearn.datasets import make_classification

X, t = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)
ada = AdaBoost(X, t)  # Accuracy= 0.496
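The low accuracy of 0.496 is expected here: make_classification returns labels in {0, 1}, while the final vote sign(sum_m alpha_m * y_m(x)) assumes labels in {-1, +1}. A minimal fix, assuming the caller remaps the labels first (t_signed is a name introduced here for illustration):

t_signed = 2 * t - 1           # remap {0, 1} -> {-1, +1}
ada = AdaBoost(X, t_signed)    # accuracy should now roughly match the sklearn run below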

# compare with sklearn (on scikit-learn >= 1.2, pass estimator= instead of base_estimator=)
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2),
                         algorithm='SAMME', n_estimators=10, learning_rate=1.0)
clf.fit(X, t)
clf.score(X, t)  # 0.947
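To see how the sklearn ensemble improves round by round, AdaBoostClassifier exposes staged scoring; a minimal sketch (the loop variable names are my own):

# training accuracy after each of the 10 boosting rounds
for m, score in enumerate(clf.staged_score(X, t), start=1):
    print(f'round {m}: train accuracy = {score:.3f}')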