# 6.集成学习

1147-柳同学

## 热门标签

, , , ,

### 二、Voting—投票

#### 1.硬投票分类器

# Hard-voting ensemble: each base estimator casts one class-label vote
# and the majority class wins.
from sklearn.datasets import make_moons
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the dataset.
# n_samples is the number of points, noise the Gaussian noise level.
X, y = make_moons(n_samples=7000, noise=0.1)

# Split into 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Three base classifiers.
lr = LogisticRegression()
dt = DecisionTreeClassifier()
# Hard voting only needs predicted class labels, so probability output
# (probability=True) is not required here.
svc = SVC()

# Hard voting requires each base estimator to output a class label.
vote = VotingClassifier(
    estimators=[('lr', lr), ('dt', dt), ('svc', svc)],
    voting='hard'  # soft or hard
)

# Fit each base classifier and the ensemble, then report test accuracy.
for clf in (lr, dt, svc, vote):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, '=', accuracy_score(y_test, y_pred))

LogisticRegression = 0.8761904761904762
DecisionTreeClassifier = 0.9957142857142857
SVC = 0.9976190476190476
VotingClassifier = 0.9966666666666667


#### 2.软投票分类器

# Soft-voting ensemble: class probabilities from each base estimator are
# averaged and the class with the highest mean probability wins.
from sklearn.datasets import make_moons
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the dataset.
# n_samples is the number of points, noise the Gaussian noise level.
X, y = make_moons(n_samples=7000, noise=0.1)

# Split into 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Three base classifiers.
lr = LogisticRegression()
dt = DecisionTreeClassifier()
# probability=True makes SVC expose predict_proba, which soft voting needs.
svc = SVC(probability=True)

# Soft voting requires every base estimator to output class probabilities.
vote = VotingClassifier(
    estimators=[('lr', lr), ('dt', dt), ('svc', svc)],
    voting='soft'  # soft or hard
)

# Fit each base classifier and the ensemble, then report test accuracy.
for clf in (lr, dt, svc, vote):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, '=', accuracy_score(y_test, y_pred))

LogisticRegression = 0.8761904761904762
DecisionTreeClassifier = 0.9980952380952381
SVC = 0.9995238095238095
VotingClassifier = 1.0


### 三、Bagging与随机森林

#### 2.随机森林

# Bagging with two different base estimators, compared to a RandomForest.
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the data.
# (The original post omitted this call, leaving iris_data undefined.)
iris_data = load_iris()
X = iris_data.data
y = iris_data.target

# max_samples: an int means the number of samples drawn per estimator;
# a float means max_samples * X.shape[0].
# oob_score: score each estimator on the samples it never saw (out-of-bag).
bag_svc = BaggingClassifier(
    SVC(),
    n_estimators=500,
    bootstrap=True,
    max_samples=1.0,
    oob_score=True
)

bag_svc.fit(X, y)
# Predict on the training data.
y_hat_svc = bag_svc.predict(X)
print(bag_svc.__class__.__name__, '=', accuracy_score(y, y_hat_svc))
print('oob_score = ', bag_svc.oob_score_)

# With decision trees as base estimators, bagging is essentially a random forest.
bag_dt = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    max_samples=1.0,
    oob_score=True
)

bag_dt.fit(X, y)
# Predict on the training data.
y_hat_dt = bag_dt.predict(X)
print(bag_dt.__class__.__name__, '=', accuracy_score(y, y_hat_dt))
print('oob_score =', bag_dt.oob_score_)

# The dedicated random-forest implementation.
rbf = RandomForestClassifier(n_estimators=500)
rbf.fit(X, y)
y_hat_rbf = rbf.predict(X)
print(rbf.__class__.__name__, '=', accuracy_score(y, y_hat_rbf))

BaggingClassifier = 0.9666666666666667
oob_score =  0.9666666666666667
BaggingClassifier = 1.0
oob_score = 0.9533333333333334
RandomForestClassifier = 1.0


#### 5.异常检测

#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@time: 2021/01/19
@desc: Boosting with a decision-tree weak learner.
"""
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Load the data.
# NOTE(review): the lines defining X and y were lost when this post was
# extracted — restore the original data-loading code before running.

# When feature magnitudes differ, standardize. Tree-based learners do not
# need it, but models such as logistic regression or SVM definitely do.
X = StandardScaler().fit_transform(X)

# Split into 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Weak learner.
clf = DecisionTreeClassifier()

# NOTE(review): the constructor line was truncated in the original post;
# reconstructed from the surviving keyword arguments — confirm estimator type.
model = AdaBoostClassifier(clf,
                           n_estimators=50,
                           learning_rate=0.5)

model.fit(X_train, y_train)
y_test_hat = model.predict(X_test)
print('score = ', accuracy_score(y_test, y_test_hat))

score =  0.9444444444444444


### 五、Boosting—GBDT算法

#### 1.提升树

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor


def load_data():
    """Load the regression data; the last column is the target.

    NOTE(review): the function header and the line defining ``data`` were
    lost when this post was extracted; a whitespace-delimited text file
    read with np.loadtxt is assumed — confirm the actual source/path.
    """
    data = np.loadtxt('data.txt')  # TODO(review): confirm data source
    X = data[:, :-1]
    y = data[:, -1].reshape(-1, 1)
    return X, y


if __name__ == '__main__':
    X, y = load_data()
    plt.scatter(X, y)
    plt.show()

    # Hand-rolled GBDT: each later tree fits the residual left over by the
    # trees before it.
    tree1 = DecisionTreeRegressor(max_depth=5)
    tree1.fit(X, y)
    y_hat_1 = tree1.predict(X).reshape(-1, 1)
    # Residual of the first tree.
    y_reg1 = y - y_hat_1

    # The second tree fits the first tree's residual.
    tree2 = DecisionTreeRegressor(max_depth=5)
    tree2.fit(X, y_reg1)
    y_hat_2 = tree2.predict(X).reshape(-1, 1)
    # Residual of the second tree.
    y_reg2 = y_reg1 - y_hat_2

    # The third tree fits the second tree's residual.
    tree3 = DecisionTreeRegressor(max_depth=5)
    tree3.fit(X, y_reg2)
    y_hat_3 = tree3.predict(X).reshape(-1, 1)

    # Test on the first 5 rows: the ensemble prediction is the sum of the
    # three trees' outputs.
    X_test = X[:5, :]
    y_1 = tree1.predict(X_test)
    y_2 = tree2.predict(X_test)
    y_3 = tree3.predict(X_test)

    y_pred = sum(tree.predict(X_test) for tree in (tree1, tree2, tree3))
    print(y_pred)

    # Gradient boosting via sklearn. With learning_rate=1.0 and
    # n_estimators=3 this reproduces the manual three-tree sum (the two
    # printed outputs in the original post are identical).
    # NOTE(review): the gbrt definition was lost in extraction;
    # reconstructed from the matching outputs — confirm parameters.
    gbrt = GradientBoostingRegressor(max_depth=5, n_estimators=3, learning_rate=1.0)
    gbrt.fit(X, y)
    print(gbrt.predict(X_test))

[17.61560196  9.15380196 12.831       4.57199973  6.68971688]
[17.61560196  9.15380196 12.831       4.57199973  6.68971688]


### 六、Boosting—XGBoost

#### 1.XGBoost公式推导

#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@file: xgb.py
@time: 2021/01/19
@desc: Train a small XGBoost booster on agaricus libsvm data and draw a tree.
"""
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# XGBoost reads libsvm files natively. The format is optimized for sparse
# features: each line is a sample starting with the 0/1 label, followed by
# "index:value" pairs; any index not listed is implicitly 0.

# Load the train/test files.
dtrain = xgb.DMatrix('DATA/agaricus.txt.train')
dtest = xgb.DMatrix('DATA/agaricus.txt.test')

# Booster parameters:
#   eta       - shrinkage / learning rate per boosting round
#               (typical values: 0.01-0.2)
#   gamma     - minimum loss reduction required to split a node; a larger
#               value makes the algorithm more conservative (less prone to
#               overfitting, possibly at a cost in accuracy)
#   objective - reg:linear (linear regression), reg:logistic (logistic
#               regression), binary:logistic (binary classification, returns
#               probabilities), binary:logitraw (raw pre-sigmoid scores),
#               multi:softmax (multi-class, needs num_class),
#               multi:softprob (like softmax but returns per-class
#               probabilities), rank:pairwise (learning to rank)
#   max_depth - maximum tree depth
#   silent    - 0 (silent), 1 (warning), 2 (info), 3 (debug)
params = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic'}
rounds = 6
# Datasets to evaluate (with their printed labels) after each round.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

# Train the booster.
booster = xgb.train(params, dtrain, num_boost_round=rounds, evals=watchlist)

# Predicted probabilities, thresholded at 0.5 into hard 0/1 labels.
prob = booster.predict(dtest)
hard_pred = prob.copy()
hard_pred[hard_pred >= 0.5] = 1
hard_pred[hard_pred < 0.5] = 0

# Compare against the true labels.
labels = dtest.get_label()
print('准确率 =', accuracy_score(labels, hard_pred))

# Option 1: render a tree with to_graphviz.
digraph = xgb.to_graphviz(booster, num_trees=1)
digraph.format = 'png'
digraph.view('./xgb_view')

# Option 2: render a tree with plot_tree.
fig = plt.figure(figsize=(10, 10))
ax = fig.subplots()
xgb.plot_tree(booster, num_trees=1, ax=ax)
plt.show()



import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Silence warnings.
warnings.filterwarnings(action='ignore')

# Read the libsvm-format data into the familiar (sparse) matrix form.
# NOTE(review): the loading lines were lost in the original post; the
# printed shape (6513, 126) matches the agaricus training file, so
# load_svmlight_file on those files is assumed — confirm the paths.
X_train, y_train = load_svmlight_file('DATA/agaricus.txt.train')
X_test, y_test = load_svmlight_file('DATA/agaricus.txt.test')
print(X_train.toarray().shape)  # (6513, 126)

# Define the model, set its parameters, then fit.
model = xgb.XGBClassifier(max_depth=3, learning_rate=0.3, n_estimators=6, silent=True, objective='binary:logistic')
model.fit(X_train, y_train)

# Accuracy on the training set.
train_preds = model.predict(X_train)
print('Train Accuary:%.2f%%' % (accuracy_score(y_train, train_preds) * 100))  # Train Accuary:99.88%
# Accuracy on the test set.
test_preds = model.predict(X_test)
print('Test Accuary:%.2f%%' % (accuracy_score(y_test, test_preds) * 100))  # Test Accuary:100.00%

# Search for the best hyper-parameters with GridSearchCV.
model = xgb.XGBClassifier(learning_rate=0.1, objective='binary:logistic')
param_grid = {
    'n_estimators': range(1, 10, 1),
    'max_depth': range(1, 5, 1)
}
clf = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=5)
clf.fit(X_train, y_train)

print(clf.best_params_)  # {'max_depth': 2, 'n_estimators': 30}
print(clf.best_score_)  # 0.9841860859908541

# Early stopping: hold out a validation set and stop boosting when the
# validation error stops improving.
X_train_part, X_validate, y_train_part, y_validate = train_test_split(X_train, y_train, train_size=0.7, random_state=0)

# Number of boosting rounds.
num_round = 100
bst = xgb.XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round, silent=True,
                        objective="binary:logistic")
# Validation set.
eval_set = [(X_validate, y_validate)]
# early_stopping_rounds=10: stop if the metric does not improve for 10 rounds.
# eval_metric='error': track the classification error rate.
bst.fit(X_train_part, y_train_part, early_stopping_rounds=10, eval_metric='error', eval_set=eval_set, verbose=True)

results = bst.evals_result()
print(results)

# Plot the per-round validation error for inspection.
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
plt.plot(x_axis, results['validation_0']['error'], label='test')
plt.ylabel('Error')
plt.xlabel('Round')
plt.title('XGBoost Early Stop')
plt.show()

# Learning curves: evaluate on both the training part and the validation set.
num_round = 100
bst = xgb.XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round, silent=True,
                        objective="binary:logistic")
# Evaluation sets.
eval_set = [(X_train_part, y_train_part), (X_validate, y_validate)]
# eval_metric=['error','logloss']: track two metrics, error rate and log loss.
bst.fit(X_train_part, y_train_part, eval_set=eval_set, eval_metric=['error', 'logloss'], verbose=True)
results = bst.evals_result()
print(results)

# Plot the curves.
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
# ax.legend() adds the legend to the figure.
ax.legend()
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss')
plt.show()

fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
plt.ylabel('Classification Error')
plt.title('XGBoost Classification Error')
plt.show()



Vieu3.3主题

Q Q 登 录