# 6.5 XGBoost实战

### 1. 判断蘑菇是否有毒—二分类

# /usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np

# 自定义损失函数的梯度和二阶导
# 定义f: theta * x
def log_reg(y_hat, y):
    """Custom binary-logistic objective for xgb.train(obj=...).

    Given raw margin scores ``y_hat`` and a DMatrix ``y``, return the
    first-order gradient and the second-order gradient (Hessian diagonal)
    of the log loss.
    """
    prob = 1.0 / (1.0 + np.exp(-y_hat))  # sigmoid of the raw score
    labels = y.get_label()
    grad = prob - labels
    hess = prob * (1.0 - prob)
    return grad, hess

def error_rate(y_hat, y):
    """Custom eval metric: fraction of 0.5-thresholded predictions that
    disagree with the true labels. Returns the ('error', value) pair
    expected by xgb.train(feval=...)."""
    predictions = y_hat > 0.5
    mistakes = (y.get_label() != predictions).sum()
    return 'error', float(mistakes) / len(y_hat)

if __name__ == "__main__":
    # Load the mushroom data sets; DMatrix parses the libsvm text format directly.
    data_train = xgb.DMatrix('agaricus_train.txt')
    data_test = xgb.DMatrix('agaricus_test.txt')
    print(data_train)
    print(type(data_train))

    # Alternatively, read the data yourself and wrap it:
    # dtrain = xgb.DMatrix(X_train, y_train)
    # dtest = xgb.DMatrix(X_test, y_test)

    # Parameters.
    # binary:logistic -> default eval metric is logloss
    # binary:logitraw -> default eval metric is AUC
    param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
    # param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'reg:logistic'}
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    n_round = 7
    bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)
    # bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg,
    #                 feval=error_rate, early_stopping_rounds=30, verbose_eval=True)

    # Error rate on the test set.
    y_hat = bst.predict(data_test)
    y = data_test.get_label()
    print(y_hat)
    print(y)
    error = sum(y != (y_hat > 0.5))
    # BUG FIX: the variable was named `error_rate`, shadowing the feval
    # function defined above; renamed to avoid the clash.
    err_rate = float(error) / len(y_hat)
    print(err_rate)
    # BUG FIX: '/t' was a wrong-slash escape; use a real tab.
    print('样本总数：\t', len(y_hat))
    print('错误数目：\t%4d' % error)
    print('错误率：\t%.5f%%' % (100 * err_rate))


[0]	eval-logloss:0.16396	train-logloss:0.16118
[1]	eval-logloss:0.06293	train-logloss:0.06473
[2]	eval-logloss:0.02504	train-logloss:0.02502
[3]	eval-logloss:0.01229	train-logloss:0.01297
[4]	eval-logloss:0.00718	train-logloss:0.00706
[5]	eval-logloss:0.00507	train-logloss:0.00488
[6]	eval-logloss:0.00277	train-logloss:0.00276
[0.0013066  0.9952118  0.0013066  ... 0.99917597 0.00155224 0.998858  ]
[0. 1. 0. ... 1. 0. 1.]
0.0



### 2.判断蘑菇是否有毒—手动读取数据

# /usr/bin/python
# -*- coding:utf-8 -*-

import xgboost as xgb
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 标签、行、列、值
# NOTE(review): the original extraction lost this function's header; it is
# reconstructed from the body (a `path` it iterates over and the trailing
# `return x, y`) -- confirm the original name/signature.
def read_data(path):
    """Parse a libsvm-format text file into a dense feature matrix and labels.

    Each line looks like:  <label> <col>:<value> <col>:<value> ...
    Returns (x, y): x a dense 2-D float ndarray, y a 1-D int ndarray.
    """
    y = []       # labels
    row = []     # row index of each non-zero entry
    col = []     # column index of each non-zero entry
    values = []  # the non-zero values themselves
    r = 0        # current row (first line is row 0)
    # BUG FIX: the file handle was never closed; use a context manager.
    with open(path) as f:
        for d in f:
            d = d.strip().split()  # split on whitespace
            y.append(int(d[0]))    # first token is the label
            d = d[1:]
            for c in d:
                key, value = c.split(':')
                row.append(r)
                col.append(int(key))
                values.append(float(value))
            r += 1

    # Build a sparse matrix with `values` placed at (row, col), then densify.
    x = scipy.sparse.csr_matrix((values, (row, col))).toarray()
    y = np.array(y)
    return x, y

if __name__ == '__main__':
    # NOTE(review): `x` and `y` are undefined here -- the line that loads them
    # (presumably x, y = read_data(<mushroom data path>)) was lost in
    # extraction; restore it before running.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.7)

    # Baseline: L2-regularized logistic regression on the same split.
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    print('Logistic回归正确率：', accuracy_score(y_test, y_hat))

    # XGBoost on the same split.
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # NOTE(review): the mushroom labels are binary (0/1); 'multi:softmax' with
    # 'num_class': 3 still runs but 'binary:logistic' would be the natural
    # choice -- confirm intent.
    param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
    y_hat = bst.predict(data_test)
    print('XGBoost正确率：', accuracy_score(y_test, y_hat))


[0]	eval-mlogloss:0.22262	train-mlogloss:0.23514
[1]	eval-mlogloss:0.07970	train-mlogloss:0.08438
[2]	eval-mlogloss:0.03086	train-mlogloss:0.03198
[3]	eval-mlogloss:0.01211	train-mlogloss:0.01313
XGBoost正确率： 1.0


### 3.鸢尾花分类

# /usr/bin/python
# -*- encoding:utf-8 -*-

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    path = 'iris.data'  # data file path
    # BUG FIX: `data` was used without ever being loaded; read the CSV
    # (the iris file has no header row, 4 feature columns + 1 class column).
    data = pd.read_csv(path, header=None)
    x, y = data.iloc[:, :4], data.iloc[:, 4]
    # Map the class names to integer codes.
    # y = pd.Categorical(y).codes
    y = y.map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}).astype(int)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)

    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 2, 'eta': 0.3, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}

    bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
    y_hat = bst.predict(data_test)
    result = y_test.values.reshape(1, -1) == y_hat
    # BUG FIX: '/t' and '/n' were wrong-slash escapes.
    print('正确率:\t', float(np.sum(result)) / len(y_hat))
    print('END.....\n')


[0]	eval-mlogloss:0.75514	train-mlogloss:0.75286
[1]	eval-mlogloss:0.55165	train-mlogloss:0.54831
[2]	eval-mlogloss:0.41535	train-mlogloss:0.41431
[3]	eval-mlogloss:0.32765	train-mlogloss:0.32384
[4]	eval-mlogloss:0.26220	train-mlogloss:0.25719
[5]	eval-mlogloss:0.21648	train-mlogloss:0.20874

END.....


### 4.葡萄酒分类

# !/usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split   # cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

if __name__ == "__main__":

    # BUG FIX: `data` was used without ever being loaded. The wine data set is
    # a comma-separated file whose first column is the class label (1/2/3),
    # which matches the np.split((1,), axis=1) below.
    # NOTE(review): the file name 'wine.data' is assumed -- confirm.
    data = np.loadtxt('wine.data', delimiter=',')
    # First column -> labels, remaining columns -> features.
    y, x = np.split(data, (1,), axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.3)

    # Logistic regression baseline.
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    print('Logistic回归正确率：', accuracy_score(y_test, y_hat))

    # XGBoost
    # Remap label 3 to 0 because softmax classes must start at 0.
    y_train[y_train == 3] = 0
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    params = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(params, data_train, num_boost_round=20, evals=watch_list)
    y_hat = bst.predict(data_test)
    print('XGBoost正确率：', accuracy_score(y_test, y_hat))


Logistic回归正确率： 0.9629629629629629

[0]	eval-mlogloss:0.31720	train-mlogloss:0.25776
[1]	eval-mlogloss:0.16601	train-mlogloss:0.11004
[2]	eval-mlogloss:0.10526	train-mlogloss:0.05128
[3]	eval-mlogloss:0.08201	train-mlogloss:0.02994
[4]	eval-mlogloss:0.07387	train-mlogloss:0.02009
[5]	eval-mlogloss:0.06578	train-mlogloss:0.01496
[6]	eval-mlogloss:0.05878	train-mlogloss:0.01242
[7]	eval-mlogloss:0.05548	train-mlogloss:0.01168
[8]	eval-mlogloss:0.05426	train-mlogloss:0.01126
[9]	eval-mlogloss:0.05602	train-mlogloss:0.01098
[10]	eval-mlogloss:0.05326	train-mlogloss:0.01077
[11]	eval-mlogloss:0.05477	train-mlogloss:0.01059
[12]	eval-mlogloss:0.05255	train-mlogloss:0.01043
[13]	eval-mlogloss:0.05378	train-mlogloss:0.01032
[14]	eval-mlogloss:0.05370	train-mlogloss:0.01032
[15]	eval-mlogloss:0.05366	train-mlogloss:0.01032
[16]	eval-mlogloss:0.05364	train-mlogloss:0.01032
[17]	eval-mlogloss:0.05362	train-mlogloss:0.01032
[18]	eval-mlogloss:0.05361	train-mlogloss:0.01032
[19]	eval-mlogloss:0.05361	train-mlogloss:0.01032
XGBoost正确率： 0.9814814814814815


### 5.泰坦尼克号

survival : 是否活着 (0 = No; 1 = Yes)
pclass ： Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
name ：名字
sex ： 性别
age ：年龄
sibsp ：同船的兄弟姐妹及配偶人数
parch：父母子女人数
ticket：船票编号
fare ：乘客票价
cabin：船舱
embarked：登船的港口（C =瑟堡; Q =皇后镇; S =南安普敦）

Pclass是社会经济地位（SES）状态
1st〜上层； 2nd〜中层； 3rd〜下层

#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@file: 泰坦尼克号.py
@time: 2021/01/25
@desc:
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import csv
import xgboost as xgb

# NOTE(review): the original extraction lost this routine's function header and
# the line that reads the CSV; both are reconstructed here from the body (the
# `is_train` flag and the trailing `return x, y` / `return x, data.index`).
# Confirm the original name/signature against the full source.
def load_data(file_name, is_train):
    """Load and clean a Titanic CSV.

    Returns (x, y) when is_train is True; otherwise (x, data.index), where the
    index is later used as the PassengerId column of the submission file.
    Missing ages are imputed with a random forest regressor; Embarked is
    one-hot encoded into Embarked_C / Embarked_Q / Embarked_S.
    """
    # Console display setting.
    pd.set_option('display.width', 200)
    # Load the data.
    data = pd.read_csv(file_name)
    # Quick statistical summary. BUG FIX: '/n' was a wrong-slash escape.
    print('data.describe =\n', data.describe())
    # Turn blank cells into NaN.
    data.replace(to_replace=' ', value=np.NAN, inplace=True)
    # Count missing values per column.
    print(data.isnull().sum())

    # Categorical -> numeric via a dict mapping.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill missing fares with the median fare of the passenger's class.
    # print(len(data['Fare'][data['Fare'] == 0])) # 15
    if len(data.Fare[data['Fare'] == 0]) > 0:
        fare = np.zeros(3)
        for i in range(0, 3):
            fare[i] = data[data.Pclass == i + 1]['Fare'].dropna().median()
        print(fare)

        # Fill the class-specific median fare into the null rows.
        for i in range(0, 3):
            data.loc[(data.Fare.isnull()) & (data.Pclass == i + 1), 'Fare'] = fare[i]

    # Age: one common option is to substitute the mean for missing values
    # mean_age = data['Age'].dropna().mean()
    # data['Age'].fillna(mean_age, inplace=True)

    # Here missing ages are predicted with a random forest instead.
    if is_train:
        print('随机森林开始预测年龄')
        data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]
        age_null = data_for_age.loc[(data.Age.isnull())]
        print(age_exist)
        print(age_null)
        x = age_exist.iloc[:, 1:]
        y = age_exist.iloc[:, 0]
        clf = RandomForestRegressor(n_estimators=1000)
        clf.fit(x, y)
        age_hat = clf.predict(age_null.values[:, 1:])
        # print(age_hat)
        # Write the predictions into the empty Age rows.
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print('随机森林预测缺失年龄：--over--')
    else:
        # Test data has no 'Survived' column, hence the is_train switch.
        print('随机森林预测缺失年龄2：--start--')
        data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]  # rows with a known age
        age_null = data_for_age.loc[(data.Age.isnull())]
        print(age_exist.isnull().sum())
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        # print age_hat
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print('随机森林预测缺失年龄2：--over--')

    # Port of embarkation: default missing values to 'S'.
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'
    # data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
    # print(data['Embarked'])

    # One-hot encode the port with pd.get_dummies.
    embarked_data = pd.get_dummies(data.Embarked)
    # BUG FIX: '/n' was a wrong-slash escape.
    print('embarked_data = \n', embarked_data)
    # Prefix the dummy columns -> Embarked_C / Embarked_Q / Embarked_S.
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    # Join the dummies back onto the frame.
    data = pd.concat([data, embarked_data], axis=1)
    print(data.describe())
    # Persist the cleaned data.
    data.to_csv('New_Data.csv')

    # Feature matrix of the cleaned columns.
    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    y = None
    # Training set: extract the target.
    if 'Survived' in data:
        y = data['Survived']

    # To plain arrays.
    x = np.array(x)
    y = np.array(y)

    # Tile 5x to enlarge the training data (noticeably raises accuracy).
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5,))
    if is_train:
        return x, y
    # print(data.index)
    return x, data.index

# 输出结果
def write_result(c, c_type):
file_name = 'Titanic.test.csv'

if c_type == 3:
x = xgb.DMatrix(x)
y = c.predict(x)
y[y > 0.5] = 1
y[~(y > 0.5)] = 0

predictions_file = open("Prediction_%d.csv" % c_type, "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId", "Survived"])
open_file_object.writerows(zip(passenger_id, y))
predictions_file.close()

if __name__ == "__main__":
    # NOTE(review): `x` and `y` are undefined here -- the line that loads the
    # training data (presumably via the data-cleaning routine above with
    # is_train=True) was lost in extraction; restore it before running.
    # Split off a validation set (called "test" here).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train)
    y_hat = lr.predict(x_test)
    lr_acc = accuracy_score(y_test, y_hat)
    write_result(lr, 1)

    # Random forest
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    y_hat = rfc.predict(x_test)
    rfc_acc = accuracy_score(y_test, y_hat)
    write_result(rfc, 2)

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 6, 'eta': 0.8, 'silent': 1, 'objective': 'binary:logistic'}
    # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list,
                    early_stopping_rounds=30, verbose_eval=True)
    # NOTE(review): ntree_limit is deprecated in xgboost >= 1.4 in favour of
    # iteration_range -- kept as-is for compatibility with the original.
    y_hat = bst.predict(data_test, ntree_limit=bst.best_ntree_limit)
    write_result(bst, 3)
    y_hat[y_hat > 0.5] = 1
    y_hat[~(y_hat > 0.5)] = 0
    xgb_acc = accuracy_score(y_test, y_hat)

    # BUG FIX: the accuracies are fractions in [0, 1]; multiply by 100 before
    # printing with the '%%' suffix (the original printed e.g. '0.797%').
    print('Logistic回归：%.3f%%' % (100 * lr_acc))
    print('随机森林：%.3f%%' % (100 * rfc_acc))
    print('XGBoost：%.3f%%' % (100 * xgb_acc))


Logistic回归：0.797%

XGBoost：0.983%


Vieu3.3主题

Q Q 登 录