6.5 XGBoost in Practice

1. Classifying Mushrooms as Poisonous: Binary Classification

Mushroom dataset download (extraction code: d16v)

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.datasets import load_svmlight_file

# Gradient and second derivative (Hessian) of a custom loss function,
# where f is the raw margin score: theta * x
def log_reg(y_hat, y):
    p = 1.0 / (1.0 + np.exp(-y_hat))
    g = p - y.get_label()
    h = p * (1.0 - p)
    return g, h


def error_rate(y_hat, y):
    # Custom evaluation metric: returns a (name, value) pair with the misclassification rate
    return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat)


if __name__ == "__main__":
    # Load the data (libsvm-format files can be passed to DMatrix directly)
    data_train = xgb.DMatrix('agaricus_train.txt')
    data_test = xgb.DMatrix('agaricus_test.txt')
    print(data_train)
    print(type(data_train))
    
    # Alternative: load and split the data manually
    # (reads the libsvm format into the familiar 2-D array form)
    # X_train, y_train = load_svmlight_file('agaricus_train.txt')
    # X_test, y_test = load_svmlight_file('agaricus_test.txt')
    # xgb.train expects DMatrix inputs:
    # dtrain = xgb.DMatrix(X_train, y_train)
    # dtest = xgb.DMatrix(X_test, y_test)


    # Set the parameters
    # With binary:logistic, logloss is the default evaluation metric
    # With binary:logitraw, AUC is the default evaluation metric
    param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
    # param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'reg:logistic'}
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    n_round = 7
    bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)
    # bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg,
    #                 feval=error_rate, early_stopping_rounds=30, verbose_eval=True)

    # Compute the error rate
    y_hat = bst.predict(data_test)
    y = data_test.get_label()
    print(y_hat)
    print(y)
    error = sum(y != (y_hat > 0.5))
    error_rate = float(error) / len(y_hat)
    print(error_rate)
    print('Total samples:\t', len(y_hat))
    print('Errors:\t%4d' % error)
    print('Error rate:\t%.5f%%' % (100 * error_rate))

[0]	eval-logloss:0.16396	train-logloss:0.16118
[1]	eval-logloss:0.06293	train-logloss:0.06473
[2]	eval-logloss:0.02504	train-logloss:0.02502
[3]	eval-logloss:0.01229	train-logloss:0.01297
[4]	eval-logloss:0.00718	train-logloss:0.00706
[5]	eval-logloss:0.00507	train-logloss:0.00488
[6]	eval-logloss:0.00277	train-logloss:0.00276
[0.0013066  0.9952118  0.0013066  ... 0.99917597 0.00155224 0.998858  ]
[0. 1. 0. ... 1. 0. 1.]
0.0
Total samples:	 1611
Errors:	   0
Error rate:	0.00000%
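
The commented-out load_svmlight_file route above can also be paired with xgboost's sklearn-style wrapper instead of the native train API. Below is a minimal cross-check sketch, not part of the original script; it assumes the same two agaricus files and uses load_svmlight_files (plural), which loads both files together so train and test end up with a consistent feature count:

import xgboost as xgb
from sklearn.datasets import load_svmlight_files
from sklearn.metrics import accuracy_score

# Load both libsvm files in one call so the feature dimensions match
X_train, y_train, X_test, y_test = load_svmlight_files(
    ['agaricus_train.txt', 'agaricus_test.txt'])

# Same settings as above: depth 3, learning rate 1, 7 boosting rounds
clf = xgb.XGBClassifier(max_depth=3, learning_rate=1, n_estimators=7,
                        objective='binary:logistic')
clf.fit(X_train, y_train)
print('Accuracy:', accuracy_score(y_test, clf.predict(X_test)))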

2. Classifying Mushrooms as Poisonous: Loading the Data Manually

#!/usr/bin/python
# -*- coding:utf-8 -*-


import xgboost as xgb
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def read_data(path):
    # labels, row indices, column indices, values
    y = []
    row = []
    col = []
    values = []
    r = 0  # current row index
    for d in open(path):
        d = d.strip().split()  # split on whitespace
        y.append(int(d[0]))
        d = d[1:]
        for c in d:
            key, value = c.split(':')
            row.append(r)
            col.append(int(key))
            values.append(float(value))
        r += 1

    # Build a sparse matrix: put each value at its (row, col) position
    x = scipy.sparse.csr_matrix((values, (row, col))).toarray()
    y = np.array(y)
    return x, y


if __name__ == '__main__':
    x, y = read_data('agaricus_train.txt')
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.7)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    print('Logistic regression accuracy:', accuracy_score(y_test, y_hat))

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # This is a binary task, so 'binary:logistic' would be the natural choice;
    # 'multi:softmax' with num_class=3 also works here since the labels are 0/1
    param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
    y_hat = bst.predict(data_test)
    print('XGBoost accuracy:', accuracy_score(y_test, y_hat))

[0]	eval-mlogloss:0.22262	train-mlogloss:0.23514
[1]	eval-mlogloss:0.07970	train-mlogloss:0.08438
[2]	eval-mlogloss:0.03086	train-mlogloss:0.03198
[3]	eval-mlogloss:0.01211	train-mlogloss:0.01313
XGBoost accuracy: 1.0
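
The (values, (row, col)) triplet form used in read_data is standard scipy; it can be checked on a toy input. A minimal standalone sketch (the numbers are illustrative):

import scipy.sparse

# Triplets describing a 2x3 matrix: 1.5 at (0, 0), 3.0 at (0, 2), 2.0 at (1, 1)
row = [0, 0, 1]
col = [0, 2, 1]
values = [1.5, 3.0, 2.0]
print(scipy.sparse.csr_matrix((values, (row, col))).toarray())
# [[1.5 0.  3. ]
#  [0.  2.  0. ]]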

3. Iris Classification

Iris dataset download (extraction code: rjdw)

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    path = 'iris.data'  # path to the data file
    data = pd.read_csv(path, header=None)
    x, y = data.iloc[:, :4], data.iloc[:, 4]
    # Convert the categorical labels to integers
    # (equivalently: y = pd.Categorical(y).codes)
    y = y.map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}).astype(int)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)

    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 2, 'eta': 0.3, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}

    bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
    y_hat = bst.predict(data_test)
    result = y_test.values.reshape(1, -1) == y_hat
    print('Accuracy:\t', float(np.sum(result)) / len(y_hat))
    print('END.....\n')

[0]	eval-mlogloss:0.75514	train-mlogloss:0.75286
[1]	eval-mlogloss:0.55165	train-mlogloss:0.54831
[2]	eval-mlogloss:0.41535	train-mlogloss:0.41431
[3]	eval-mlogloss:0.32765	train-mlogloss:0.32384
[4]	eval-mlogloss:0.26220	train-mlogloss:0.25719
[5]	eval-mlogloss:0.21648	train-mlogloss:0.20874
Accuracy:	 0.98
END.....
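
With 'multi:softmax', predict returns hard class labels directly. If class probabilities are needed instead, 'multi:softprob' returns one probability per class. A minimal sketch reusing data_train and data_test from above (same settings otherwise):

param_prob = {'max_depth': 2, 'eta': 0.3, 'objective': 'multi:softprob', 'num_class': 3}
bst_prob = xgb.train(param_prob, data_train, num_boost_round=6)
prob = bst_prob.predict(data_test)    # shape (n_samples, 3); each row sums to 1
y_hat_prob = np.argmax(prob, axis=1)  # recover hard labels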

4. Wine Classification

Wine dataset download (extraction code: n6x1)

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split   # formerly in sklearn.cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


if __name__ == "__main__":

    data = np.loadtxt('wine.data', dtype=np.float32, delimiter=',')
    y, x = np.split(data, (1,), axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.3)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    print('Logistic regression accuracy:', accuracy_score(y_test, y_hat))

    # XGBoost
    # Remap label 3 to 0, since softmax class labels must start at 0
    y_train[y_train == 3] = 0
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    params = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(params, data_train, num_boost_round=20, evals=watch_list)
    y_hat = bst.predict(data_test)
    print('XGBoost accuracy:', accuracy_score(y_test, y_hat))

Logistic regression accuracy: 0.9629629629629629

[0]	eval-mlogloss:0.31720	train-mlogloss:0.25776
[1]	eval-mlogloss:0.16601	train-mlogloss:0.11004
[2]	eval-mlogloss:0.10526	train-mlogloss:0.05128
[3]	eval-mlogloss:0.08201	train-mlogloss:0.02994
[4]	eval-mlogloss:0.07387	train-mlogloss:0.02009
[5]	eval-mlogloss:0.06578	train-mlogloss:0.01496
[6]	eval-mlogloss:0.05878	train-mlogloss:0.01242
[7]	eval-mlogloss:0.05548	train-mlogloss:0.01168
[8]	eval-mlogloss:0.05426	train-mlogloss:0.01126
[9]	eval-mlogloss:0.05602	train-mlogloss:0.01098
[10]	eval-mlogloss:0.05326	train-mlogloss:0.01077
[11]	eval-mlogloss:0.05477	train-mlogloss:0.01059
[12]	eval-mlogloss:0.05255	train-mlogloss:0.01043
[13]	eval-mlogloss:0.05378	train-mlogloss:0.01032
[14]	eval-mlogloss:0.05370	train-mlogloss:0.01032
[15]	eval-mlogloss:0.05366	train-mlogloss:0.01032
[16]	eval-mlogloss:0.05364	train-mlogloss:0.01032
[17]	eval-mlogloss:0.05362	train-mlogloss:0.01032
[18]	eval-mlogloss:0.05361	train-mlogloss:0.01032
[19]	eval-mlogloss:0.05361	train-mlogloss:0.01032
XGBoost accuracy: 0.9814814814814815
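
Remapping label 3 to 0 works here because labels 1 and 2 are already in range. A more general approach is sklearn's LabelEncoder, which maps any label set onto 0..num_class-1. A minimal sketch (illustrative, not part of the original script):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train.ravel())  # {1, 2, 3} -> {0, 1, 2}
y_test_enc = le.transform(y_test.ravel())
# le.inverse_transform(y_hat.astype(int)) recovers the original labels after prediction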

5. Titanic

Dataset download (extraction code: foib)
Variable descriptions:

survival: survived or not (0 = No; 1 = Yes)
pclass: passenger class (1 = 1st; 2 = 2nd; 3 = 3rd)
name: passenger name
sex: sex
age: age
sibsp: number of siblings/spouses aboard
parch: number of parents/children aboard
ticket: ticket number
fare: passenger fare
cabin: cabin
embarked: port of embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
Special notes:
Pclass is a proxy for socio-economic status (SES):
1st ~ upper; 2nd ~ middle; 3rd ~ lower
Age is in years; fractional if less than one (1)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: admin
@file: 泰坦尼克号.py
@time: 2021/01/25
@desc:
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import csv
import xgboost as xgb


# Load and preprocess the data
def loaddata(filename, is_train):
    # Display settings for console output
    pd.set_option('display.width', 200)
    # Load the data
    data = pd.read_csv(filename, header=0, index_col=0)
    # Preview the first few rows
    print(data.head())
    # Quick statistical summary of the data
    print('data.describe =\n', data.describe())
    # Replace blank strings with NaN
    data.replace(to_replace=' ', value=np.nan, inplace=True)
    # Count missing values per column
    print(data.isnull().sum())

    # Map the categorical Sex values to integers via a dict
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill in missing fares
    # print(len(data['Fare'][data['Fare'] == 0]))  # 15
    if len(data.Fare[data['Fare'] == 0]) > 0:
        fare = np.zeros(3)
        for i in range(0, 3):
            fare[i] = data[data.Pclass == i + 1]['Fare'].dropna().median()
        print(fare)

        # Fill missing fares with the median for the corresponding class
        # (note: only NaN fares are filled; zero fares just trigger this branch)
        for i in range(0, 3):
            data.loc[(data.Fare.isnull()) & (data.Pclass == i + 1), 'Fare'] = fare[i]

    # Age handling
    # A common option is to fill missing ages with the mean:
    # mean_age = data['Age'].dropna().mean()
    # data['Age'].fillna(mean_age, inplace=True)

    # Here a random forest predicts the missing ages instead
    if is_train:
        print('Predicting missing ages with a random forest: --start--')
        data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]
        age_null = data_for_age.loc[(data.Age.isnull())]
        print(age_exist)
        print(age_null)
        x = age_exist.iloc[:, 1:]
        y = age_exist.iloc[:, 0]
        clf = RandomForestRegressor(n_estimators=1000)
        clf.fit(x, y)
        age_hat = clf.predict(age_null.values[:, 1:])
        # print(age_hat)
        # Fill the predictions into the rows where Age is missing
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print('Predicting missing ages with a random forest: --over--')
    else:
        # The test data has no Survived column, so the is_train flag
        # distinguishes training data from test data
        print('Predicting missing ages (test data): --start--')
        data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]  # rows where Age is present
        age_null = data_for_age.loc[(data.Age.isnull())]
        print(age_exist.isnull().sum())
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        # print(age_hat)
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print('Predicting missing ages (test data): --over--')

    # Port of embarkation
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'  # fill missing ports with 'S'
    # data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
    # print(data['Embarked'])

    # One-hot encode the Embarked column: pd.get_dummies creates one
    # indicator column per distinct value
    embarked_data = pd.get_dummies(data.Embarked)
    print('embarked_data = \n', embarked_data)
    # Prefix each port column with 'Embarked_', giving three new features
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    # Concatenate the new features with the original data
    data = pd.concat([data, embarked_data], axis=1)
    print(data.describe())
    # Save the cleaned data
    data.to_csv('New_Data.csv')

    # Extract the cleaned features as x
    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    y = None
    # Extract y if this is the training set
    if 'Survived' in data:
        y = data['Survived']

    # Convert to numpy arrays
    x = np.array(x)
    y = np.array(y)

    # Tile the data five times to enlarge the dataset
    # Note: tiling happens before the train/test split, so duplicates leak
    # across the split, which inflates the reported validation accuracy
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5,))
    if is_train:
        return x, y
    # print(data.index)
    return x, data.index


# Write the predictions to a CSV file
def write_result(c, c_type):
    file_name = 'Titanic.test.csv'
    x, passenger_id = loaddata(file_name, False)

    if c_type == 3:
        x = xgb.DMatrix(x)
    y = c.predict(x)
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0

    predictions_file = open("Prediction_%d.csv" % c_type, "w")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(passenger_id, y))
    predictions_file.close()


if __name__ == "__main__":
    # Load the data
    x, y = loaddata('Titanic.train.csv', True)
    # Split the data; the 'test' portion here is really a validation set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train)
    y_hat = lr.predict(x_test)
    lr_acc = accuracy_score(y_test, y_hat)
    write_result(lr, 1)

    # Random forest
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    y_hat = rfc.predict(x_test)
    rfc_acc = accuracy_score(y_test, y_hat)
    write_result(rfc, 2)

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 6, 'eta': 0.8, 'silent': 1, 'objective': 'binary:logistic'}
    # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list,
                    early_stopping_rounds=30, verbose_eval=True)
    y_hat = bst.predict(data_test, ntree_limit=bst.best_ntree_limit)
    write_result(bst, 3)
    y_hat[y_hat > 0.5] = 1
    y_hat[~(y_hat > 0.5)] = 0
    xgb_acc = accuracy_score(y_test, y_hat)

    print('Logistic regression: %.3f' % lr_acc)
    print('Random forest: %.3f' % rfc_acc)
    print('XGBoost: %.3f' % xgb_acc)

Logistic regression: 0.797
Random forest: 0.983
XGBoost: 0.983
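
A side note on the prediction call: ntree_limit=bst.best_ntree_limit predicts with only the trees built up to the best early-stopping round. In xgboost 1.4 and later, ntree_limit is deprecated in favor of iteration_range; a minimal sketch of the equivalent call, assuming a recent xgboost version:

# Equivalent to ntree_limit=bst.best_ntree_limit on xgboost >= 1.4
y_hat = bst.predict(data_test, iteration_range=(0, bst.best_iteration + 1))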
