3.1 API : DecisionTreeClassifier、DecisionTreeRegressor

1147-柳同学

引言

sklearn中实现的决策树都是二叉树

1. DecisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier

DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
max_leaf_nodes=None, min_impurity_decrease=0.0,min_impurity_split=None, class_weight=None, ccp_alpha=0.0)


Parameters

criterion : {“gini”, “entropy”}, default=”gini”

splitter : {“best”, “random”}, default=”best”

“best”支持最佳拆分，“random”支持最佳随机拆分

max_depth : int, default=None

min_samples_split : int or float, default=2

min_samples_leaf : int or float, default=1

min_weight_fraction_leaf : float, default=0.0

max_features : int, float or {“auto”, “sqrt”, “log2”}, default=None

random_state : int, RandomState instance or None, default=None

max_leaf_nodes : int, default=None

min_impurity_decrease : float, default=0.0
split损失阈值

class_weight : dict, list of dict or “balanced”, default=None

[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of [{1:1}, {2:5}, {3:1}, {4:1}].
“balanced”模式使用y的值自动将权重与输入数据中的类频率成反比地调整为 n_samples / (n_classes * np.bincount(y))

ccp_alpha : non-negative float, default=0.0

Attributes

classes_ : ndarray of shape (n_classes,) or list of ndarray

feature_importances_ : ndarray of shape (n_features,)

max_features_ : int

n_classes_ : int or list of int

n_features_ : int

n_outputs_ : int

tree_ : Tree instance
Tree对象

Methods

apply(X[, check_input])

cost_complexity_pruning_path(X, y[, …])

decision_path(X[, check_input])

fit(X, y[, sample_weight, check_input, …])

get_depth()

get_n_leaves()

get_params([deep])

predict(X[, check_input])

predict_log_proba(X)

predict_proba(X[, check_input])

score(X, y[, sample_weight])

set_params(**params)

>>> from sklearn.datasets import load_iris
>>> from sklearn.model_selection import cross_val_score
>>> from sklearn.tree import DecisionTreeClassifier
>>> iris = load_iris()
>>> clf = DecisionTreeClassifier(random_state=0)
>>> cross_val_score(clf, iris.data, iris.target, cv=10)
...
...
array([ 1.     ,  0.93...,  0.86...,  0.93...,  0.93...,
0.93...,  0.93...,  1.     ,  0.93...,  1.      ])


2.DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

DecisionTreeRegressor(*,  criterion='mse', splitter='best', max_depth=None, min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, ccp_alpha=0.0)

DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
max_leaf_nodes=None, min_impurity_decrease=0.0,min_impurity_split=None, class_weight=None, ccp_alpha=0.0)


Parameters

criterion : {“mse”, “friedman_mse”, “mae”, “poisson”}, default=”mse”

Attributes

Methods

predict(X[, check_input])

# Poisson-regression example: fit a DecisionTreeRegressor with
# criterion='poisson' on a synthetic non-negative count target.
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np

# synthetic data: 1000 samples, 20 standard-normal features
n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
# passing the same RandomState keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
# the 'poisson' criterion requires a non-negative target (counts)
regressor = DecisionTreeRegressor(criterion='poisson', random_state=0)
regressor.fit(X_train, y_train)


3.案例

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import pydotplus
import matplotlib as mpl

def load_data():
    """Build and label-encode the toy loan-application dataset.

    The 15 samples (features: age, work, house, credit) are the classic
    decision-tree example table; feature values and labels are Chinese
    category strings, so both are ordinal-encoded with LabelEncoder
    before training.

    Returns:
        x_train: DataFrame of integer-encoded feature values.
        y_train: ndarray of integer-encoded labels.
        features: list of the four feature names.
        le_x: fitted LabelEncoder for feature values (needed to encode
            new samples at predict time).
        le_y: fitted LabelEncoder for labels (needed to decode
            predictions back to the original strings).
    """
    features = ["age", "work", "house", "credit"]
    x_train = pd.DataFrame([
        ["青年", "否", "否", "一般"],
        ["青年", "否", "否", "好"],
        ["青年", "是", "否", "好"],
        ["青年", "是", "是", "一般"],
        ["青年", "否", "否", "一般"],
        ["中年", "否", "否", "一般"],
        ["中年", "否", "否", "好"],
        ["中年", "是", "是", "好"],
        ["中年", "否", "是", "非常好"],
        ["中年", "否", "是", "非常好"],
        ["老年", "否", "是", "非常好"],
        ["老年", "否", "是", "好"],
        ["老年", "是", "否", "好"],
        ["老年", "是", "否", "非常好"],
        ["老年", "否", "否", "一般"]
    ])
    y_train = pd.DataFrame(["否", "否", "是", "是", "否", "否", "否", "是", "是", "是", "是", "是", "是", "是", "否"])
    # Ordinal (label) encoding — one shared encoder fitted on every
    # distinct feature value, applied column by column.
    # NOTE(review): this is label encoding, not one-hot as the original
    # comment claimed.
    le_x = LabelEncoder()
    le_x.fit(np.unique(x_train))
    x_train = x_train.apply(le_x.transform)

    # Encode the target labels the same way.
    le_y = LabelEncoder()
    le_y.fit(y_train)
    y_train = le_y.transform(y_train)
    return x_train, y_train, features, le_x, le_y

# Decision-tree visualization
def show(clf, feature, y_type):
    """Render a fitted decision tree to 'DT_show.png' via Graphviz.

    Args:
        clf: a fitted DecisionTreeClassifier.
        feature: list of feature names used for node labels.
        y_type: list of class names (as strings) for leaf labels.
    """
    # Export the tree structure as Graphviz dot source.
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=feature,
                                    class_names=y_type, filled=True,
                                    rounded=True, special_characters=True)
    # Render the dot source to a PNG image.
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png('DT_show.png')

if __name__ == '__main__':
    # Allow Chinese glyphs and minus signs in matplotlib output.
    mpl.rcParams["font.sans-serif"] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    # Load and encode the data (binds the encoders needed below —
    # this call was missing from the pasted original).
    x_train, y_train, features, le_x, le_y = load_data()
    # Fit the classifier.
    clf = DecisionTreeClassifier()
    clf.fit(x_train, y_train)
    y_type = [str(k) for k in np.unique(y_train)]
    # Visualize the fitted tree.
    show(clf, features, y_type)

    # Predict one new sample, encoded with the same feature encoder.
    X_show = pd.DataFrame([["青年", "否", "否", "一般"]])
    X_test = X_show.apply(le_x.transform)
    y_predict = clf.predict(X_test)
    # Print the sample as {feature: value} pairs with the decoded label.
    X_show = [{features[i]: X_show.values[0][i]} for i in range(len(features))]
    print("{0}被分类为{1}".format(X_show, le_y.inverse_transform(y_predict)))

[{'age': '青年'}, {'work': '否'}, {'house': '否'}, {'credit': '一般'}]被分类为['否']


Vieu3.3主题

Q Q 登 录