# 天池工业蒸汽量预测

# Author: 1138-Wei (student)

## 天池工业蒸汽量

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns  # plotting

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor

# support vector machine
from sklearn.svm import SVR

# evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor



## 数据聚合

# Load the Tianchi steam datasets. The files are tab-separated
# (the original '/t' separator was a typo for '\t'), and the test
# set was never loaded even though it is used below.
train = pd.read_csv('./zhengqi_train.txt', sep='\t')
test = pd.read_csv('./zhengqi_test.txt', sep='\t')

# Tag each row with its origin before concatenating, so the combined
# frame can be split back into train/test later.
train['origin'] = 'train'
test['origin'] = 'test'

# Merge train and test into one frame for joint preprocessing.
data_all = pd.concat([train, test])
print(data_all.shape)




# 38 features; drop the unimportant ones.
# Compare each feature's distribution between train and test; features whose
# distributions differ will be dropped.
plt.figure(figsize=(9, 38 * 6))
for i, col in enumerate(data_all.columns[:-2]):
    train_col = data_all[col][data_all['origin'] == 'train']  # training rows
    test_col = data_all[col][data_all['origin'] == 'test']    # test rows
    axes = plt.subplot(38, 1, i + 1)
    # NOTE(review): the original transcription lost the actual plot calls;
    # the legend implies two density curves — restored as KDE plots. Confirm
    # against the original notebook.
    train_col.plot(kind='kde', ax=axes)
    test_col.plot(kind='kde', ax=axes)
    plt.legend(['train', 'test'])
    plt.xlabel(col)


# One FacetGrid per feature, faceted by origin, to eyeball the
# train-vs-test distribution of every column (loop indentation restored).
plt.figure(figsize=(9, 6))
for col in data_all.columns[:-2]:
    g = sns.FacetGrid(data_all, col='origin')
    g.map(sns.distplot, col)  # distribution plot


# These features showed clearly different train/test distributions above,
# so the model cannot rely on them — remove all four in one pass.
drop_labels = ['V11', 'V17', 'V22', 'V5']
data_all.drop(columns=drop_labels, inplace=True)
data_all.shape


## Correlation analysis

# Covariance matrix of the combined frame.
cov = data_all.cov()


# Pairwise correlation coefficients.
corr = data_all.corr()


# Candidate columns: absolute correlation with the target below 0.1.
weak = corr.loc['target'].abs() < 0.1
drop_labels = corr.loc['target'].index[weak]
# Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')
drop_labels

# After inspecting the distributions, only these two are actually removed.
drop_labels = ['V14', 'V21']
data_all.drop(drop_labels, axis=1, inplace=True)

data_all.shape


# Visualize the strength of correlations.
plt.figure(figsize=(20, 16))  # set figure width/height
mcorr = train.corr()  # correlation matrix between every pair of variables
# np.bool was deprecated and removed in NumPy 1.24 — use the builtin bool.
mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix with mcorr's shape

# colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap
# NOTE(review): mask/cmap are prepared but no sns.heatmap(mcorr, mask=mask,
# cmap=cmap, ...) call survives here — presumably lost in transcription;
# confirm against the original notebook.
plt.show()


## Standardization

data_all[data_all['origin'] == 'test'].describe()


data_all[data_all['origin'] == 'train'].describe()


# Z-score the feature columns (everything except origin/target).
stand = StandardScaler()
data = data_all.iloc[:, :-2]
data2 = stand.fit_transform(data)
data2


cols = data_all.columns
data_all_std = pd.DataFrame(data2, columns=cols[:-2])
data_all_std


# Reset to a 0..n-1 index so the merge below aligns row-for-row.
# (Derive the length instead of hard-coding 4813 — that count breaks
# as soon as the dropped-feature set changes.)
data_all.index = np.arange(data_all.shape[0])
data_all


# Re-attach the origin/target columns by aligning on the integer index.
data_all_std = pd.merge(data_all_std, data_all.iloc[:, -2:], right_index=True, left_index=True)


data_all_std.describe()


## Train/evaluate with several algorithms

# Outlier detection via a ridge fit.
from sklearn.linear_model import RidgeCV

data_all_std.head()


# Cross-validated ridge over a wide grid of regularization strengths.
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])

is_train = data_all_std['origin'] == 'train'

X_train = data_all_std[is_train].iloc[:, :-2]
# ground-truth target
y_train = data_all_std[is_train]['target']

# No model fits its training data perfectly; rows whose prediction
# misses the target badly are treated as outliers.
ridge.fit(X_train, y_train)
y_ = ridge.predict(X_train)

# Flag rows whose residual exceeds 0.8 target standard deviations.
cond = abs(y_train - y_) > y_train.std() * 0.8
cond.sum()


# Visualize the flagged outliers: prediction-vs-truth scatter,
# residual scatter, and residual histograms (outliers in red).
plt.figure(figsize=(12, 6))
residual = y_train - y_

ax = plt.subplot(1, 3, 1)
ax.scatter(y_train, y_)
ax.scatter(y_train[cond], y_[cond], c='red', s=20)

ax = plt.subplot(1, 3, 2)
ax.scatter(y_train, residual)
ax.scatter(y_train[cond], residual[cond], c='red')

ax = plt.subplot(1, 3, 3)
residual.plot.hist(bins=50, ax=ax)
residual.loc[cond].plot.hist(bins=50, ax=ax, color='r')


data_all_std


# Remove the rows flagged as outliers above.
drop_index = cond[cond].index
print(data_all_std.shape)
data_all_std.drop(index=drop_index, inplace=True)
data_all_std.shape


def detect_model(estimators, data):
    """Fit each estimator and print its test-set MSE and R² score.

    Parameters
    ----------
    estimators : dict
        Mapping of name -> sklearn-style regressor.
    data : tuple
        (X_train, X_test, y_train, y_test) as returned by train_test_split.

    Fixes: the parameter was misspelled `etsimators` so the body silently
    read the global `estimators`; loop indentation restored; '/n' was a
    typo for the '\n' newline escape.
    """
    X_train, X_test, y_train, y_test = data
    for key, estimator in estimators.items():
        estimator.fit(X_train, y_train)
        y_ = estimator.predict(X_test)
        mse = mean_squared_error(y_test, y_)
        print('-------------------mse%s' % (key), mse)
        r2 = estimator.score(X_test, y_test)
        print('+++++++++++++++++++r2_score%s' % (key), r2)
        print('\n')

# Build a holdout split from the training portion and benchmark a
# spread of regressors against it.
cond = data_all_std['origin'] == 'train'
X = data_all_std[cond].iloc[:, :-2]
y = data_all_std[cond]['target']
data = train_test_split(X, y, test_size=0.2)

estimators = {
    'knn': KNeighborsRegressor(),
    'linear': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elasticnet': ElasticNet(),
    'forest': RandomForestRegressor(),
    'extreme': ExtraTreesRegressor(),
    'svm_rbf': SVR(kernel='rbf'),
    'svm_poly': SVR(kernel='poly'),
    'light': LGBMRegressor(),
    'xgb': XGBRegressor(),
}

# On this holdout, KNN / Lasso / ElasticNet / SVM_poly perform worst.
detect_model(estimators, data)


# Keep only the methods that performed well (drop the plain linear models).
estimators = {}
# estimators['linear'] = LinearRegression()
# estimators['ridge'] = Ridge()
# estimators['lasso'] = Lasso()
estimators['forest'] = RandomForestRegressor()
estimators['extreme'] = ExtraTreesRegressor()
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor()
estimators['xgb'] = XGBRegressor()

cond = data_all_std['origin'] == 'train'

X_train = data_all_std[cond].iloc[:, :-2]
y_train = data_all_std[cond]['target']

cond = data_all_std['origin'] == 'test'
X_test = data_all_std[cond].iloc[:, :-2]

# Each algorithm predicts independently; average the predictions
# into a single ensemble result (loop indentation restored).
y_pred = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    y_pred.append(y_)

y_ = np.mean(y_pred, axis=0)

pd.Series(y_).to_csv('./ensemble2.txt', index=False)

# Use each model's predictions as new features (a simple stacking step):
# append the train/test predictions as extra columns, then refit.
# NOTE: each fit sees the columns added by the previous models, since the
# frame is extended inside the loop — order matters (loop indentation restored).
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_train)
    X_train[key] = y_
    y_ = model.predict(X_test)
    X_test[key] = y_

# Refit on the augmented features and average the predictions again.
y_pred = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    y_pred.append(y_)

y_ = np.mean(y_pred, axis=0)

pd.Series(y_).to_csv('./ensemble3.txt', index=False)

sns.distplot(y_)


y_.mean()


y_.std()


# Add small Gaussian noise to the predictions. Size the noise vector from
# the prediction itself instead of hard-coding 1925, which breaks whenever
# the test-set row count changes.
y_ += np.random.randn(y_.size) * 0.1

pd.Series(y_).to_csv('./ensemble4.txt', index=False)


## Normalize the data to the [0, 1] range

data_all.head()


# Feature columns only — the trailing two are origin/target.
data = data_all.iloc[:, :-2]

minmaxscaler = MinMaxScaler()
data3 = minmaxscaler.fit_transform(data)
data3


# Wrap the scaled array back into a labelled DataFrame.
data_all_norm = pd.DataFrame(data3, columns=data_all.columns[:-2])
data_all_norm


# Re-attach origin/target by aligning on the integer index.
data_all_norm = pd.merge(data_all_norm, data_all.iloc[:, -2:], left_index=True, right_index=True)

data_all_norm.describe()


def scale_minmax(data):
    """Linearly rescale values into [0, 1] (column-wise for DataFrames)."""
    lo = data.min()
    hi = data.max()
    return (data - lo) / (hi - lo)

# Plot the continuous variables: for each of the first 10 features, show the
# raw distribution, skewness (probability plot), scatter vs target, and the
# same three panels after a Box-Cox transform.
# Fixes: loop indentation restored; titles referenced an undefined `var`
# (should be the loop variable `col`).
from scipy import stats

fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for col in data_all_norm.columns[:10]:

    dat = data_all_norm[[col, 'target']].dropna()

    # Panel 1: raw distribution with a fitted normal curve.
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)
    plt.title(col + 'Original')

    # Panel 2: skewness — a measure of departure from normality —
    # via a probability plot.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 3: scatter against the target, with the correlation in the title.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))

    # Panel 4: distribution after a Box-Cox transform
    # (+1 keeps every value strictly positive, as boxcox requires).
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(col + ' Tramsformed')
    plt.xlabel('')

    # Panel 5: skewness after the transform.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 6: scatter against the target after the transform.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))


# Apply a Box-Cox transform to every feature column (loop indentation
# restored). This common statistical transform makes the data more
# normally distributed.
for col in data_all_norm.columns[:-2]:
    # +1 because the min-max scaled data has minimum 0 and boxcox requires
    # strictly positive input; stats.boxcox returns (values, lambda).
    boxcox, maxlog = stats.boxcox(data_all_norm[col] + 1)
    data_all_norm[col] = scale_minmax(boxcox)  # rescale back to [0, 1]


## Filter outliers (on the normalized data)

ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])

cond = data_all_norm['origin'] == 'train'

X_train = data_all_norm[cond].iloc[:, :-2]
# ground-truth target
y_train = data_all_norm[cond]['target']
# No model fits its training data perfectly; rows whose prediction misses
# the target by more than one standard deviation are treated as outliers.
ridge.fit(X_train, y_train)
y_ = ridge.predict(X_train)

cond = abs(y_ - y_train) > y_train.std()
# fix: cond.sum without parentheses printed the bound method object
# instead of the outlier count.
print(cond.sum())
# Visualize the flagged rows (outliers in red).
plt.figure(figsize=(12, 6))
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)

axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')

axes = plt.subplot(1, 3, 3)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')


# Drop the flagged outlier rows.
index = cond[cond].index
data_all_norm.drop(index, axis=0, inplace=True)

cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']

cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:, :-2]

# Keep only the methods that performed well (drop the plain linear models).
estimators = {}
# estimators['linear'] = LinearRegression()
# estimators['ridge'] = Ridge()
# estimators['lasso'] = Lasso()
estimators['forest'] = RandomForestRegressor(n_estimators=300)
estimators['extreme'] = ExtraTreesRegressor(n_estimators=300)
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor(n_estimators=300)
estimators['xgb'] = XGBRegressor(n_estimators=300)

# Average the per-model predictions into the final submission
# (loop indentation restored).
result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    result.append(y_)

y_ = np.mean(result, axis=0)

pd.Series(y_).to_csv('./norm.txt', index=False)


# (scraped blog-page footer removed — "Vieu3.3 theme" / "QQ login" were page chrome, not code)