天池工业蒸汽量预测

1138-魏同学

发表文章数:75

热门标签

首页 » 算法 » 正文

天池工业蒸汽量

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns#画图

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

#支持向量机
from sklearn.svm import SVR

#评价标准
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler,PolynomialFeatures

数据聚合

# Load the training and test sets; the files are tab-separated, so the
# separator must be the tab escape '\t' (the literal two characters '/t'
# would leave every row in a single unparsed column).
train = pd.read_csv('./zhengqi_train.txt', sep='\t')
test = pd.read_csv('./zhengqi_test.txt', sep='\t')
# Tag every row with its source before concatenating, so the combined frame
# can be split back into train and test later.
train['origin'] = 'train'
test['origin'] = 'test'

# Stack train and test into a single frame.
data_all = pd.concat([train, test])
print(data_all.shape)
data_all.head()

天池工业蒸汽量预测



    
# There are 38 features; some unimportant ones will be removed.
# Compare each feature's distribution in train vs. test: features whose
# distributions disagree are candidates for deletion.
plt.figure(figsize=(9, 38 * 6))
for i, col in enumerate(data_all.columns[:-2]):
    # Training rows of the current feature.
    cond = data_all['origin'] == 'train'
    train_col = data_all[cond][col]
    # Test rows of the current feature.
    cond = data_all['origin'] == 'test'
    test_col = data_all[cond][col]
    # One subplot row per feature, both density curves on the same axes.
    axes = plt.subplot(38, 1, i + 1)
    ax = sns.kdeplot(train_col, shade=True)
    sns.kdeplot(test_col, ax=ax, shade=True)
    plt.legend(['train', 'test'])
    plt.xlabel(col)

天池工业蒸汽量预测

plt.figure(figsize=(9, 6))
# For every feature draw a two-panel grid (one panel per 'origin' value)
# showing that feature's distribution.
for col in data_all.columns[:-2]:
    g = sns.FacetGrid(data_all, col='origin')
    g = g.map(sns.distplot, col)  # map() returns the same grid object

天池工业蒸汽量预测

天池工业蒸汽量预测

# Features chosen for removal after inspecting the plots above.
drop_labels = ['V11', 'V17', 'V22', 'V5']
data_all.drop(columns=drop_labels, inplace=True)
data_all.shape

相关性系数

# Covariance matrix.
# 'origin' is a string column; restrict to numeric dtypes explicitly because
# recent pandas versions raise on non-numeric columns instead of silently
# skipping them.
cov = data_all.select_dtypes(include='number').cov()
cov.head()

天池工业蒸汽量预测

# Correlation coefficients.
# 'origin' is a string column; restrict to numeric dtypes explicitly because
# recent pandas versions raise on non-numeric columns instead of silently
# skipping them.
corr = data_all.select_dtypes(include='number').corr()
corr.head()

天池工业蒸汽量预测

# Use the correlation with the target to find the weakly related features
# (|corr| < 0.1); seven were found.
target_corr = corr.loc['target']
cond = target_corr.abs() < 0.1
drop_labels = target_corr.index[cond]
# Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')
drop_labels

# After inspecting their distributions, only the badly distributed ones are dropped.
drop_labels = ['V14', 'V21']
data_all.drop(columns=drop_labels, inplace=True)
data_all.shape

天池工业蒸汽量预测

# Visualize pairwise correlations of the training data.
plt.figure(figsize=(20, 16))  # figure width and height
# Correlation matrix between every pair of numeric variables; the string
# column 'origin' is excluded explicitly so this also works on pandas
# versions that no longer skip non-numeric columns.
mcorr = train.select_dtypes(include='number').corr()
# Boolean matrix of the same shape as mcorr. The builtin `bool` is used
# because the `np.bool` alias has been removed from NumPy.
mask = np.zeros_like(mcorr, dtype=bool)

# Mask the upper triangle (diagonal included) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
# Diverging colour map returned as a matplotlib colormap object.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Heatmap of pairwise correlation.
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()

天池工业蒸汽量预测

标准化

# Summary statistics of the test rows only.
data_all.loc[data_all['origin'] == 'test'].describe()

天池工业蒸汽量预测

# Summary statistics of the training rows only.
data_all.loc[data_all['origin'] == 'train'].describe()

天池工业蒸汽量预测

# Standardize every column except the last two ('target' and 'origin').
data = data_all.iloc[:, :-2]
stand = StandardScaler()
data2 = stand.fit(data).transform(data)
data2

天池工业蒸汽量预测

# Wrap the standardized array back into a DataFrame, reusing the feature names.
cols = data_all.columns
data_all_std = pd.DataFrame(data=data2, columns=cols[:-2])
data_all_std

天池工业蒸汽量预测

# Give the concatenated frame a unique 0..n-1 index so it can be joined by
# position with the standardized frame. The length is taken from the frame
# itself rather than hard-coded, so a different row count does not raise.
data_all.index = np.arange(len(data_all))
data_all

天池工业蒸汽量预测

# Re-attach the last two columns of data_all to the standardized features,
# joining on the row index.
data_all_std = data_all_std.merge(
    data_all.iloc[:, -2:],
    left_index=True,
    right_index=True,
)
data_all_std.head()

天池工业蒸汽量预测

# Notebook cell: show summary statistics of the standardized frame.
data_all_std.describe()

天池工业蒸汽量预测

使用不同算法进行训练和测试

# Outliers
# RidgeCV is imported here for the outlier-detection fit that follows.
from sklearn.linear_model import RidgeCV
# Notebook cell: preview the standardized frame.
data_all_std.head()

天池工业蒸汽量预测

ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])

# Training rows only: features are all but the last two columns,
# the ground truth is the 'target' column.
cond = data_all_std['origin'] == 'train'
train_rows = data_all_std[cond]
X_train = train_rows.iloc[:, :-2]
y_train = train_rows['target']

# A fitted model never matches the targets perfectly ...
ridge.fit(X_train, y_train)
# ... so predictions deviate from the truth; samples whose deviation is
# very large are treated as outliers.
y_ = ridge.predict(X_train)
residual = y_train - y_
cond = residual.abs() > 0.8 * y_train.std()
cond.sum()

天池工业蒸汽量预测

# Plot the fit and highlight the flagged outliers in red.
plt.figure(figsize=(12, 6))
resid = y_train - y_

# Panel 1: truth vs. prediction.
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)

# Panel 2: truth vs. residual.
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, resid)
axes.scatter(y_train[cond], resid[cond], c='red')

# Panel 3: residual histogram, with the outliers' residuals overlaid.
axes = plt.subplot(1, 3, 3)
resid.plot.hist(bins=50, ax=axes)
resid.loc[cond].plot.hist(bins=50, ax=axes, color='r')

天池工业蒸汽量预测

# Notebook cell: display the standardized frame.
data_all_std

天池工业蒸汽量预测

# Remove the rows flagged as outliers; print the shape before and show it after.
drop_index = cond[cond].index
print(data_all_std.shape)
data_all_std.drop(index=drop_index, inplace=True)
data_all_std.shape

天池工业蒸汽量预测

def detect_model(etsimators, data):
    """Fit every estimator and print its MSE and R^2 on the held-out split.

    Args:
        etsimators: Mapping of display name -> estimator exposing ``fit``,
            ``predict`` and ``score``. The misspelled parameter name is kept
            so the signature stays unchanged; the body now iterates this
            argument instead of a module-level ``estimators`` variable.
        data: Indexable ``(X_train, X_test, y_train, y_test)``, i.e. the
            order in which ``train_test_split`` returns its pieces.
    """
    X_fit, X_eval, y_fit, y_eval = data[0], data[1], data[2], data[3]
    for key, estimator in etsimators.items():
        estimator.fit(X_fit, y_fit)
        y_ = estimator.predict(X_eval)
        mse = mean_squared_error(y_eval, y_)
        print('-------------------mse%s' % (key), mse)
        r2 = estimator.score(X_eval, y_eval)
        print('+++++++++++++++++++r2_score%s' % (key), r2)
        # Blank separator between estimators (newline escape, not the literal '/n').
        print('\n')
# Hold out 20% of the training rows to compare a range of regressors.
cond = data_all_std['origin'] == 'train'
train_rows = data_all_std[cond]
X = train_rows.iloc[:, :-2]
y = train_rows['target']
data = train_test_split(X, y, test_size=0.2)
estimators = {
    'knn': KNeighborsRegressor(),
    'linear': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elasticnet': ElasticNet(),
    'forest': RandomForestRegressor(),
    'gbdt': GradientBoostingRegressor(),
    'ada': AdaBoostRegressor(),
    'extreme': ExtraTreesRegressor(),
    'svm_rbf': SVR(kernel='rbf'),
    'svm_poly': SVR(kernel='poly'),
    'light': LGBMRegressor(),
    'xgb': XGBRegressor(),
}
# Author's note for this data: KNN, Lasso, ElasticNet, SVM_poly
# (presumably the weak performers - confirm against the printed scores).
detect_model(estimators, data)

天池工业蒸汽量预测

# Keep only the estimators retained after the comparison above
# (the plain linear models are left out).
estimators = {}
estimators['forest'] = RandomForestRegressor()
estimators['gbdt'] = GradientBoostingRegressor()
estimators['ada'] = AdaBoostRegressor()
estimators['extreme'] = ExtraTreesRegressor()
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor()
estimators['xgb'] = XGBRegressor()

# Explicit copies: new columns are added to these frames below, and assigning
# into the result of chained indexing triggers pandas' SettingWithCopyWarning.
cond = data_all_std['origin'] == 'train'
X_train = data_all_std[cond].iloc[:, :-2].copy()
y_train = data_all_std[cond]['target']

cond = data_all_std['origin'] == 'test'
X_test = data_all_std[cond].iloc[:, :-2].copy()

# Stage 1: average the predictions of all estimators.
y_pred = []
for model in estimators.values():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    y_pred.append(y_)

y_ = np.mean(y_pred, axis=0)
pd.Series(y_).to_csv('./ensemble2.txt', index=False)

# Stage 2: feed each model's prediction back in as a new feature, so the
# next round can learn from the gap between prediction and truth.
# NOTE(review): the train-side feature is an in-sample prediction, which can
# leak the target; out-of-fold predictions would be the safer choice.
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_train)
    X_train[key] = y_
    y_ = model.predict(X_test)
    X_test[key] = y_

# Stage 3: refit on the augmented features and average again.
y_pred = []
for model in estimators.values():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    y_pred.append(y_)

y_ = np.mean(y_pred, axis=0)
pd.Series(y_).to_csv('./ensemble3.txt', index=False)
sns.distplot(y_)

天池工业蒸汽量预测

# Notebook cell: mean of the averaged predictions.
y_.mean()

天池工业蒸汽量预测

# Notebook cell: standard deviation of the averaged predictions.
y_.std()

天池工业蒸汽量预测

# Add Gaussian noise (standard deviation 0.1) to the predictions.
# The noise length follows y_ instead of a hard-coded row count, so a
# different number of test rows does not cause a shape mismatch.
y_ += np.random.randn(len(y_)) * 0.1
pd.Series(y_).to_csv('./ensemble4.txt', index=False)

对数据进行归一化

# Notebook cell: preview the combined frame before min-max normalization.
data_all.head()

天池工业蒸汽量预测

# Min-max scale every column except the last two.
minmaxscaler = MinMaxScaler()
data = data_all.iloc[:, :-2]
data3 = minmaxscaler.fit(data).transform(data)
data3

天池工业蒸汽量预测

# Normalized data, wrapped back into a DataFrame with the feature names.

feature_cols = data_all.columns[:-2]
data_all_norm = pd.DataFrame(data=data3, columns=feature_cols)
data_all_norm

天池工业蒸汽量预测

# Join the last two columns of data_all back on, matching rows by index.
data_all_norm = data_all_norm.merge(
    data_all.iloc[:, -2:],
    left_index=True,
    right_index=True,
)
data_all_norm.describe()

天池工业蒸汽量预测

def scale_minmax(data):
    """Linearly rescale *data* so its minimum maps to 0 and its maximum to 1.

    *data* must support ``min()``, ``max()`` and element-wise arithmetic
    (for example a NumPy array or a pandas Series).
    """
    lowest = data.min()
    span = data.max() - lowest
    return (data - lowest) / span
# Diagnostic plots for continuous variables: six panels per feature,
# three for the raw values and three for the Box-Cox transformed values.
from scipy import stats
fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for col in data_all_norm.columns[:10]:
    # Keep only rows where both the feature and the target are present.
    dat = data_all_norm[[col, 'target']].dropna()

    # Panel 1: distribution of the raw feature with a fitted normal curve.
    # (The titles use the loop variable `col`; the name `var` used previously
    # is never defined and raised NameError.)
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)
    plt.title(col + 'Original')

    # Panel 2: probability plot, titled with the skewness (a measure of
    # asymmetry relative to a normal distribution).
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 3: scatter of feature vs. target, titled with their correlation.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))

    # --- From here on the data is transformed ---
    # Panel 4: distribution after Box-Cox (shifted by +1 before the
    # transform) and min-max rescaling.
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(col + ' Tramsformed')
    plt.xlabel('')

    # Panel 5: probability plot / skewness of the transformed values.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 6: scatter of transformed feature vs. target.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))

天池工业蒸汽量预测

# Apply the Box-Cox transform to every feature column.
# It is a common transform in statistical modelling that makes data more
# normally distributed.
for col in data_all_norm.columns[:-2]:
    # Shift by +1 because the min-max scaled data has a minimum of 0;
    # stats.boxcox returns two values, both captured here.
    shifted = data_all_norm[col] + 1
    boxcox, maxlog = stats.boxcox(shifted)
    # Rescale the transformed column back into [0, 1].
    data_all_norm[col] = scale_minmax(boxcox)

过滤异常值

ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])

cond = data_all_norm['origin'] == 'train'

X_train = data_all_norm[cond].iloc[:, :-2]
# Ground truth
y_train = data_all_norm[cond]['target']
# A fitted model never matches the targets perfectly ...
ridge.fit(X_train, y_train)
# ... so predictions deviate from the truth; samples whose deviation exceeds
# one standard deviation of the target are treated as outliers.
y_ = ridge.predict(X_train)

cond = abs(y_ - y_train) > y_train.std()
# Call sum() so the outlier count is printed, not the bound method object.
print(cond.sum())
# Plot the fit and highlight the flagged outliers in red.
plt.figure(figsize=(12, 6))
# Panel 1: truth vs. prediction.
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)

# Panel 2: truth vs. residual.
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')

# Panel 3: residual histogram, with the outliers' residuals overlaid.
axes = plt.subplot(1, 3, 3)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')

天池工业蒸汽量预测

# Remove the flagged outliers from the normalized frame.
index = cond[cond].index
data_all_norm.drop(index=index, inplace=True)

# Rebuild the train / test matrices from the cleaned frame.
cond = data_all_norm['origin'] == 'train'
train_rows = data_all_norm[cond]
X_train = train_rows.iloc[:, :-2]
y_train = train_rows['target']

cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:, :-2]

# Only the estimators kept after the earlier comparison are used
# (the plain linear models are left out).
estimators = {
    'forest': RandomForestRegressor(n_estimators=300),
    'gbdt': GradientBoostingRegressor(n_estimators=300),
    'ada': AdaBoostRegressor(n_estimators=300),
    'extreme': ExtraTreesRegressor(n_estimators=300),
    'svm_rbf': SVR(kernel='rbf'),
    'light': LGBMRegressor(n_estimators=300),
    'xgb': XGBRegressor(n_estimators=300),
}

# Fit each model, collect its test predictions, then average them.
result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    result.append(y_)

y_ = np.mean(result, axis=0)

pd.Series(y_).to_csv('./norm.txt', index=False)
标签:

未经允许不得转载:作者:1138-魏同学, 转载或复制请以 超链接形式 并注明出处 拜师资源博客
原文地址:《天池工业蒸汽量预测》 发布于2020-10-30

分享到:
赞(0) 打赏

评论 抢沙发

评论前必须登录!

  注册



长按图片转发给朋友

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏

Vieu3.3主题
专业打造轻量级个人企业风格博客主题!专注于前端开发,全站响应式布局自适应模板。

登录

忘记密码 ?

您也可以使用第三方帐号快捷登录

Q Q 登 录
微 博 登 录