pandas简单操作(2):处理丢失数据、数据运算、数据合并、数据分组等

1334-甘同学

发表文章数:18

热门标签

, ,
首页 » 数据科学库 » 正文

pandas简单操作(2):处理丢失数据、数据运算、数据合并、数据分组等

input

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as plt
# Six consecutive daily timestamps starting 2016-03-01, used as the row index.
dates = pd.date_range(start='20160301', periods=6)
# 6x4 frame of standard-normal samples, indexed by `dates`, with columns A-D.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

input

df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1

output

A B C D E
2016-03-01 -0.589989 0.781182 -1.539535 -0.564109 NaN
2016-03-02 -0.013672 1.247939 -0.079588 1.121198 NaN
2016-03-03 0.502634 -0.022906 -0.558544 -0.057238 NaN
2016-03-04 1.131296 -0.722159 1.838251 1.780309 NaN

input

df1.loc[dates[1:3],'E'] = 2
df1

output

A B C D E
2016-03-01 -0.589989 0.781182 -1.539535 -0.564109 NaN
2016-03-02 -0.013672 1.247939 -0.079588 1.121198 2.0
2016-03-03 0.502634 -0.022906 -0.558544 -0.057238 2.0
2016-03-04 1.131296 -0.722159 1.838251 1.780309 NaN

input

df1.dropna() #把空数据扔掉

output

A B C D E
2016-03-02 -0.013672 1.247939 -0.079588 1.121198 2.0
2016-03-03 0.502634 -0.022906 -0.558544 -0.057238 2.0

input

df1.fillna(value=5) #把空数据替换掉

output

A B C D E
2016-03-01 -0.589989 0.781182 -1.539535 -0.564109 5.0
2016-03-02 -0.013672 1.247939 -0.079588 1.121198 2.0
2016-03-03 0.502634 -0.022906 -0.558544 -0.057238 2.0
2016-03-04 1.131296 -0.722159 1.838251 1.780309 5.0

input

pd.isnull(df1) #判断空数据

output

A B C D E
2016-03-01 False False False False True
2016-03-02 False False False False False
2016-03-03 False False False False False
2016-03-04 False False False False True

input

 pd.isnull(df1).any() #判断每一列有没有空数据

output

A    False
B    False
C    False
D    False
E     True
dtype: bool

input

pd.isnull(df1).any().any() #判断整个表有没有空数据

output

True

input

df1.mean() #空数据是不参与计算的

output

A    0.257567
B    0.321014
C   -0.084854
D    0.570040
E    2.000000
dtype: float64

input

 df1.mean(axis=1)  #按行求平均值

output

2016-03-01   -0.478113
2016-03-02    0.855175
2016-03-03    0.372789
2016-03-04    1.006924
Freq: D, dtype: float64

input

# Build a Series on the same date index as df, then shift the values down by
# two rows: the first two slots become NaN and the last two values drop off.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

output

2016-03-01    NaN
2016-03-02    NaN
2016-03-03    1.0
2016-03-04    3.0
2016-03-05    5.0
2016-03-06    NaN
Freq: D, dtype: float64

input

df

output

A B C D
2016-03-01 -0.589989 0.781182 -1.539535 -0.564109
2016-03-02 -0.013672 1.247939 -0.079588 1.121198
2016-03-03 0.502634 -0.022906 -0.558544 -0.057238
2016-03-04 1.131296 -0.722159 1.838251 1.780309
2016-03-05 0.167379 -1.178604 0.554787 -1.018810
2016-03-06 -1.870176 -0.277785 0.946150 -1.130179

input

df.sub(s,axis='index') #df的每一列都减去s这个序列

output

A B C D
2016-03-01 NaN NaN NaN NaN
2016-03-02 NaN NaN NaN NaN
2016-03-03 -0.497366 -1.022906 -1.558544 -1.057238
2016-03-04 -1.868704 -3.722159 -1.161749 -1.219691
2016-03-05 -4.832621 -6.178604 -4.445213 -6.018810
2016-03-06 NaN NaN NaN NaN

input

 df

output

A B C D
2016-03-01 -0.589989 0.781182 -1.539535 -0.564109
2016-03-02 -0.013672 1.247939 -0.079588 1.121198
2016-03-03 0.502634 -0.022906 -0.558544 -0.057238
2016-03-04 1.131296 -0.722159 1.838251 1.780309
2016-03-05 0.167379 -1.178604 0.554787 -1.018810
2016-03-06 -1.870176 -0.277785 0.946150 -1.130179

input

# Running (cumulative) sum down each column: the n-th entry is the sum of the
# first n entries, e.g. the third value is the total of the first three.
df.apply(np.cumsum)

output

A B C D
2016-03-01 -0.589989 0.781182 -1.539535 -0.564109
2016-03-02 -0.603661 2.029122 -1.619123 0.557089
2016-03-03 -0.101027 2.006216 -2.177667 0.499851
2016-03-04 1.030269 1.284057 -0.339417 2.280160
2016-03-05 1.197648 0.105453 0.215370 1.261350
2016-03-06 -0.672528 -0.172332 1.161520 0.131171

input

# For each column, return its range: the column's maximum minus its minimum.
df.apply(lambda x : x.max()-x.min())

output

A    3.001471
B    2.426544
C    3.377786
D    2.910488
dtype: float64

input

def _sum(x):
    print(type(x))
    return x.sum()
df.apply(_sum)   #了解apply函数的用法

output

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>





A   -0.672528
B   -0.172332
C    1.161520
D    0.131171
dtype: float64

input

 s = pd.Series(np.random.randint(10,20,size=20)) #创建一个序列
 s

output

0     12
1     10
2     12
3     19
4     18
5     12
6     14
7     10
8     14
9     13
10    14
11    14
12    17
13    18
14    14
15    15
16    12
17    19
18    15
19    10
dtype: int64

input

s.value_counts() #value_counts(): 统计每个数出现了多少次

output

14    5
12    4
10    3
19    2
18    2
15    2
17    1
13    1
dtype: int64

input

# mode(): the value that occurs most often. In the output below, 0 is just the
# index label and 14 is the most frequent value.
s.mode()

output

0    14
dtype: int64

input

df = pd.DataFrame(np.random.randn(10,4),columns = list('ABCD'))
df

output

A B C D
0 0.336937 -1.536305 1.065385 0.443815
1 0.258999 -0.492226 0.031624 0.293882
2 1.170411 -0.769640 0.031939 0.410354
3 -0.664322 -0.767787 -0.173439 2.197244
4 1.874829 1.213179 0.697452 -0.463159
5 2.366758 -1.818289 -0.567515 -0.429355
6 1.113581 0.370977 -1.514833 0.768817
7 -0.481723 -0.342043 -0.261016 0.402664
8 1.585774 0.170456 -0.950196 0.780025
9 0.950888 -0.699442 0.076922 1.226366

input

df.iloc[:3] #取前三行

output

A B C D
0 0.336937 -1.536305 1.065385 0.443815
1 0.258999 -0.492226 0.031624 0.293882
2 1.170411 -0.769640 0.031939 0.410354

input

df.iloc[3:7]

output

A B C D
3 -0.664322 -0.767787 -0.173439 2.197244
4 1.874829 1.213179 0.697452 -0.463159
5 2.366758 -1.818289 -0.567515 -0.429355
6 1.113581 0.370977 -1.514833 0.768817

input

df.iloc[7:]

output

A B C D
7 -0.481723 -0.342043 -0.261016 0.402664
8 1.585774 0.170456 -0.950196 0.780025
9 0.950888 -0.699442 0.076922 1.226366

input

# concat: glue the three row slices back together, in their original order.
df1 = pd.concat([df.iloc[:3], df.iloc[3:7], df.iloc[7:]])

input

df == df1 #看看合并后的和原来的是否相等

output

A B C D
0 True True True True
1 True True True True
2 True True True True
3 True True True True
4 True True True True
5 True True True True
6 True True True True
7 True True True True
8 True True True True
9 True True True True

input

(df == df1).all().all() #比上面那个看起来简单

output

True

input

# Two small frames sharing a 'key' column; every key value is 'foo'.
left = pd.DataFrame(dict(key=['foo', 'foo'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'foo'], rval=[4, 5]))

input

left

output

key lval
0 foo 1
1 foo 2

input

right

output

key rval
0 foo 4
1 foo 5

input

pd.merge(left,right,on='key') # merge:通过中介key来把两个表合起来

output

key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5

input

s = pd.Series(np.random.randint(1,5,size=4),index=list('ABCD'))
s

output

A    2
B    4
C    2
D    3
dtype: int64

input

# Add the Series s as one new row at the bottom of df. This stacks rows (like
# concat); it does not join on a key the way merge does.
# DataFrame.append was removed in pandas 2.0, so turn s into a one-row frame
# (its labels A-D become the columns) and concatenate it instead.
pd.concat([df, s.to_frame().T], ignore_index=True)

output

A B C D
0 0.336937 -1.536305 1.065385 0.443815
1 0.258999 -0.492226 0.031624 0.293882
2 1.170411 -0.769640 0.031939 0.410354
3 -0.664322 -0.767787 -0.173439 2.197244
4 1.874829 1.213179 0.697452 -0.463159
5 2.366758 -1.818289 -0.567515 -0.429355
6 1.113581 0.370977 -1.514833 0.768817
7 -0.481723 -0.342043 -0.261016 0.402664
8 1.585774 0.170456 -0.950196 0.780025
9 0.950888 -0.699442 0.076922 1.226366
10 2.000000 4.000000 2.000000 3.000000

input

# Demo frame for groupby: two string key columns (A, B) and two columns of
# standard-normal samples (C, D).
df = pd.DataFrame(
    {
        'A': 'foo bar foo bar foo bar foo foo'.split(),
        'B': 'one one two three two two one three'.split(),
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)
df

output

A B C D
0 foo one 1.328673 1.462918
1 bar one 0.941917 0.617067
2 foo two 1.401242 -0.877970
3 bar three -0.913739 1.528186
4 foo two -0.843309 1.093093
5 bar two 0.572083 1.853178
6 foo one -0.725831 0.983818
7 foo three 0.006845 -0.916164

input

# Group the rows by the value in column A, then sum the numeric columns (C, D)
# within each group.
# numeric_only=True keeps the string column B out of the result; without it,
# pandas >= 2.0 would "sum" B by concatenating its strings.
df.groupby('A').sum(numeric_only=True)

output

C D
A
bar 0.600261 3.998431
foo 1.167620 1.745695

input

df.groupby(['A','B']).sum()

output

C D
A B
bar one 0.941917 0.617067
three -0.913739 1.528186
two 0.572083 1.853178
foo one 0.602842 2.446736
three 0.006845 -0.916164
two 0.557933 0.215123

input

df.groupby(['B','A']).sum()

output

C D
B A
one bar 0.941917 0.617067
foo 0.602842 2.446736
three bar -0.913739 1.528186
foo 0.006845 -0.916164
two bar 0.572083 1.853178
foo 0.557933 0.215123

未经允许不得转载:作者:1334-甘同学, 转载或复制请以 超链接形式 并注明出处 拜师资源博客
原文地址:《pandas简单操作(2):处理丢失数据、数据运算、数据合并、数据分组等》 发布于2020-11-06

分享到:
赞(0) 打赏

评论 抢沙发

评论前必须登录!

  注册



长按图片转发给朋友

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏

Vieu3.3主题
专业打造轻量级个人企业风格博客主题!专注于前端开发,全站响应式布局自适应模板。

登录

忘记密码 ?

您也可以使用第三方帐号快捷登录

Q Q 登 录
微 博 登 录