pandas基础(1)

788-彭同学

发表文章数:64

首页 » 算法 » 正文

pandas最重要的数据结构:DataFrame

包括行索引index,列索引columns,以及存放数据的二维数组ndarray。
行索引是一个Index对象(本文示例中为DatetimeIndex);DataFrame的每一列(或单独取出的一行)是一个Series一维数组

# 设置为 inline 风格
%matplotlib inline
# 包导入
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

创建数据集对象

# Series 对象可以理解为一维数组
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s
0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64
# DataFrame 对象可以理解为二维数组,可以指定索引格式
dates = pd.date_range('20160301', periods=6)
dates
DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
               '2016-03-05', '2016-03-06'],
              dtype='datetime64[ns]', freq='D')

第一种:DataFrame 对象基本创建方法:二维数组 + 行索引index(这里传入上面创建的DatetimeIndex)+ 列索引columns(传入一个列表)

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
A B C D
2016-03-01 1.188983 -1.150119 -0.700588 0.439065
2016-03-02 -2.041544 1.084507 -0.335441 1.969754
2016-03-03 1.204151 -1.277714 -0.230671 0.629063
2016-03-04 -0.352351 -1.701585 -0.034294 -0.330139
2016-03-05 0.627601 -0.292939 0.457975 2.262402
2016-03-06 -1.121869 -0.533223 0.627452 0.412665
df.values 以 ndarray 类型返回 df 中的全部元素
df.values
array([[ 1.18898298, -1.15011854, -0.70058776,  0.43906549],
       [-2.04154443,  1.08450747, -0.33544069,  1.96975377],
       [ 1.2041512 , -1.27771421, -0.23067059,  0.62906316],
       [-0.35235094, -1.70158492, -0.03429361, -0.33013878],
       [ 0.62760104, -0.29293918,  0.45797463,  2.26240237],
       [-1.12186945, -0.53322343,  0.6274522 ,  0.41266481]])

第二种创建方法:字典


# 使用字典来创建:key 为 DataFrame 的列;value 为对应列下的值
df = pd.DataFrame({
                  'A': 1,
                  'B': pd.Timestamp('20160301'),
                  'C': range(4),
                  'D': np.arange(5, 9),
                  'E': 'text',
                  'F': ['AA', 'BB', 'CC', 'DD']})
df
A B C D E F
0 1 2016-03-01 0 5 text AA
1 1 2016-03-01 1 6 text BB
2 1 2016-03-01 2 7 text CC
3 1 2016-03-01 3 8 text DD
df.dtypes
A             int64
B    datetime64[ns]
C             int64
D             int64
E            object
F            object
dtype: object
df.A
0    1
1    1
2    1
3    1
Name: A, dtype: int64
type(df.A)
pandas.core.series.Series

查看数据

# 创建数据集
n_rows = 6
dates = pd.date_range('20160301', periods=n_rows)
df = pd.DataFrame(np.random.randn(n_rows, 4), index=dates, columns=list('ABCD'))
df
A B C D
2016-03-01 1.313419 0.826457 -1.574146 0.525008
2016-03-02 0.028397 -1.009349 0.327014 0.918248
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923
df.shape
(6, 4)
# 括号内不传参数时,默认显示前 5 行
df.head()
A B C D
2016-03-01 1.313419 0.826457 -1.574146 0.525008
2016-03-02 0.028397 -1.009349 0.327014 0.918248
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656
df.head(3)
A B C D
2016-03-01 1.313419 0.826457 -1.574146 0.525008
2016-03-02 0.028397 -1.009349 0.327014 0.918248
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
df.tail()
A B C D
2016-03-02 0.028397 -1.009349 0.327014 0.918248
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923
df.tail(2)
A B C D
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923
df.index
DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
               '2016-03-05', '2016-03-06'],
              dtype='datetime64[ns]', freq='D')
df.columns
Index([u'A', u'B', u'C', u'D'], dtype='object')
df.values
array([[ 1.31341924,  0.82645709, -1.57414606,  0.52500758],
       [ 0.02839742, -1.00934929,  0.32701362,  0.91824786],
       [-0.85700833, -1.68269525,  0.646229  , -0.18337746],
       [-1.11288513, -1.49166212, -1.11482404, -0.11561882],
       [-0.44871305, -0.16365107, -1.23029491,  1.10665563],
       [-0.26786722,  0.09231292, -0.48023763, -0.80992272]])
df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.224110 -0.571431 -0.571043 0.240165
std 0.856808 0.983304 0.898112 0.734900
min -1.112885 -1.682695 -1.574146 -0.809923
25% -0.754935 -1.371084 -1.201427 -0.166438
50% -0.358290 -0.586500 -0.797531 0.204694
75% -0.045669 0.028322 0.125201 0.819938
max 1.313419 0.826457 0.646229 1.106656

df对象的转置:

df.T
2016-03-01 00:00:00 2016-03-02 00:00:00 2016-03-03 00:00:00 2016-03-04 00:00:00 2016-03-05 00:00:00 2016-03-06 00:00:00
A 1.313419 0.028397 -0.857008 -1.112885 -0.448713 -0.267867
B 0.826457 -1.009349 -1.682695 -1.491662 -0.163651 0.092313
C -1.574146 0.327014 0.646229 -1.114824 -1.230295 -0.480238
D 0.525008 0.918248 -0.183377 -0.115619 1.106656 -0.809923
df.T.shape
(4, 6)
df.sort_index(axis=1, ascending=False)
D C B A
2016-03-01 0.525008 -1.574146 0.826457 1.313419
2016-03-02 0.918248 0.327014 -1.009349 0.028397
2016-03-03 -0.183377 0.646229 -1.682695 -0.857008
2016-03-04 -0.115619 -1.114824 -1.491662 -1.112885
2016-03-05 1.106656 -1.230295 -0.163651 -0.448713
2016-03-06 -0.809923 -0.480238 0.092313 -0.267867

按 C 列的值升序排列

df.sort_values(by='C')
A B C D
2016-03-01 1.313419 0.826457 -1.574146 0.525008
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923
2016-03-02 0.028397 -1.009349 0.327014 0.918248
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377

数据选择

df['A']
2016-03-01    1.313419
2016-03-02    0.028397
2016-03-03   -0.857008
2016-03-04   -1.112885
2016-03-05   -0.448713
2016-03-06   -0.267867
Freq: D, Name: A, dtype: float64
df[2:4]
A B C D
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619

注意:按行索引标签切片时,结果包含结束标签(闭区间),这与普通的按位置切片(不包含结束位置)不同

df['20160302':'20160305']
A B C D
2016-03-02 0.028397 -1.009349 0.327014 0.918248
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656

通过标签选择

loc[]、iloc[] 是索引器(使用方括号而不是圆括号调用),效率比上面直接用 [] 选择要高,更常用

df.loc['20160301']
A    1.313419
B    0.826457
C   -1.574146
D    0.525008
Name: 2016-03-01 00:00:00, dtype: float64
type(df.loc['20160301'])
pandas.core.series.Series
df.loc[:, ['A', 'B']]
A B
2016-03-01 1.313419 0.826457
2016-03-02 0.028397 -1.009349
2016-03-03 -0.857008 -1.682695
2016-03-04 -1.112885 -1.491662
2016-03-05 -0.448713 -0.163651
2016-03-06 -0.267867 0.092313
df.loc['20160301':'20160305', ['A', 'B']]
A B
2016-03-01 1.313419 0.826457
2016-03-02 0.028397 -1.009349
2016-03-03 -0.857008 -1.682695
2016-03-04 -1.112885 -1.491662
2016-03-05 -0.448713 -0.163651
df.loc['2016-03-01', 'A']
1.3134192362700037
df.at[pd.Timestamp('2016-03-01'), 'A']
# df.at['2016-03-01', 'A'] will raise error
1.3134192362700037

通过位置选择

df.iloc[1]
A    0.028397
B   -1.009349
C    0.327014
D    0.918248
Name: 2016-03-02 00:00:00, dtype: float64
df.iloc[2:5, 0:2]
A B
2016-03-03 -0.857008 -1.682695
2016-03-04 -1.112885 -1.491662
2016-03-05 -0.448713 -0.163651
df.iloc[1:5, :]
A B C D
2016-03-02 0.028397 -1.009349 0.327014 0.918248
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656
df.iloc[1, 1]
-1.009349292057921
df.iat[1, 1]
-1.009349292057921

布尔索引

df[df.A < 0]
A B C D
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923
df[df > 0]
A B C D
2016-03-01 1.313419 0.826457 NaN 0.525008
2016-03-02 0.028397 NaN 0.327014 0.918248
2016-03-03 NaN NaN 0.646229 NaN
2016-03-04 NaN NaN NaN NaN
2016-03-05 NaN NaN NaN 1.106656
2016-03-06 NaN 0.092313 NaN NaN
df['tag'] = ['a'] * 2 + ['b'] * 2 + ['c'] * 2
df
A B C D tag
2016-03-01 1.313419 0.826457 -1.574146 0.525008 a
2016-03-02 0.028397 -1.009349 0.327014 0.918248 a
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377 b
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619 b
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656 c
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923 c
df[df.tag.isin(['a', 'c'])]
A B C D tag
2016-03-01 1.313419 0.826457 -1.574146 0.525008 a
2016-03-02 0.028397 -1.009349 0.327014 0.918248 a
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656 c
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923 c

修改数据

df
A B C D tag
2016-03-01 1.313419 0.826457 -1.574146 0.525008 a
2016-03-02 0.028397 -1.009349 0.327014 0.918248 a
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377 b
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619 b
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656 c
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923 c
s = pd.Series(np.arange(6), index=pd.date_range('20160301', periods=6))
s
2016-03-01    0
2016-03-02    1
2016-03-03    2
2016-03-04    3
2016-03-05    4
2016-03-06    5
Freq: D, dtype: int64
df['E'] = s
df
A B C D tag E
2016-03-01 1.313419 0.826457 -1.574146 0.525008 a 0
2016-03-02 0.028397 -1.009349 0.327014 0.918248 a 1
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377 b 2
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619 b 3
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656 c 4
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923 c 5
df.loc['20160301', 'A'] = 0.2
# NOTE: df.['20160301', 'A'] = 0.2 is invalid syntax; to set a single cell by label, use df.loc[row_label, col_label] as above
df
A B C D tag E
2016-03-01 0.200000 0.826457 -1.574146 0.525008 a 0
2016-03-02 0.028397 -1.009349 0.327014 0.918248 a 1
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377 b 2
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619 b 3
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656 c 4
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923 c 5
df.at[pd.Timestamp('20160301'), 'A'] = 0.4
df
A B C D tag E
2016-03-01 0.400000 0.826457 -1.574146 0.525008 a 0
2016-03-02 0.028397 -1.009349 0.327014 0.918248 a 1
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377 b 2
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619 b 3
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656 c 4
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923 c 5
df.iat[0, 0] = 0.6
df
A B C D tag E
2016-03-01 0.600000 0.826457 -1.574146 0.525008 a 0
2016-03-02 0.028397 -1.009349 0.327014 0.918248 a 1
2016-03-03 -0.857008 -1.682695 0.646229 -0.183377 b 2
2016-03-04 -1.112885 -1.491662 -1.114824 -0.115619 b 3
2016-03-05 -0.448713 -0.163651 -1.230295 1.106656 c 4
2016-03-06 -0.267867 0.092313 -0.480238 -0.809923 c 5
df.loc[:, 'A'] = np.arange(10, 16)
df
A B C D tag E
2016-03-01 10 0.826457 -1.574146 0.525008 a 0
2016-03-02 11 -1.009349 0.327014 0.918248 a 1
2016-03-03 12 -1.682695 0.646229 -0.183377 b 2
2016-03-04 13 -1.491662 -1.114824 -0.115619 b 3
2016-03-05 14 -0.163651 -1.230295 1.106656 c 4
2016-03-06 15 0.092313 -0.480238 -0.809923 c 5
df2 = df.loc[:, ['B', 'C']].copy()
df2[df2 > 0] = -df2
df2
B C
2016-03-01 -0.826457 -1.574146
2016-03-02 -1.009349 -0.327014
2016-03-03 -1.682695 -0.646229
2016-03-04 -1.491662 -1.114824
2016-03-05 -0.163651 -1.230295
2016-03-06 -0.092313 -0.480238

拜师教育学员文章:作者:788-彭同学, 转载或复制请以 超链接形式 并注明出处 拜师资源博客
原文地址:《pandas基础(1)》 发布于2020-03-01

分享到:
赞(0) 打赏

评论 抢沙发

评论前必须登录!

  注册



长按图片转发给朋友

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏

Vieu3.3主题
专业打造轻量级个人企业风格博客主题!专注于前端开发,全站响应式布局自适应模板。

登录

忘记密码 ?

您也可以使用第三方帐号快捷登录

Q Q 登 录
微 博 登 录