数据科学包-Day2-pandas(一)

1026-徐同学

发表文章数:41

首页 » 数据科学库 » 正文

pandas

快速入门

In [1]: import pandas as pd

In [2]: import numpy as np

In [5]: s = pd.Series([1,3,5,np.nan,8,4])

In [6]: s
Out[6]:
0    1.0
1    3.0
2    5.0
3    NaN
4    8.0
5    4.0
dtype: float64

In [7]: dates = pd.date_range('20200513',periods = 6)

In [8]: dates
Out[8]:
DatetimeIndex(['2020-05-13', '2020-05-14', '2020-05-15', '2020-05-16',
               '2020-05-17', '2020-05-18'],
              dtype='datetime64[ns]', freq='D')

In [10]: data = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))

In [11]: data
Out[11]:
                   A         B         C         D
2020-05-13  0.007099  0.577779  0.505243 -0.371364
2020-05-14 -1.152069  0.126131  0.521131  0.626994
2020-05-15  1.220163  1.642453 -0.395683 -0.202970
2020-05-16  0.109201 -1.749882  1.761463 -1.048466
2020-05-17  0.174510 -0.201592 -0.014987  0.417585
2020-05-18 -0.387181  0.261729  0.543224 -0.508142


In [12]: data.head(2)
Out[12]:
                   A         B         C         D
2020-05-13  0.007099  0.577779  0.505243 -0.371364
2020-05-14 -1.152069  0.126131  0.521131  0.626994

In [13]: data.index
Out[13]:
DatetimeIndex(['2020-05-13', '2020-05-14', '2020-05-15', '2020-05-16',
               '2020-05-17', '2020-05-18'],
              dtype='datetime64[ns]', freq='D')

In [14]: data.columns
Out[14]: Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]: data.values
Out[15]:
array([[ 0.00709894,  0.57777939,  0.50524292, -0.37136365],
       [-1.15206909,  0.12613115,  0.5211315 ,  0.62699405],
       [ 1.220163  ,  1.64245282, -0.3956833 , -0.20297029],
       [ 0.10920079, -1.74988173,  1.76146273, -1.04846558],
       [ 0.17451048, -0.20159239, -0.01498698,  0.41758489],
       [-0.38718054,  0.26172943,  0.5432244 , -0.50814176]])

In [16]: data.describe()
Out[16]:
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.004713  0.109436  0.486732 -0.181060
std    0.775663  1.108761  0.729388  0.617640
min   -1.152069 -1.749882 -0.395683 -1.048466
25%   -0.288611 -0.119662  0.115070 -0.473947
50%    0.058150  0.193930  0.513187 -0.287167
75%    0.158183  0.498767  0.537701  0.262446
max    1.220163  1.642453  1.761463  0.626994

In [17]: data.T
Out[17]:
   2020-05-13  2020-05-14  2020-05-15  2020-05-16  2020-05-17  2020-05-18
A    0.007099   -1.152069    1.220163    0.109201    0.174510   -0.387181
B    0.577779    0.126131    1.642453   -1.749882   -0.201592    0.261729
C    0.505243    0.521131   -0.395683    1.761463   -0.014987    0.543224
D   -0.371364    0.626994   -0.202970   -1.048466    0.417585   -0.508142

In [18]: data.T.shape
Out[18]: (4, 6)

In [19]: data.sort_index(axis=1)
Out[19]:
                   A         B         C         D
2020-05-13  0.007099  0.577779  0.505243 -0.371364
2020-05-14 -1.152069  0.126131  0.521131  0.626994
2020-05-15  1.220163  1.642453 -0.395683 -0.202970
2020-05-16  0.109201 -1.749882  1.761463 -1.048466
2020-05-17  0.174510 -0.201592 -0.014987  0.417585
2020-05-18 -0.387181  0.261729  0.543224 -0.508142

In [20]: data.sort_index(axis=1,ascending=False)
Out[20]:
                   D         C         B         A
2020-05-13 -0.371364  0.505243  0.577779  0.007099
2020-05-14  0.626994  0.521131  0.126131 -1.152069
2020-05-15 -0.202970 -0.395683  1.642453  1.220163
2020-05-16 -1.048466  1.761463 -1.749882  0.109201
2020-05-17  0.417585 -0.014987 -0.201592  0.174510
2020-05-18 -0.508142  0.543224  0.261729 -0.387181

In [21]: data.iloc[2:4]
Out[21]:
                   A         B         C         D
2020-05-15  1.220163  1.642453 -0.395683 -0.202970
2020-05-16  0.109201 -1.749882  1.761463 -1.048466

创建数据集对象

# A Series can be thought of as a one-dimensional array
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

# A DataFrame can be thought of as a two-dimensional array, and the index
# format can be specified; here a range of six consecutive dates is built
dates = pd.date_range('20160301', periods=6)
dates

查看数据

# Build the sample dataset: n_rows rows of random values, indexed by
# consecutive dates, with four columns named A-D
n_rows = 6
dates = pd.date_range('20160301', periods=n_rows)
df = pd.DataFrame(np.random.randn(n_rows, 4), index=dates, columns=list('ABCD'))
df

df.shape  # (number of rows, number of columns)
df.head()  # first rows, default count
df.head(3)  # first 3 rows
df.tail()  # last rows, default count
df.tail(2)  # last 2 rows
df.index  # row labels (the dates)
df.columns  # column labels
df.values  # underlying data as a NumPy array
df.describe()  # summary statistics per column
df.T  # transpose: rows become columns
df.sort_values(by='C')  # rows ordered by the values in column C

数据选择

df['A']  # select a single column
df[2:4]  # slice rows by integer position
df['20160302':'20160305']  # slice rows by date label

通过标签选择

df.loc['20160301']  # the row for one date label
type(df.loc['20160301'])  # type of that single-row selection
df.loc[:, ['A', 'B']]  # all rows, columns A and B
df.loc['20160301':'20160305', ['A', 'B']]  # label slice of rows, columns A and B
df.loc['2016-03-01', 'A']  # a single cell by row and column label
df.at[pd.Timestamp('2016-03-01'), 'A']  # the same cell via the scalar accessor .at
# Per the original note, df.at['2016-03-01', 'A'] (plain string instead of a
# Timestamp) raises an error -- this may depend on the pandas version; verify.

通过位置选择

df.iloc[1]  # second row, by integer position
df.iloc[2:5, 0:2]  # rows 2-4 and the first two columns
df.iloc[1:5, :]  # rows 1-4, all columns
df.iloc[1, 1]  # single cell at row 1, column 1
df.iat[1, 1]  # the same cell via the scalar accessor .iat

布尔索引

df[df.A < 0]  # rows where column A is negative
df[df > 0]  # element-wise mask: cells that are not positive show as NaN
df['tag'] = ['a'] * 2 + ['b'] * 2 + ['c'] * 2  # add a label column: a, a, b, b, c, c
df
df[df.tag.isin(['a', 'c'])]  # rows whose tag is 'a' or 'c'

修改数据

# A Series of 0..5 indexed by six dates starting 2016-03-01
s = pd.Series(np.arange(6), index=pd.date_range('20160301', periods=6))
s
df['E'] = s  # add it as a new column E
df
df.loc['20160301', 'A'] = 0.2  # set one cell by row/column label
# NOTE(review): the original note read "df.['20160301', 'A'] = 0.2 will not have
# effect", which is not valid syntax; presumably plain df[...] indexing was
# meant -- confirm against the original notebook.
df
df.at[pd.Timestamp('20160301'), 'A'] = 0.4  # set the same cell via .at
df
df.iat[0, 0] = 0.6  # set the cell at integer position (0, 0)
df
df.loc[:, 'A'] = np.arange(10, 16)  # overwrite the whole of column A with 10..15
df
df2 = df.loc[:, ['B', 'C']].copy()  # independent copy of columns B and C
df2[df2 > 0] = -df2  # negate the positive entries so none remain positive
df2

处理丢失数据

# Fresh 6x4 frame of random values indexed by date
dates = pd.date_range('20160301', periods=6)
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df
# Keep only the first four dates and add a column E that df does not have
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1
df1.loc[dates[1:3], 'E'] = 1  # fill E for the 2nd and 3rd dates only
df1
df1.dropna(how='any')  # drop rows containing any NaN; result is not assigned, so df1 is shown unchanged next
df1
df1.fillna(value=5)  # replace NaN with 5; again the result is not assigned
df1
pd.isnull(df1)  # boolean mask marking the missing cells

统计

# Uses df, df1 and dates defined in the missing-data section
df1.mean()  # mean of each column
df1.mean(axis=1)  # mean of each row
df.mean(axis=1)
df.sum()  # sum of each column
df.sum(axis='columns')  # sum across the columns, i.e. one value per row
df.cumsum()  # cumulative sum down each column
# Date-indexed Series shifted by two positions
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s
df
df.sub(s, axis='index')  # subtract s from every column, matching on the row index
df.apply(np.cumsum)  # apply a function to each column
df.apply(lambda x: x.max() - x.min())  # range (max - min) of each column
# Ten random integers drawn from 0..6
s = pd.Series(np.random.randint(0, 7, size=10))
s
s.value_counts()  # how often each value occurs
s.mode()  # most frequent value(s)

数据合并

# Split a 10x4 frame of random values into three row slices and check that
# concatenating the slices reproduces the original frame.
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df
df.iloc[:3]
df.iloc[3:7]
df.iloc[7:]
df1 = pd.concat([df.iloc[:3], df.iloc[3:7], df.iloc[7:]])
df1
(df1 == df).all().all()
# SQL-style join
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
# SELECT * FROM left INNER JOIN right ON left.key = right.key;
pd.merge(left, right, on='key')
# Append a Series as one extra row. DataFrame.append was removed in pandas 2.0,
# so turn the Series into a one-row frame and concatenate it instead;
# ignore_index=True renumbers the rows 0..10 as append(..., ignore_index=True) did.
s = pd.Series(np.random.randint(1, 5, size=4), index=list('ABCD'))
df_appended = pd.concat([df, s.to_frame().T], ignore_index=True)
df_appended

分组统计

# Two string key columns (A, B) and two columns of random values (C, D)
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
df.groupby('A').sum()  # sums per value of A
# Alternative grouping with the key order swapped:
# df.groupby(['B', 'A']).sum()
df.groupby(['A', 'B']).sum()  # sums per (A, B) combination

数据整形

# Pair the two lists element-wise into (first, second) tuples for a two-level index
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df
df.loc['bar']  # all rows whose first-level label is 'bar'
df.loc['bar'].loc['one']  # then the row whose second-level label is 'one'
stacked = df.stack()  # move the column labels into the index
stacked
stacked.loc['bar'].loc['one'].loc['A']  # drill down level by level to one value
stacked.unstack()  # undo the stack (default level)
stacked.unstack().unstack()  # unstack a further index level
stacked.unstack(1)  # unstack index level 1 (named 'second') instead

数据透视表

# Twelve rows: three string key columns (A, B, C) and two columns of random values (D, E)
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                    'B' : ['A', 'B', 'C'] * 4,
                    'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                    'D' : np.random.randn(12),
                    'E' : np.random.randn(12)})
df
# Values of D laid out with (A, B) as rows and the values of C as columns
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
# Values of E with A as rows and the values of C as columns
pd.pivot_table(df, values=['E'], index=['A'], columns=['C'])
# Cross-check with a groupby over the rows where A == 'one'. numeric_only=True is
# required on pandas >= 2.0: the string columns A and B are no longer dropped
# silently and would make mean() raise TypeError.
df[df.A=='one'].groupby('C').mean(numeric_only=True)

时间序列

# 600 timestamps at one-second frequency starting 2016-03-01
rng = pd.date_range('20160301', periods=600, freq='s')
rng
# One random integer from 0..499 per timestamp
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
# Downsample into 2-minute bins and sum each bin. The `how=` keyword of
# resample() is no longer accepted; call the aggregation on the resampler.
ts.resample('2Min').sum()

类别数据

# Six records with a raw letter grade
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df
# Convert the raw grades to a categorical column
df["grade"] = df["raw_grade"].astype("category")
df
df["grade"].cat.categories
# Give the categories (a, b, e) readable names. Assigning to .cat.categories
# was removed in pandas 2.0; rename_categories returns the renamed column,
# which is stored back into the frame.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df
# Sorting follows the category order, not the alphabetical order of the labels
df.sort_values(by='grade', ascending=True)
# Rows per category; observed=False keeps the historical default of listing
# every category explicitly
df.groupby("grade", observed=False).size()

画图

# 1000 random values indexed by consecutive dates starting 2000-01-01
ts = pd.Series(np.random.randn(1000), index=pd.date_range('20000101', periods=1000))
ts = ts.cumsum()  # running total of the values
ts
ts.plot()  # line plot of the series (needs a plotting backend, presumably matplotlib)

数据读写

# 100x4 frame of random values to round-trip through a CSV file
df = pd.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))
df
df.to_csv('data.csv')  # writes data.csv relative to the current working directory
# Reading without index_col would bring the saved index back as an ordinary column:
# pd.read_csv('data.csv')
pd.read_csv('data.csv', index_col=0)  # use the first CSV column as the index

未经允许不得转载:作者:1026-徐同学, 转载或复制请以 超链接形式 并注明出处 拜师资源博客
原文地址:《数据科学包-Day2-pandas(一)》 发布于2020-05-16

分享到:
赞(0) 打赏

评论 抢沙发

评论前必须登录!

  注册



长按图片转发给朋友

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏

Vieu3.3主题
专业打造轻量级个人企业风格博客主题!专注于前端开发,全站响应式布局自适应模板。

登录

忘记密码 ?

您也可以使用第三方帐号快捷登录

Q Q 登 录
微 博 登 录