Python数据科学包(二)—— Pandas快速入门

756-周同学

发表文章数:47

首页 » 数据科学库 » 正文

一. 快速入门(一)

1. 创建数据

#创建 Series
s = pd.Series([1,3,5,np.NaN,8,4])
s
Out[6]: 
0    1.0
1    3.0
2    5.0
3    NaN
4    8.0
5    4.0
dtype: float64

#创建日期
dates = pd.date_range('20160301',periods=6)
dates
Out[10]: 
DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
               '2016-03-05', '2016-03-06'],
              dtype='datetime64[ns]', freq='D')
#创建数据
data = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
data
Out[12]: 
                   A         B         C         D
2016-03-01  1.544355 -0.822489  0.982824  1.031441
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474
2016-03-05 -1.361461  1.683070  0.042079 -0.805467
2016-03-06 -0.058033  0.189439 -1.066178 -1.374988
data.shape
Out[13]: (6, 4)
data.values
Out[14]: 
array([[ 1.54435512, -0.82248877,  0.98282361,  1.03144142],
       [-1.61576481, -1.59490856,  0.2369097 , -0.78761942],
       [ 0.90221017, -0.86321057, -0.09599111, -0.62759812],
       [-1.41592193, -0.53684182, -0.49959242,  0.17347383],
       [-1.361461  ,  1.68307009,  0.04207949, -0.80546661],
       [-0.05803274,  0.18943875, -1.06617839, -1.3749882 ]])

#通过字典创建 DataFrame
d = {'A':1,'B':pd.Timestamp('20130301'),'C':range(4),'D':np.arange(4)}
d
Out[16]: 
{'A': 1,
 'B': Timestamp('2013-03-01 00:00:00'),
 'C': range(0, 4),
 'D': array([0, 1, 2, 3])}
df = pd.DataFrame(d)
df
Out[18]: 
   A          B  C  D
0  1 2013-03-01  0  0
1  1 2013-03-01  1  1
2  1 2013-03-01  2  2
3  1 2013-03-01  3  3
df.dtypes
Out[19]: 
A             int64
B    datetime64[ns]
C             int64
D             int64
dtype: object
df.A
Out[20]: 
0    1
1    1
2    1
3    1
Name: A, dtype: int64

2. 查看数据

data.head()
Out[21]: 
                   A         B         C         D
2016-03-01  1.544355 -0.822489  0.982824  1.031441
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474
2016-03-05 -1.361461  1.683070  0.042079 -0.805467
data.tail()
Out[22]: 
                   A         B         C         D
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474
2016-03-05 -1.361461  1.683070  0.042079 -0.805467
2016-03-06 -0.058033  0.189439 -1.066178 -1.374988
data.index
Out[23]: 
DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
               '2016-03-05', '2016-03-06'],
              dtype='datetime64[ns]', freq='D')
data.columns
Out[24]: Index(['A', 'B', 'C', 'D'], dtype='object')
data.describe()
Out[25]: 
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.334103 -0.324157 -0.066658 -0.398460
std    1.341771  1.139447  0.691769  0.859644
min   -1.615765 -1.594909 -1.066178 -1.374988
25%   -1.402307 -0.853030 -0.398692 -0.801005
50%   -0.709747 -0.679665 -0.026956 -0.707609
75%    0.662149  0.007869  0.188202 -0.026794
max    1.544355  1.683070  0.982824  1.031441

3. 排序

data.sort_index(axis=1)
Out[27]: 
                   A         B         C         D
2016-03-01  1.544355 -0.822489  0.982824  1.031441
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474
2016-03-05 -1.361461  1.683070  0.042079 -0.805467
2016-03-06 -0.058033  0.189439 -1.066178 -1.374988
data.sort_index(axis=0,ascending = False)
Out[28]: 
                   A         B         C         D
2016-03-06 -0.058033  0.189439 -1.066178 -1.374988
2016-03-05 -1.361461  1.683070  0.042079 -0.805467
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619
2016-03-01  1.544355 -0.822489  0.982824  1.031441
data.sort_values(by='A')
Out[29]: 
                   A         B         C         D
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474
2016-03-05 -1.361461  1.683070  0.042079 -0.805467
2016-03-06 -0.058033  0.189439 -1.066178 -1.374988
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
2016-03-01  1.544355 -0.822489  0.982824  1.031441

选择数据

data['A']
Out[30]: 
2016-03-01    1.544355
2016-03-02   -1.615765
2016-03-03    0.902210
2016-03-04   -1.415922
2016-03-05   -1.361461
2016-03-06   -0.058033
Freq: D, Name: A, dtype: float64
data[2:4]
Out[31]: 
                   A         B         C         D
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474
data.loc[:,['B','C']]
Out[32]: 
                   B         C
2016-03-01 -0.822489  0.982824
2016-03-02 -1.594909  0.236910
2016-03-03 -0.863211 -0.095991
2016-03-04 -0.536842 -0.499592
2016-03-05  1.683070  0.042079
2016-03-06  0.189439 -1.066178
data.loc['20160302':'20160305',['B','C']]
Out[33]: 
                   B         C
2016-03-02 -1.594909  0.236910
2016-03-03 -0.863211 -0.095991
2016-03-04 -0.536842 -0.499592
2016-03-05  1.683070  0.042079
data.at[pd.Timestamp('20160302'),'B']
Out[34]: -1.594908560945633
data.iloc[1]
Out[35]: 
A   -1.615765
B   -1.594909
C    0.236910
D   -0.787619
Name: 2016-03-02 00:00:00, dtype: float64
data.iloc[1:3,2:4]
Out[36]: 
                   C         D
2016-03-02  0.236910 -0.787619
2016-03-03 -0.095991 -0.627598
data.iat[1,1]
Out[37]: -1.594908560945633
data[data.A>0]
Out[38]: 
                   A         B         C         D
2016-03-01  1.544355 -0.822489  0.982824  1.031441
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598
data2 = data.copy()
tag = ['a']*2 +['b']*2+['c']*2
data2['Tag'] = tag
data2
Out[42]: 
                   A         B         C         D Tag
2016-03-01  1.544355 -0.822489  0.982824  1.031441   a
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619   a
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598   b
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474   b
2016-03-05 -1.361461  1.683070  0.042079 -0.805467   c
2016-03-06 -0.058033  0.189439 -1.066178 -1.374988   c
data2[data2.Tag.isin(['a','b'])]
Out[43]: 
                   A         B         C         D Tag
2016-03-01  1.544355 -0.822489  0.982824  1.031441   a
2016-03-02 -1.615765 -1.594909  0.236910 -0.787619   a
2016-03-03  0.902210 -0.863211 -0.095991 -0.627598   b
2016-03-04 -1.415922 -0.536842 -0.499592  0.173474   b

4. 修改元素

data.iat[0,0] = 100
data
Out[45]: 
                     A         B         C         D
2016-03-01  100.000000 -0.822489  0.982824  1.031441
2016-03-02   -1.615765 -1.594909  0.236910 -0.787619
2016-03-03    0.902210 -0.863211 -0.095991 -0.627598
2016-03-04   -1.415922 -0.536842 -0.499592  0.173474
2016-03-05   -1.361461  1.683070  0.042079 -0.805467
2016-03-06   -0.058033  0.189439 -1.066178 -1.374988
data.A = range(6)
data
Out[47]: 
            A         B         C         D
2016-03-01  0 -0.822489  0.982824  1.031441
2016-03-02  1 -1.594909  0.236910 -0.787619
2016-03-03  2 -0.863211 -0.095991 -0.627598
2016-03-04  3 -0.536842 -0.499592  0.173474
2016-03-05  4  1.683070  0.042079 -0.805467
2016-03-06  5  0.189439 -1.066178 -1.374988
data.B = 200

5. 处理缺失数据

创建缺失表格

df1 = data.reindex(index=dates[0:4],columns=list(data.columns) + ['E'])
df1
Out[54]: 
            A    B         C         D   E
2016-03-01  0  200  0.982824  1.031441 NaN
2016-03-02  1  200  0.236910 -0.787619 NaN
2016-03-03  2  200 -0.095991 -0.627598 NaN
2016-03-04  3  200 -0.499592  0.173474 NaN
df1.loc[dates[1:3],'E'] = 2
df1
Out[57]: 
            A    B         C         D    E
2016-03-01  0  200  0.982824  1.031441  NaN
2016-03-02  1  200  0.236910 -0.787619  2.0
2016-03-03  2  200 -0.095991 -0.627598  2.0
2016-03-04  3  200 -0.499592  0.173474  NaN

处理缺失值

df1.dropna()
Out[58]: 
            A    B         C         D    E
2016-03-02  1  200  0.236910 -0.787619  2.0
2016-03-03  2  200 -0.095991 -0.627598  2.0
df1.fillna(value=5)
Out[59]: 
            A    B         C         D    E
2016-03-01  0  200  0.982824  1.031441  5.0
2016-03-02  1  200  0.236910 -0.787619  2.0
2016-03-03  2  200 -0.095991 -0.627598  2.0
2016-03-04  3  200 -0.499592  0.173474  5.0
pd.isnull(df1)
Out[60]: 
                A      B      C      D      E
2016-03-01  False  False  False  False   True
2016-03-02  False  False  False  False  False
2016-03-03  False  False  False  False  False
2016-03-04  False  False  False  False   True

计算

df1.mean()
Out[61]: 
A      1.500000
B    200.000000
C      0.156037
D     -0.052576
E      2.000000
dtype: float64
df1.cumsum()
Out[62]: 
            A    B         C         D    E
2016-03-01  0  200  0.982824  1.031441  NaN
2016-03-02  1  400  1.219733  0.243822  2.0
2016-03-03  3  600  1.123742 -0.383776  4.0
2016-03-04  6  800  0.624150 -0.210302  NaN
df.sub(s,axis='index')

##apply用法
data.apply(lambda x:x.max()-x.min())
Out[65]: 
A    5.000000
B    0.000000
C    2.049002
D    2.406430
dtype: float64
def sum(x):
   ...:     print(type(x))
   ...:     return x.sum()
   ...: data.apply(sum)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Out[66]: 
A      15.000000
B    1200.000000
C      -0.399949
D      -2.390757
dtype: float64
s = pd.Series(np.random.randint(10,20,size=20))
s.value_counts()
Out[68]: 
15    4
13    3
12    3
17    2
16    2
14    2
10    2
18    1
11    1
dtype: int64
s.mode()
Out[69]: 
0    15
dtype: int64

6. 合并数据

df = pd.DataFrame(np.random.randn(10,4),columns=list('ABCD'))
df1 = pd.concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]])
df1
Out[74]: 
          A         B         C         D
0  0.143628  0.752585 -0.729864  0.316444
1 -1.259617  0.808502  0.926428  0.276563
2 -0.063887  0.363763  0.284600 -0.021021
3  0.184363 -1.349010 -1.854798  1.257927
4 -0.660593  0.200365 -0.360575 -1.201731
5  0.753510  1.137488 -1.060917 -1.703415
6  0.885780  1.736457  0.396292 -1.751817
7 -0.812100 -0.020186 -0.727871  0.950167
8  0.650730  0.039977 -0.855242 -0.569816
9  0.406987  1.787581  2.378510 -0.628562
df1 == df
Out[75]: 
      A     B     C     D
0  True  True  True  True
1  True  True  True  True
2  True  True  True  True
3  True  True  True  True
4  True  True  True  True
5  True  True  True  True
6  True  True  True  True
7  True  True  True  True
8  True  True  True  True
9  True  True  True  True
(df == df1).all().all()
Out[76]: True

#merge 等同于 SQL 中的 join(此处按 key 列连接)
left = pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','foo'],'lval':[4,5]})
pd.merge(left,right,on='key')
Out[80]: 
   key  lval_x  lval_y
0  foo       1       4
1  foo       1       5
2  foo       2       4
3  foo       2       5

#追加一行(Series 的索引 E 不在原有列中,因此结果会多出一列 E,原有各行的 E 为 NaN)
s = pd.Series(np.random.randint(1,5),index=list('ABCDE'))
df.append(s,ignore_index=True)
Out[84]: 
           A         B         C         D    E
0   0.143628  0.752585 -0.729864  0.316444  NaN
1  -1.259617  0.808502  0.926428  0.276563  NaN
2  -0.063887  0.363763  0.284600 -0.021021  NaN
3   0.184363 -1.349010 -1.854798  1.257927  NaN
4  -0.660593  0.200365 -0.360575 -1.201731  NaN
5   0.753510  1.137488 -1.060917 -1.703415  NaN
6   0.885780  1.736457  0.396292 -1.751817  NaN
7  -0.812100 -0.020186 -0.727871  0.950167  NaN
8   0.650730  0.039977 -0.855242 -0.569816  NaN
9   0.406987  1.787581  2.378510 -0.628562  NaN
10  2.000000  2.000000  2.000000  2.000000  2.0

7. 分类统计

分组计算

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
   ...:                           'foo', 'bar', 'foo', 'foo'],
   ...:                    'B' : ['one', 'one', 'two', 'three',
   ...:                            'two', 'two', 'one', 'three'],
   ...:                    'C' : np.random.randn(8),
   ...:                    'D' : np.random.randn(8)})
df
Out[86]: 
     A      B         C         D
0  foo    one -1.329037  0.818405
1  bar    one  0.823417 -1.160418
2  foo    two  1.123663 -1.225055
3  bar  three -0.630443 -0.780871
4  foo    two  0.904849  2.038818
5  bar    two  2.001044  0.430133
6  foo    one -0.224188 -0.212802
7  foo  three  0.274152 -1.200993
df.groupby('A').sum()
Out[88]: 
            C         D
A                      
bar  2.194018 -1.511156
foo  0.749439  0.218373
df.groupby(['A','B']).sum()
Out[89]: 
                  C         D
A   B                        
bar one    0.823417 -1.160418
    three -0.630443 -0.780871
    two    2.001044  0.430133
foo one   -1.553225  0.605603
    three  0.274152 -1.200993
    two    2.028512  0.813763

二. 快速入门(二)

1. 数据的整形

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Backend MacOSX is interactive backend. Turning interactive mode on.
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
   ...:                      'foo', 'foo', 'qux', 'qux'],
   ...:                     ['one', 'two', 'one', 'two',
   ...:                      'one', 'two', 'one', 'two']]))
   ...: index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
Out[94]: 
MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df
Out[96]: 
                     A         B
first second                    
bar   one     0.420161  0.121795
      two    -1.117661 -0.142775
baz   one    -0.192747 -0.441343
      two     0.720518  0.698498
foo   one     1.699954  0.678692
      two    -1.115893 -0.541516
qux   one     0.984971 -0.227957
      two    -0.919770  0.035390
stacked = df.stack()
stacked
Out[100]: 
first  second   
bar    one     A    0.420161
               B    0.121795
       two     A   -1.117661
               B   -0.142775
baz    one     A   -0.192747
               B   -0.441343
       two     A    0.720518
               B    0.698498
foo    one     A    1.699954
               B    0.678692
       two     A   -1.115893
               B   -0.541516
qux    one     A    0.984971
               B   -0.227957
       two     A   -0.919770
               B    0.035390
dtype: float64
stacked.index
Out[101]: 
MultiIndex([('bar', 'one', 'A'),
            ('bar', 'one', 'B'),
            ('bar', 'two', 'A'),
            ('bar', 'two', 'B'),
            ('baz', 'one', 'A'),
            ('baz', 'one', 'B'),
            ('baz', 'two', 'A'),
            ('baz', 'two', 'B'),
            ('foo', 'one', 'A'),
            ('foo', 'one', 'B'),
            ('foo', 'two', 'A'),
            ('foo', 'two', 'B'),
            ('qux', 'one', 'A'),
            ('qux', 'one', 'B'),
            ('qux', 'two', 'A'),
            ('qux', 'two', 'B')],
           names=['first', 'second', None])
stacked.unstack()
Out[102]: 
                     A         B
first second                    
bar   one     0.420161  0.121795
      two    -1.117661 -0.142775
baz   one    -0.192747 -0.441343
      two     0.720518  0.698498
foo   one     1.699954  0.678692
      two    -1.115893 -0.541516
qux   one     0.984971 -0.227957
      two    -0.919770  0.035390
stacked.unstack().unstack()
Out[103]: 
               A                   B          
second       one       two       one       two
first                                         
bar     0.420161 -1.117661  0.121795 -0.142775
baz    -0.192747  0.720518 -0.441343  0.698498
foo     1.699954 -1.115893  0.678692 -0.541516
qux     0.984971 -0.919770 -0.227957  0.035390

2. 数据透视表

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
    ...:                     'B' : ['A', 'B', 'C'] * 4,
    ...:                     'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    ...:                     'D' : np.random.randn(12),
    ...:                     'E' : np.random.randn(12)})
    ...: 
df
Out[105]: 
        A  B    C         D         E
0     one  A  foo -0.309637 -0.972758
1     one  B  foo  0.176570  0.015976
2     two  C  foo -1.031974  1.200191
3   three  A  bar -0.216964 -0.403420
4     one  B  bar -0.117683  0.420872
5     one  C  bar -1.114209 -0.184610
6     two  A  foo  1.391508  1.030261
7   three  B  foo -0.217724  0.030135
8     one  C  foo -0.398388  1.767420
9     one  A  bar -0.101332  0.122136
10    two  B  bar -0.717255 -1.078189
11  three  C  bar  0.198217 -0.675727
df.pivot_table(values=['D'],index=['A','B'],columns=['C'])
Out[107]: 
                D          
C             bar       foo
A     B                    
one   A -0.101332 -0.309637
      B -0.117683  0.176570
      C -1.114209 -0.398388
three A -0.216964       NaN
      B       NaN -0.217724
      C  0.198217       NaN
two   A       NaN  1.391508
      B -0.717255       NaN
      C       NaN -1.031974
df.pivot_table(values=['E'],index=['A'],columns=['C'])
Out[108]: 
              E          
C           bar       foo
A                        
one    0.119466  0.270213
three -0.539573  0.030135
two   -1.078189  1.115226
df[df.A=='one']
Out[109]: 
     A  B    C         D         E
0  one  A  foo -0.309637 -0.972758
1  one  B  foo  0.176570  0.015976
4  one  B  bar -0.117683  0.420872
5  one  C  bar -1.114209 -0.184610
8  one  C  foo -0.398388  1.767420
9  one  A  bar -0.101332  0.122136
df[df.A=='one'].groupby('C').mean()
Out[110]: 
            D         E
C                      
bar -0.444408  0.119466
foo -0.177151  0.270213

3. 时间序列

rng = pd.date_range('20160301', periods=600, freq='s')
rng
Out[112]: 
DatetimeIndex(['2016-03-01 00:00:00', '2016-03-01 00:00:01',
               '2016-03-01 00:00:02', '2016-03-01 00:00:03',
               '2016-03-01 00:00:04', '2016-03-01 00:00:05',
               '2016-03-01 00:00:06', '2016-03-01 00:00:07',
               '2016-03-01 00:00:08', '2016-03-01 00:00:09',
               ...
               '2016-03-01 00:09:50', '2016-03-01 00:09:51',
               '2016-03-01 00:09:52', '2016-03-01 00:09:53',
               '2016-03-01 00:09:54', '2016-03-01 00:09:55',
               '2016-03-01 00:09:56', '2016-03-01 00:09:57',
               '2016-03-01 00:09:58', '2016-03-01 00:09:59'],
              dtype='datetime64[ns]', length=600, freq='S')
s = pd.Series(np.random.randint(0,500,len(rng)),index=rng)
s
Out[115]: 
2016-03-01 00:00:00     41
2016-03-01 00:00:01    497
2016-03-01 00:00:02    226
2016-03-01 00:00:03    191
2016-03-01 00:00:04    140
                      ... 
2016-03-01 00:09:55     61
2016-03-01 00:09:56    286
2016-03-01 00:09:57    207
2016-03-01 00:09:58    430
2016-03-01 00:09:59    111
Freq: S, Length: 600, dtype: int64
rng = pd.period_range('2000Q1','2006Q1',freq='Q')
rng
Out[119]: 
PeriodIndex(['2000Q1', '2000Q2', '2000Q3', '2000Q4', '2001Q1', '2001Q2',
             '2001Q3', '2001Q4', '2002Q1', '2002Q2', '2002Q3', '2002Q4',
             '2003Q1', '2003Q2', '2003Q3', '2003Q4', '2004Q1', '2004Q2',
             '2004Q3', '2004Q4', '2005Q1', '2005Q2', '2005Q3', '2005Q4',
             '2006Q1'],
            dtype='period[Q-DEC]', freq='Q-DEC')
rng.to_timestamp()
Out[120]: 
DatetimeIndex(['2000-01-01', '2000-04-01', '2000-07-01', '2000-10-01',
               '2001-01-01', '2001-04-01', '2001-07-01', '2001-10-01',
               '2002-01-01', '2002-04-01', '2002-07-01', '2002-10-01',
               '2003-01-01', '2003-04-01', '2003-07-01', '2003-10-01',
               '2004-01-01', '2004-04-01', '2004-07-01', '2004-10-01',
               '2005-01-01', '2005-04-01', '2005-07-01', '2005-10-01',
               '2006-01-01'],
              dtype='datetime64[ns]', freq='QS-OCT')
pd.Timestamp('20160301') - pd.Timestamp('20160201')
Out[121]: Timedelta('29 days 00:00:00')
pd.Timestamp('20160301') + pd.Timedelta(days=5)
Out[122]: Timestamp('2016-03-06 00:00:00')

4. 类别数据

df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
    ...: df
Out[123]: 
   id raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         e
df["grade"] = df["raw_grade"].astype("category")
    ...: df
Out[124]: 
   id raw_grade grade
0   1         a     a
1   2         b     b
2   3         b     b
3   4         a     a
4   5         a     a
5   6         e     e
df.grade
Out[125]: 
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
df.grade.cat.categories
Out[126]: Index(['a', 'b', 'e'], dtype='object')
df.grade.cat.categories = ['very good','good','bad']
df
Out[128]: 
   id raw_grade      grade
0   1         a  very good
1   2         b       good
2   3         b       good
3   4         a  very good
4   5         a  very good
5   6         e        bad
df.sort_values(by='grade',ascending=True)
Out[129]: 
   id raw_grade      grade
0   1         a  very good
3   4         a  very good
4   5         a  very good
1   2         b       good
2   3         b       good
5   6         e        bad

5. 数据可视化

s = pd.Series(np.random.randn(1000), index=pd.date_range('20000101', periods=1000))
    ...: s = s.cumsum()
s.plot()
Out[131]: <AxesSubplot:>

Python数据科学包(二)----- Pandas快速入门

6. 数据读写

df = pd.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))
df.to_csv('data.csv')
%ls
data.csv
数据科学包/
pd.read_csv('data.csv')
Out[135]: 
    Unnamed: 0         A         B         C         D
0            0 -1.999558 -1.370733 -0.907842  0.674820
1            1  0.228398  0.648693 -0.356987  0.890468
2            2 -0.448148 -1.557300 -0.030299 -1.054521
3            3  0.527413 -0.513595 -1.363460  0.121549
4            4  1.516076  0.843181  0.327307 -0.327884
..         ...       ...       ...       ...       ...
95          95 -0.116893  0.698820 -0.129136  0.761577
96          96 -0.357027 -1.630429 -0.952050  0.994391
97          97  0.437282 -0.107047  1.177280  0.014935
98          98 -1.606144  1.957082 -1.210236 -0.040398
99          99  1.151170  2.242048 -0.506228  1.706791
[100 rows x 5 columns]
pd.read_csv('data.csv',index_col=0)
Out[136]: 
           A         B         C         D
0  -1.999558 -1.370733 -0.907842  0.674820
1   0.228398  0.648693 -0.356987  0.890468
2  -0.448148 -1.557300 -0.030299 -1.054521
3   0.527413 -0.513595 -1.363460  0.121549
4   1.516076  0.843181  0.327307 -0.327884
..       ...       ...       ...       ...
95 -0.116893  0.698820 -0.129136  0.761577
96 -0.357027 -1.630429 -0.952050  0.994391
97  0.437282 -0.107047  1.177280  0.014935
98 -1.606144  1.957082 -1.210236 -0.040398
99  1.151170  2.242048 -0.506228  1.706791
[100 rows x 4 columns]

未经允许不得转载:作者:756-周同学, 转载或复制请以 超链接形式 并注明出处 拜师资源博客
原文地址:《Python数据科学包(二)—— Pandas快速入门》 发布于2020-07-27

分享到:
赞(0) 打赏

评论 抢沙发

评论前必须登录!

  注册



长按图片转发给朋友

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏

Vieu3.3主题
专业打造轻量级个人企业风格博客主题!专注于前端开发,全站响应式布局自适应模板。

登录

忘记密码 ?

您也可以使用第三方帐号快捷登录

Q Q 登 录
微 博 登 录