1338-代同学

# 索引

## 多级索引（MultiIndex）

### 创建多级索引

#### Series数据结构

import numpy as np
import pandas as pd

# Two aligned label lists: outer letters and inner numbers.
a = [['a', 'a', 'a', 'b', 'b', 'c', 'c'], [1, 2, 3, 1, 2, 2, 3]]
# zip pairs the two lists element-wise into (letter, number) tuples.
t = list(zip(a[0], a[1]))
print(t)

# Build a two-level index from the tuples and attach random data.
index = pd.MultiIndex.from_tuples(t, names=['level1', 'level2'])
s = pd.Series(np.random.rand(7), index=index)
print(s)


[('a', 1), ('a', 2), ('a', 3), ('b', 1), ('b', 2), ('c', 2), ('c', 3)]
level1  level2
a       1         0.924655
2         0.099581
3         0.954186
b       1         0.769564
2         0.953621
c       2         0.893699
3         0.949448
dtype: float64


#### DataFrame数据结构

import numpy as np
import pandas as pd

# 4x3 random frame with a two-level row index AND a two-level column index.
df=pd.DataFrame(np.random.randint(1,10,(4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['one','one','two'],['blue','red','blue']])
df.index.names=['row-1','row-2']
# BUG FIX: the original assigned to df.index.names twice, clobbering the
# row names; the second assignment was meant to name the COLUMN levels
# (the printed output shows 'col-1'/'col-2' as column-level names).
df.columns.names=['col-1','col-2']
print(df)


col-1        one      two
col-2       blue red blue
row-1 row-2
a     1        9   5    5
2        1   1    9
b     1        1   8    2
2        7   2    9


### 索引交换

df2=df.swaplevel('row-1','row-2')


col-1        one      two
col-2       blue red blue
row-2 row-1
1     a        9   5    5
2     a        1   1    9
1     b        1   8    2
2     b        7   2    9


### 按索引层次统计

#### 按一级索引求和

df.sum(level=0)


col-1  one      two
col-2 blue red blue
row-1
a       10   6   14
b        8  10   11


#### 按二级索引求和

df.sum(level=1)


col-1  one      two
col-2 blue red blue
row-2
1       10  13    7
2        8   3   18


### 索引与列的转换

import numpy as np
import pandas as pd

# Plain flat-indexed frame used for the set_index/reset_index examples.
data_cols = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': [0, 1, 2, 0, 1, 2, 3],
}
df = pd.DataFrame(data_cols)

print(df)

a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3


#### 将c设置为索引值

print(df.set_index('c'))


     a  b  d
c
one  0  7  0
one  1  6  1
one  2  5  2
two  3  4  0
two  4  3  1
two  5  2  2
two  6  1  3


#### 生成二级索引，c为第一级，d为第二级

# Build a two-level row index: 'c' as the outer level, 'd' as the inner.
df2=df.set_index(['c','d'])
print(df2)


       a  b
c   d
one 0  0  7
1  1  6
2  2  5
two 0  3  4
1  4  3
2  5  2
3  6  1


#### 将多级索引重新转换为平面DataFrame

print(df2.reset_index())


     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1


#### 此时顺序发生了变化，可以重新排序一下

print(df2.reset_index().sort_index('columns'))


   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3


# 分组计算

:root { --mermaid-font-family: "trebuchet ms", verdana, arial;}

import numpy as np
import pandas as pd

# Sample frame: two categorical grouping keys plus two random data columns.
df = pd.DataFrame(
    dict(key1=['a', 'a', 'b', 'b', 'a'],
         key2=['one', 'two', 'one', 'one', 'two'],
         data1=np.random.randint(1, 10, 5),
         data2=np.random.randint(1, 10, 5)))
print(df)

key1 key2  data1  data2
0    a  one      4      3
1    a  two      7      8
2    b  one      3      6
3    b  one      4      9
4    a  two      2      2


## 直接分组

print(df['data1'].groupby(df['key1']).mean())


key1
a    4.333333
b    3.500000
Name: data1, dtype: float64


## 多层列表分组

print(df['data1'].groupby([df['key1'],df['key2']]).sum())


key1  key2
a     one     4
two     9
b     one     7
Name: data1, dtype: int32


## 按python迭代器协议分组

# A GroupBy object is iterable: it yields (group name, sub-frame) pairs.
# FIX: the loop body lost its indentation in the original paste.
for name, group in df.groupby('key1'):
    print(name)
    print(group)


a
key1 key2  data1  data2
0    a  one      4      3
1    a  two      7      8
4    a  two      2      2
b
key1 key2  data1  data2
2    b  one      3      6
3    b  one      4      9


## 通过字典进行分组

# 5x5 random frame: letter columns, people's names as row labels.
people = ['Alice', 'Bob', 'Candy', 'Dark', 'Emily']
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
                  index=people,
                  columns=list('abcde'))

a  b  c  d  e
Alice  4  4  9  6  6
Bob    2  6  4  2  9
Candy  6  8  4  4  4
Dark   3  1  8  2  1
Emily  6  6  5  5  8


# Map each column to a colour, then group the COLUMNS by that mapping.
# NOTE(review): axis=1 groupby is deprecated in pandas >= 2.1 — verify
# the pandas version before reusing this pattern.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'orange', 'e': 'blue'}
grouped = df.groupby(mapping, axis=1)
print(grouped.sum())
print(grouped.count())
print(grouped.size())    # number of columns in each colour group


       blue  orange  red
Alice    15       6    8
Bob      13       2    8
Candy     8       4   14
Dark      9       2    4
Emily    13       5   12

blue  orange  red
Alice     2       1    2
Bob       2       1    2
Candy     2       1    2
Dark      2       1    2
Emily     2       1    2

blue      2
orange    1
red       2
dtype: int64


## 通过函数进行分组

# Fresh 5x5 random frame (same layout as above) for function-based grouping.
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
                  columns=[c for c in 'abcde'],
                  index='Alice Bob Candy Dark Emily'.split())

a  b  c  d  e
Alice  3  7  3  7  8
Bob    7  3  4  6  4
Candy  6  2  7  7  3
Dark   4  6  6  8  5
Emily  9  5  8  2  6


def _dummy_group(idx):
    """Identity grouping key; prints each label it is called with."""
    # FIX: the function body lost its indentation in the original paste.
    print(idx)
    return idx

# Grouping by a function applies it to every row label (axis=0) or
# column label (axis=1) to produce the group keys.
print(df.groupby(_dummy_group))
print(df.groupby(_dummy_group, axis=1))

Alice
Bob
Candy
Dark
Emily

a
b
c
d
e


# Group rows by len(label): names of equal length fall in the same group.
grouped = df.groupby(len)
print(grouped.sum())
print(grouped.size())
print(grouped.count())


    a   b   c   d   e
3   7   3   4   6   4
4   4   6   6   8   5
5  18  14  18  16  17
3    1
4    1
5    3
dtype: int64
a  b  c  d  e
3  1  1  1  1  1
4  1  1  1  1  1
5  3  3  3  3  3


## 多级索引数据根据索引级别来分组

# Two-level column index: country on the outside, letter 'index' inside.
country_row = ['China', 'USA', 'China', 'USA', 'China']
letter_row = ['A', 'A', 'B', 'C', 'B']
columns = pd.MultiIndex.from_arrays([country_row, letter_row],
                                    names=['country', 'index'])
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
print(df)

country China USA China USA China
index       A   A     B   C     B
0           1   7     3   9     7
1           9   3     4   5     8
2           4   2     7   5     7
3           4   4     6   6     4
4           8   9     4   6     3


# Group the columns by a named index level instead of a mapping/function.
# NOTE(review): axis=1 groupby is deprecated in pandas >= 2.1.
print(df.groupby(level='country', axis=1).count())
print(df.groupby(level='country', axis=1).sum())
print(df.groupby(level='index', axis=1).count())

country  China  USA
0            3    2
1            3    2
2            3    2
3            3    2
4            3    2
country  China  USA
0           11   16
1           21    8
2           18    7
3           14   10
4           15   15
index  A  B  C
0      2  2  1
1      2  2  1
2      2  2  1
3      2  2  1
4      2  2  1


# 数据聚合

## 内置聚合函数

# Frame for the aggregation examples; describe() is itself a built-in
# bundle of aggregations over the numeric columns.
df = pd.DataFrame(dict(key1=['a', 'a', 'b', 'b', 'a'],
                       key2=['one', 'two', 'one', 'two', 'one'],
                       data1=np.random.randint(1, 10, 5),
                       data2=np.random.randint(1, 10, 5)))
print(df.describe())


  key1 key2  data1  data2
0    a  one      3      9
1    a  two      5      1
2    b  one      1      2
3    b  two      7      7
4    a  one      6      7
data1    data2
count  5.000000  5.00000
mean   4.400000  5.20000
std    2.408319  3.49285
min    1.000000  1.00000
25%    3.000000  2.00000
50%    5.000000  7.00000
75%    6.000000  7.00000
max    7.000000  9.00000


## 自定义聚合函数

def peak_verbose(s):
    """Range (max - min) of a Series; also prints the argument's type."""
    # FIX: both function bodies lost their indentation in the original paste.
    print(type(s))
    return s.max() - s.min()

def peak(s):
    """Range (max - min) of a Series."""
    return s.max() - s.min()


# agg() applies the custom function to every column of every group.
grouped = df.groupby('key1')
print(grouped.agg(peak_verbose))


  key1 key2  data1  data2
0    a  one      8      8
1    a  two      8      8
2    b  one      1      7
3    b  two      1      8
4    a  one      5      5
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
data1  data2
key1
a         3      3
b         0      1


## 应用多个聚合函数（agg）

print(grouped['data1', 'data2'].agg(['mean', 'std', peak]))

data1                data2
mean       std peak  mean       std peak
key1
a        7  1.732051    3   7.0  1.732051    3
b        1  0.000000    0   7.5  0.707107    1


## 给不同的列应用不同的聚合函数

# Per-column aggregation spec: a list of functions for data1, one for data2.
d = {'data1': ['mean', peak, 'max', 'min'],
'data2': 'sum'}
print(grouped.agg(d))  # key1 becomes the row index of the result


     data1              data2
mean peak max min   sum
key1
a        7    3   8   5    21
b        1    0   1   1    15


## 重置索引

# Two equivalent ways to keep key1 as an ordinary column, not the index.
print(grouped.agg(d).reset_index())
print(df.groupby('key1', as_index=False).agg(d))


  key1 data1              data2
mean peak max min   sum
0    a     7    3   8   5    21
1    b     1    0   1   1    15
key1 data1              data2
mean peak max min   sum
0    a     7    3   8   5    21
1    b     1    0   1   1    15


## 分组运算和转换

import numpy as np
import pandas as pd

# Frame for the groupby-transform examples.
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randint(1, 10, 5),
})
df
key1 key2  data1  data2
0    a  one      2      7
1    a  two      1      4
2    b  one      2      2
3    b  two      3      9
4    a  one      5      1


### transform

# transform() returns a result aligned to the ORIGINAL rows: every row
# gets its own group's mean, prefixed and pasted back as new columns.
# NOTE(review): pushing np.mean through transform over the non-numeric
# key2 column raises in pandas >= 2.0 — verify the pandas version used.
k1_mean = df.groupby('key1').transform(np.mean).add_prefix('mean_')
df[k1_mean.columns] = k1_mean

key1 key2  data1  data2  mean_data1  mean_data2
0    a  one      2      7    2.666667         4.0
1    a  two      1      4    2.666667         4.0
2    b  one      2      2    2.500000         5.5
3    b  two      3      9    2.500000         5.5
4    a  one      5      1    2.666667         4.0


### 距平化

# Another named-row random frame, used for the demeaning example below.
names = ['Alice', 'Bob', 'Candy', 'Dark', 'Emily']
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
                  columns=list('abcde'), index=names)
df
a  b  c  d  e
Alice  3  5  2  8  5
Bob    3  8  1  5  6
Candy  6  4  4  7  1
Dark   6  9  5  3  7
Emily  1  7  8  3  1


def demean(s):
    """Return s with its own mean subtracted (the anomaly/deviation)."""
    # FIX: the function body lost its indentation in the original paste.
    return s - s.mean()

# Group the ROWS into 'one'/'two' buckets, then subtract each bucket's
# column means -> each group's demeaned rows sum to ~0 per column.
key = ['one', 'one', 'two', 'one', 'two']
demeaned = df.groupby(key).transform(demean)
demeaned

a         b         c         d    e
Alice -1.0 -2.333333 -0.666667  2.666667 -1.0
Bob   -1.0  0.666667 -1.666667 -0.333333  0.0
Candy  2.5 -1.500000 -2.000000  2.000000  0.0
Dark   2.0  1.666667  2.333333 -2.333333  1.0
Emily -2.5  1.500000  2.000000 -2.000000  0.0


### apply 函数

# Ten-row frame for the apply() examples.
df = pd.DataFrame({
    'key1': list('aabbaaabba'),
    'key2': ['one', 'two', 'one', 'two', 'one',
             'one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 10),
    'data2': np.random.randint(1, 10, 10),
})
df
key1 key2  data1  data2
0    a  one      3      7
1    a  two      6      5
2    b  one      9      8
3    b  two      9      7
4    a  one      7      9
5    a  one      9      2
6    a  two      4      6
7    b  one      6      7
8    b  two      6      8
9    a  one      5      6


def top(df, n=2, column='data1'):
    """Return the n rows of df with the largest values in `column`."""
    # FIX: the function body lost its indentation in the original paste.
    return df.sort_values(by=column, ascending=False)[:n]

top(df, n=5)
key1 key2  data1  data2
2    b  one      9      8
3    b  two      9      7
5    a  one      9      2
4    a  one      7      9
1    a  two      6      5


df.groupby('key1').apply(top)

key1 key2  data1  data2
key1
a    5    a  one      9      2
4    a  one      7      9
b    2    b  one      9      8
3    b  two      9      7


df.groupby('key1').apply(top, n=3, column='data2')

key1 key2  data1  data2
key1
a    4    a  one      7      9
0    a  one      3      7
6    a  two      4      6
b    2    b  one      9      8
8    b  two      6      8
3    b  two      9      7


### apply 应用示例：用不同的分组平均值填充空缺数据

# FIX: the original paste truncated the states list mid-expression and
# dropped the line that blanks out three entries; reconstructed so the
# shown output (NaNs, East/West means over the remaining states) holds.
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4    # first four East, last four West
data = pd.Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -1.730269
New York      1.183595
Vermont            NaN
Florida      -0.832253
Oregon        0.376810
California    0.636360
Idaho              NaN
dtype: float64


data.groupby(group_key).mean()

East   -0.459643
West    0.506585
dtype: float64


# Fill each missing value with the mean of its OWN group (East or West).
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

Ohio         -1.730269
New York      1.183595
Vermont      -0.459643
Florida      -0.832253
Oregon        0.376810
California    0.636360
Idaho         0.506585
dtype: float64


# 数据IO

• 索引：将一个列或多个列读取出来构成 DataFrame，其中涉及是否从文件中读取索引以及列名
• 类型推断和数据转换：包括用户自定义的转换以及缺失值标记
• 日期解析
• 迭代：针对大文件进行逐块迭代。这个是Pandas和Python原生的csv库的最大区别
• 不规整数据问题：跳过一些行，或注释等等

## 载入数据到 Pandas

• 索引：将一个列或多个列读取出来构成 DataFrame，其中涉及是否从文件中读取索引以及列名
• 类型推断和数据转换：包括用户自定义的转换以及缺失值标记
• 日期解析
• 迭代：针对大文件进行逐块迭代。这个是Pandas和Python原生的csv库的最大区别
• 不规整数据问题：跳过一些行，或注释等等
注意先cd到文件所在目录

### 索引及列名

# Read a CSV; the first line is taken as the column header by default.
df = pd.read_csv('ex1.csv')
df

a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo

df

a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo

# 列名缺失
0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo

# 指定列名

a   b   c   d    msg
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo

# 指定行索引

a   b   c   d
msg
hello  1   2   3   4
world  5   6   7   8
foo    9  10  11  12

# 多层行索引

b   c   d
msg   a
hello 1   2   3   4
world 5   6   7   8
foo   9  10  11  12


### 处理不规则的分隔符

# 正则表达式

A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


### 缺失值处理

pd.read_csv('data/ex5.csv')

something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo

something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN

pd.read_csv('data/ex5.csv', na_values={'message': ['foo', 'NA'], 'something': ['two']})

something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       NaN  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN


### 逐块读取数据

pd.read_csv('data/ex6.csv', nrows=10)

one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q
5  1.817480  0.742273  0.419395 -2.251035   Q
6 -0.776764  0.935518 -0.332872 -1.875641   U
7 -0.913135  1.530624 -0.572657  0.477252   K
8  0.358480 -0.497572 -0.367016  0.507702   S
9 -1.740877 -1.160417 -1.637830  2.172201   G

# 统计每个 key 出现的次数

# FIX: the original paste lost the chunked-reader setup and the loop
# body; reconstructed the standard chunked value_counts accumulation
# that produces the float counts shown below.
tr = pd.read_csv('data/ex6.csv', chunksize=1000)   # iterator of DataFrames
key_count = pd.Series(dtype=float)
for pieces in tr:
    key_count = key_count.add(pieces['key'].value_counts(), fill_value=0)
key_count = key_count.sort_values(ascending=False)
key_count[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64


### 保存数据到磁盘

# Re-read the sample file before writing it back out.
df = pd.read_csv('data/ex5.csv')
df

something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo

# Default to_csv also writes the row index (appears as 'Unnamed: 0' when re-read).
df.to_csv('data/ex5_out.csv')
df

Unnamed: 0 something  a   b     c   d message
0           0       one  1   2   3.0   4     NaN
1           1       two  5   6   NaN   8   world
2           2     three  9  10  11.0  12     foo

# 不写索引
# index=False suppresses the row-index column.
df.to_csv('data/ex5_out.csv', index=False)
df

something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo

# Omit the column names — NOTE(review): the to_csv(..., header=False)
# call that belongs here was lost from the original paste.
df

one  1   2   3.0   4 Unnamed: 5
0    two  5   6   NaN   8      world
1  three  9  10  11.0  12        foo

# Use a custom field separator instead of a comma.
df.to_csv('data/ex5_out.csv', index=False, sep='|')
df

something|a|b|c|d|message
0            one|1|2|3.0|4|
1          two|5|6||8|world
2    three|9|10|11.0|12|foo

# Write only a subset of the columns, in the given order.
df.to_csv('data/ex5_out.csv', index=False, columns=['a', 'b', 'message'])
df

a   b message
0  1   2     NaN
1  5   6   world
2  9  10     foo


Vieu3.3主题

Q Q 登 录