python 多级索引表格合并 python 多重索引

转载

云端筑梦大师 2024-02-27 06:58:17

文章标签 python 多级索引表格合并 python 元组 sed 数据 文章分类 Python 后端开发

# http://pandas.pydata.org/pandas-docs/stable/advanced.html
# MultiIndex / Advanced Indexing
# pandas 0.22.0 
# http://pandas.pydata.org/pandas-docs/stable/advanced.html
# MultiIndex / Advanced Indexing
# pandas 0.22.0

import pandas as pd
import numpy as np
# NOTE(review): `tuples` is first assigned further down in this file, so running
# the file top to bottom raises NameError on this line. The shuffled order only
# shows up in the sorting section near the end, so this call presumably belongs
# just before that section -- confirm against the original notebook.
import random; random.shuffle(tuples)
import pandas as pd
import numpy as np
# NOTE(review): `tuples` is not defined at this point when the file is read top
# to bottom (NameError); this shuffle presumably ran later in the original
# notebook, right before the MultiIndex sorting examples -- confirm.
import random; random.shuffle(tuples)

# Hierarchical indexing (MultiIndex)  分层索引 多重索引

# 创建多重索引对象，如同标准的索引类，他们存放轴axis标签labels
# 创建方式：
# from a list of arrays -- using MultiIndex.from_arrays
# from an array of tuples -- using MultiIndex.from_tuples
# from a crossed set of iterables -- using MultiIndex.from_product 

arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
arrays
# *arrays
zip(*arrays)  # 这里的* 是“解开”list ， list[0] list[1] ……
list(zip(*arrays))
list(zip(arrays[0], arrays[1]))
tuples = list(zip(*arrays))
tuples
# Hierarchical indexing (MultiIndex)  分层索引 多重索引

# 创建多重索引对象，如同标准的索引类，他们存放轴axis标签labels
# 创建方式：
# from a list of arrays -- using MultiIndex.from_arrays
# from an array of tuples -- using MultiIndex.from_tuples
# from a crossed set of iterables -- using MultiIndex.from_product 

arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
arrays
# *arrays
zip(*arrays)  # 这里的* 是“解开”list ， list[0] list[1] ……
list(zip(*arrays))
list(zip(arrays[0], arrays[1]))
tuples = list(zip(*arrays))
tuples

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]






<zip at 0xf96f888>






[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]






[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]






[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

s = pd.Series(np.random.randn(8), index=index)
s
s = pd.Series(np.random.randn(8), index=index)
s

first  second
bar    one       0.271000
       two      -1.276230
baz    one      -1.018103
       two      -0.620292
foo    one       1.008070
       two       0.759145
qux    one      -2.141050
       two      -0.927688
dtype: float64

# 更简洁的方式, 当每个元素对都来自于可迭代对象时
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]  # 创建了与上面相同的index
pd.MultiIndex.from_product(iterables, names=['first', 'second'])  # 多重索引可以接受命名，默认为None
# 更简洁的方式, 当每个元素对都来自于可迭代对象时
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]  # 创建了与上面相同的index
pd.MultiIndex.from_product(iterables, names=['first', 'second'])  # 多重索引可以接受命名，默认为None

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

# 也可以直接在创建df或series时传入矩阵array 的 列表list
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
          np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
arrays  
# 也可以直接在创建df或series时传入矩阵array 的 列表list
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
          np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
arrays

[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
       dtype='<U3'),
 array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
       dtype='<U3')]

s = pd.Series(np.random.randn(8), index=arrays)
s
s = pd.Series(np.random.randn(8), index=arrays)
s

bar  one    0.817791
     two    0.510420
baz  one   -0.494160
     two   -0.529997
foo  one    0.641282
     two   -0.202762
qux  one    0.050320
     two    2.097300
dtype: float64

df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df

		0	1	2	3
bar	one	-2.090989	0.001052	1.467637	0.267938
two	1.224610	0.851894	0.765531	-0.505116
baz	one	1.444246	-0.247795	0.267462	-0.945641
two	0.836046	0.274732	0.530525	-0.560081
foo	one	-3.709465	-0.157089	0.608778	-0.003217
two	-0.848818	1.478306	-0.389401	-1.205956
qux	one	-1.069775	1.272440	-0.797613	-0.194223
two	1.597218	0.454815	-0.756022	0.481038

# 可以对不同的轴向（如行索引/行名或列索引/列名）设置
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
df
# 可以对不同的轴向（如行索引/行名或列索引/列名）设置
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
df

first	bar	baz	foo	qux
second	one	two	one	two	one	two	one	two
A	-1.608162	-0.007312	1.048244	-0.029907	-0.437866	-1.853398	2.026875	0.359521
B	1.207609	-0.272366	-0.530191	-0.689641	-0.244362	-1.476252	0.818493	0.353771
C	-0.369463	1.862253	-0.118297	-0.148326	1.147616	-1.389965	0.817716	0.787394

# 同时对两个方向设置index，注意index的长度与数据在不同方向上的长度
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])
# 同时对两个方向设置index，注意index的长度与数据在不同方向上的长度
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

				first	bar	baz	foo
	second	one	two	one	two	one	two
first	second
bar	one	0.020204	-0.549089	0.381830	0.326558	-1.420590	-1.551863
two	1.311775	2.294908	0.203981	1.381199	-0.743387	2.119027
baz	one	0.640856	1.089627	-1.463503	0.727607	-0.959549	-0.037316
two	-0.906859	-0.720702	0.862614	0.082066	0.209276	-0.391039
foo	one	-0.328704	-1.015117	0.279826	0.141166	-0.053601	-1.171920
two	0.342074	-0.196049	-0.387946	0.196228	-1.264932	0.144251

pd.Series(np.random.randn(8), index=tuples)  # 多重索引，相当于元组index
pd.Series(np.random.randn(8), index=tuples)  # 多重索引，相当于元组index

(bar, one)   -0.267177
(bar, two)   -0.239632
(baz, one)    1.212249
(baz, two)    0.289517
(foo, one)    1.311922
(foo, two)   -0.797733
(qux, one)   -1.395485
(qux, two)   -0.451327
dtype: float64

# 可以控制索引的显示方式，通过 pandas.set_option() 设置 display.multi_sparse 选项
pd.set_option('display.multi_sparse', False)
df
pd.set_option('display.multi_sparse', True)
df
# 可以控制索引的显示方式，通过 pandas.set_option() 设置 display.multi_sparse 选项
pd.set_option('display.multi_sparse', False)
df
pd.set_option('display.multi_sparse', True)
df

first	bar	bar	baz	baz	foo	foo	qux	qux
second	one	two	one	two	one	two	one	two
A	-1.608162	-0.007312	1.048244	-0.029907	-0.437866	-1.853398	2.026875	0.359521
B	1.207609	-0.272366	-0.530191	-0.689641	-0.244362	-1.476252	0.818493	0.353771
C	-0.369463	1.862253	-0.118297	-0.148326	1.147616	-1.389965	0.817716	0.787394

first	bar	baz	foo	qux
second	one	two	one	two	one	two	one	two
A	-1.608162	-0.007312	1.048244	-0.029907	-0.437866	-1.853398	2.026875	0.359521
B	1.207609	-0.272366	-0.530191	-0.689641	-0.244362	-1.476252	0.818493	0.353771
C	-0.369463	1.862253	-0.118297	-0.148326	1.147616	-1.389965	0.817716	0.787394

# Reconstructing the level labels 重建层级标签
# The method get_level_values will return a vector of the labels for each location at a particular level:
# get_level_values 方法返回指定层级的标签向量
index.get_level_values(0)  # 使用整数序号
index.get_level_values("second")  # 使用name
# Reconstructing the level labels 重建层级标签
# The method get_level_values will return a vector of the labels for each location at a particular level:
# get_level_values 方法返回指定层级的标签向量
index.get_level_values(0)  # 使用整数序号
index.get_level_values("second")  # 使用name

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')






Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

# Basic indexing on axis with MultiIndex # 在轴方向的基础索引
df['bar']
df['bar', 'one']
df['bar']['one']  # 不建议使用，链式
s['qux']
# Basic indexing on axis with MultiIndex # 在轴方向的基础索引
df['bar']
df['bar', 'one']
df['bar']['one']  # 不建议使用，链式
s['qux']

second	one	two
A	-1.608162	-0.007312
B	1.207609	-0.272366
C	-0.369463	1.862253

A   -1.608162
B    1.207609
C   -0.369463
Name: (bar, one), dtype: float64






A   -1.608162
B    1.207609
C   -0.369463
Name: one, dtype: float64






one    0.05032
two    2.09730
dtype: float64

# Defined Levels 指定层级
df.columns  # 原index
df[['foo','qux']].columns  # 切片后的结果，层级levels中的项目没有减少，labels减少了
# 这样做避免了重新计算层级，使切片保持高效
# Defined Levels 指定层级
df.columns  # 原index
df[['foo','qux']].columns  # 切片后的结果，层级levels中的项目没有减少，labels减少了
# 这样做避免了重新计算层级，使切片保持高效

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])






MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[2, 2, 3, 3], [0, 1, 0, 1]],
           names=['first', 'second'])

# 查看切片实际选择levels
df[['foo','qux']].columns.values
df[['foo','qux']].columns.get_level_values(0)  # 指定层级
# 查看切片实际选择levels
df[['foo','qux']].columns.values
df[['foo','qux']].columns.get_level_values(0)  # 指定层级

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
      dtype=object)






Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

# 用有效的used层级重建多重索引
df[['foo','qux']].columns.remove_unused_levels()
# 用有效的used层级重建多重索引
df[['foo','qux']].columns.remove_unused_levels()

MultiIndex(levels=[['foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['first', 'second'])

# Data alignment and using reindex 数据对齐和使用reindex
s
# Data alignment and using reindex 数据对齐和使用reindex
s

bar  one    0.817791
     two    0.510420
baz  one   -0.494160
     two   -0.529997
foo  one    0.641282
     two   -0.202762
qux  one    0.050320
     two    2.097300
dtype: float64

# 当两个index不同的对象计算时与一般的index一样
s + s[:-2]
s + s[::2]
# 当两个index不同的对象计算时与一般的index一样
s + s[:-2]
s + s[::2]

bar  one    1.635582
     two    1.020840
baz  one   -0.988319
     two   -1.059993
foo  one    1.282563
     two   -0.405523
qux  one         NaN
     two         NaN
dtype: float64






bar  one    1.635582
     two         NaN
baz  one   -0.988319
     two         NaN
foo  one    1.282563
     two         NaN
qux  one    0.100640
     two         NaN
dtype: float64

# reindex 可以被另外一个 multiindex 或者 元组的list 或 array 调用
index
index[:3]
s.reindex(index[:3])
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])
# reindex 可以被另外一个 multiindex 或者 元组的list 或 array 调用
index
index[:3]
s.reindex(index[:3])
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])






MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1], [0, 1, 0]],
           names=['first', 'second'])






first  second
bar    one       0.817791
       two       0.510420
baz    one      -0.494160
dtype: float64






foo  two   -0.202762
bar  one    0.817791
qux  one    0.050320
baz  one   -0.494160
dtype: float64

# Advanced indexing with hierarchical index
# 使用层次索引的高级索引方法
df = df.T
df
# Advanced indexing with hierarchical index
# 使用层次索引的高级索引方法
df = df.T
df

		A	B	C
first	second
bar	one	-1.608162	1.207609	-0.369463
two	-0.007312	-0.272366	1.862253
baz	one	1.048244	-0.530191	-0.118297
two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
two	-1.853398	-1.476252	-1.389965
qux	one	2.026875	0.818493	0.817716
two	0.359521	0.353771	0.787394

# .loc 定位
df.loc["bar"]
df.loc["bar", "two"]  # 返回了一个series（不是一行，而是“一列”），其索引是原df的列名
# .loc 定位
df.loc["bar"]
df.loc["bar", "two"]  # 返回了一个series（不是一行，而是“一列”），其索引是原df的列名

	A	B	C
second
one	-1.608162	1.207609	-0.369463
two	-0.007312	-0.272366	1.862253

A   -0.007312
B   -0.272366
C    1.862253
Name: (bar, two), dtype: float64

# loc 中使用切片，切片的值可以是元组
df.loc['baz':'foo']
df.loc[('baz', 'two'):('qux', 'one')]
df.loc[('baz', 'two'):'foo']
# loc 中使用切片，切片的值可以是元组
df.loc['baz':'foo']
df.loc[('baz', 'two'):('qux', 'one')]
df.loc[('baz', 'two'):'foo']

		A	B	C
first	second
baz	one	1.048244	-0.530191	-0.118297
two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
two	-1.853398	-1.476252	-1.389965

		A	B	C
first	second
baz	two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
two	-1.853398	-1.476252	-1.389965
qux	one	2.026875	0.818493	0.817716

		A	B	C
first	second
baz	two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
two	-1.853398	-1.476252	-1.389965

# 可以给loc传入元组或标签列表，取得不连续的索引
df.loc[[('bar', 'two'), ('qux', 'one')]]
# 可以给loc传入元组或标签列表，取得不连续的索引
df.loc[[('bar', 'two'), ('qux', 'one')]]

		A	B	C
first	second
bar	two	-0.007312	-0.272366	1.862253
qux	one	2.026875	0.818493	0.817716

# Using slicers 使用切片
# 可以用多重索引对象进行切片。可以用 切片值、 标签或标签列表、 布尔索引等选择器
# 可以用slice(None) 选择那一级的所有的内容。不用特别指定所有深度的级别，他们默认是slice(None)
# 注意：使用loc应该规定所有的轴方向，包括行index和列columns。
# 推荐方式：df.loc[(slice('A1','A3'),.....), :]   注意 冒号前面的逗号，逗号前表示行方向切片（选择器），逗号后面表示列方向切片（选择器）
# 不推荐：  df.loc[(slice('A1','A3'),.....)]  可能产生歧义
# Using slicers 使用切片
# 可以用多重索引对象进行切片。可以用 切片值、 标签或标签列表、 布尔索引等选择器
# 可以用slice(None) 选择那一级的所有的内容。不用特别指定所有深度的级别，他们默认是slice(None)
# 注意：使用loc应该规定所有的轴方向，包括行index和列columns。
# 推荐方式：df.loc[(slice('A1','A3'),.....), :]   注意 冒号前面的逗号，逗号前表示行方向切片（选择器），逗号后面表示列方向切片（选择器）
# 不推荐：  df.loc[(slice('A1','A3'),.....)]  可能产生歧义

def mklbl(prefix, n):
    """Build ``n`` labels by appending the numbers 0..n-1 to ``prefix``.

    Example: mklbl("a", 3) --> ['a0', 'a1', 'a2']
    """
    labels = []
    for number in range(n):
        # Both parts are converted with str(), matching "%s" formatting.
        labels.append(str(prefix) + str(number))
    return labels
def mklbl(prefix, n):
    """Return the list of labels ``prefix`` + index for index in 0..n-1.

    Example: mklbl("a", 3) --> ['a0', 'a1', 'a2']
    """
    head = lambda: str(prefix)  # re-evaluated per label, like "%s" formatting
    return [head() + str(position) for position in range(n)]

mklbl("a", 3)
mklbl("a", 3)

['a0', 'a1', 'a2']

miindex = pd.MultiIndex.from_product([mklbl('A',4),
                                      mklbl('B',2),
                                      mklbl('C',4),
                                      mklbl('D',2)])
miindex  # 由列表生成4重（4级）索引对象，共生成4*2*4*2=64行
miindex = pd.MultiIndex.from_product([mklbl('A',4),
                                      mklbl('B',2),
                                      mklbl('C',4),
                                      mklbl('D',2)])
miindex  # 由列表生成4重（4级）索引对象，共生成4*2*4*2=64行

MultiIndex(levels=[['A0', 'A1', 'A2', 'A3'], ['B0', 'B1'], ['C0', 'C1', 'C2', 'C3'], ['D0', 'D1']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])

micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'),
                                       ('b','foo'),('b','bah')],
                                      names=['lvl0', 'lvl1'])
micolumns  # 由元组生成2重索引对象，共4行，并对2重（两级）分别命名
micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'),
                                       ('b','foo'),('b','bah')],
                                      names=['lvl0', 'lvl1'])
micolumns  # 由元组生成2重索引对象，共4行，并对2重（两级）分别命名

MultiIndex(levels=[['a', 'b'], ['bah', 'bar', 'foo']],
           labels=[[0, 0, 1, 1], [2, 1, 2, 0]],
           names=['lvl0', 'lvl1'])

row_l = len(miindex)
col_l = len(micolumns)
dfmi = pd.DataFrame(np.arange(row_l * col_l).reshape((row_l, col_l)),
                        index=miindex,
                        columns=micolumns).sort_index().sort_index(axis=1)
dfmi 
row_l = len(miindex)
col_l = len(micolumns)
dfmi = pd.DataFrame(np.arange(row_l * col_l).reshape((row_l, col_l)),
                        index=miindex,
                        columns=micolumns).sort_index().sort_index(axis=1)
dfmi

			lvl0	a	b
			lvl1	bar	foo	bah	foo
A0	B0	C0	D0	1	0	3	2
D1	5	4	7	6
C1	D0	9	8	11	10
D1	13	12	15	14
C2	D0	17	16	19	18
D1	21	20	23	22
C3	D0	25	24	27	26
D1	29	28	31	30
B1	C0	D0	33	32	35	34
D1	37	36	39	38
C1	D0	41	40	43	42
D1	45	44	47	46
C2	D0	49	48	51	50
D1	53	52	55	54
C3	D0	57	56	59	58
D1	61	60	63	62
A1	B0	C0	D0	65	64	67	66
D1	69	68	71	70
C1	D0	73	72	75	74
D1	77	76	79	78
C2	D0	81	80	83	82
D1	85	84	87	86
C3	D0	89	88	91	90
D1	93	92	95	94
B1	C0	D0	97	96	99	98
D1	101	100	103	102
C1	D0	105	104	107	106
D1	109	108	111	110
C2	D0	113	112	115	114
D1	117	116	119	118
...	...	...	...	...	...	...	...
A2	B0	C1	D0	137	136	139	138
D1	141	140	143	142
C2	D0	145	144	147	146
D1	149	148	151	150
C3	D0	153	152	155	154
D1	157	156	159	158
B1	C0	D0	161	160	163	162
D1	165	164	167	166
C1	D0	169	168	171	170
D1	173	172	175	174
C2	D0	177	176	179	178
D1	181	180	183	182
C3	D0	185	184	187	186
D1	189	188	191	190
A3	B0	C0	D0	193	192	195	194
D1	197	196	199	198
C1	D0	201	200	203	202
D1	205	204	207	206
C2	D0	209	208	211	210
D1	213	212	215	214
C3	D0	217	216	219	218
D1	221	220	223	222
B1	C0	D0	225	224	227	226
D1	229	228	231	230
C1	D0	233	232	235	234
D1	237	236	239	238
C2	D0	241	240	243	242
D1	245	244	247	246
C3	D0	249	248	251	250
D1	253	252	255	254

64 rows × 4 columns

# Basic multi-index slicing using slices, lists, and labels.
dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :]  # slice('A1','A3') 相当于 ['A1':'A3']
# dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']) :]  # 错误 冒号前面必须有逗号

# Basic multi-index slicing using slices, lists, and labels.
dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :]  # slice('A1','A3') 相当于 ['A1':'A3']
# dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']) :]  # 错误 冒号前面必须有逗号

					lvl0	a	b
			lvl1	bar	foo	bah	foo
A1	B0	C1	D0	73	72	75	74
D1	77	76	79	78
C3	D0	89	88	91	90
D1	93	92	95	94
B1	C1	D0	105	104	107	106
D1	109	108	111	110
C3	D0	121	120	123	122
D1	125	124	127	126
A2	B0	C1	D0	137	136	139	138
D1	141	140	143	142
C3	D0	153	152	155	154
D1	157	156	159	158
B1	C1	D0	169	168	171	170
D1	173	172	175	174
C3	D0	185	184	187	186
D1	189	188	191	190
A3	B0	C1	D0	201	200	203	202
D1	205	204	207	206
C3	D0	217	216	219	218
D1	221	220	223	222
B1	C1	D0	233	232	235	234
D1	237	236	239	238
C3	D0	249	248	251	250
D1	253	252	255	254

# You can use a pd.IndexSlice to have a more natural syntax using : 
# rather than using slice(None)
# 使用pd.IndexSlice 可以用冒号 : 代替 slice(None)

idx = pd.IndexSlice
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]  # 默认必须是一个元组()来指定关键字，而且全选的话只能使用slice(None)
# You can use a pd.IndexSlice to have a more natural syntax using : 
# rather than using slice(None)
# 使用pd.IndexSlice 可以用冒号 : 代替 slice(None)

idx = pd.IndexSlice
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]  # 默认必须是一个元组()来指定关键字，而且全选的话只能使用slice(None)

			lvl0	a	b
			lvl1	foo	foo
A0	B0	C1	D0	8	10
D1	12	14
C3	D0	24	26
D1	28	30
B1	C1	D0	40	42
D1	44	46
C3	D0	56	58
D1	60	62
A1	B0	C1	D0	72	74
D1	76	78
C3	D0	88	90
D1	92	94
B1	C1	D0	104	106
D1	108	110
C3	D0	120	122
D1	124	126
A2	B0	C1	D0	136	138
D1	140	142
C3	D0	152	154
D1	156	158
B1	C1	D0	168	170
D1	172	174
C3	D0	184	186
D1	188	190
A3	B0	C1	D0	200	202
D1	204	206
C3	D0	216	218
D1	220	222
B1	C1	D0	232	234
D1	236	238
C3	D0	248	250
D1	252	254

# 一次执行复杂选取
dfmi.loc['A1', (slice(None), 'foo')]
# 一次执行复杂选取
dfmi.loc['A1', (slice(None), 'foo')]

		lvl0	a	b
		lvl1	foo	foo
B0	C0	D0	64	66
D1	68	70
C1	D0	72	74
D1	76	78
C2	D0	80	82
D1	84	86
C3	D0	88	90
D1	92	94
B1	C0	D0	96	98
D1	100	102
C1	D0	104	106
D1	108	110
C2	D0	112	114
D1	116	118
C3	D0	120	122
D1	124	126

dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]

			lvl0	a	b
			lvl1	foo	foo
A0	B0	C1	D0	8	10
D1	12	14
C3	D0	24	26
D1	28	30
B1	C1	D0	40	42
D1	44	46
C3	D0	56	58
D1	60	62
A1	B0	C1	D0	72	74
D1	76	78
C3	D0	88	90
D1	92	94
B1	C1	D0	104	106
D1	108	110
C3	D0	120	122
D1	124	126
A2	B0	C1	D0	136	138
D1	140	142
C3	D0	152	154
D1	156	158
B1	C1	D0	168	170
D1	172	174
C3	D0	184	186
D1	188	190
A3	B0	C1	D0	200	202
D1	204	206
C3	D0	216	218
D1	220	222
B1	C1	D0	232	234
D1	236	238
C3	D0	248	250
D1	252	254

# Using a boolean indexer you can provide selection related to the values. 
# 使用布尔索引
mask = dfmi[('a', 'foo')] > 200
mask
dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]
# Using a boolean indexer you can provide selection related to the values. 
# 使用布尔索引
mask = dfmi[('a', 'foo')] > 200
mask
dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]

A0  B0  C0  D0    False
            D1    False
        C1  D0    False
            D1    False
        C2  D0    False
            D1    False
        C3  D0    False
            D1    False
    B1  C0  D0    False
            D1    False
        C1  D0    False
            D1    False
        C2  D0    False
            D1    False
        C3  D0    False
            D1    False
A1  B0  C0  D0    False
            D1    False
        C1  D0    False
            D1    False
        C2  D0    False
            D1    False
        C3  D0    False
            D1    False
    B1  C0  D0    False
            D1    False
        C1  D0    False
            D1    False
        C2  D0    False
            D1    False
                  ...  
A2  B0  C1  D0    False
            D1    False
        C2  D0    False
            D1    False
        C3  D0    False
            D1    False
    B1  C0  D0    False
            D1    False
        C1  D0    False
            D1    False
        C2  D0    False
            D1    False
        C3  D0    False
            D1    False
A3  B0  C0  D0    False
            D1    False
        C1  D0    False
            D1     True
        C2  D0     True
            D1     True
        C3  D0     True
            D1     True
    B1  C0  D0     True
            D1     True
        C1  D0     True
            D1     True
        C2  D0     True
            D1     True
        C3  D0     True
            D1     True
Name: (a, foo), Length: 64, dtype: bool

			lvl0	a	b
			lvl1	foo	foo
A3	B0	C1	D1	204	206
C3	D0	216	218
D1	220	222
B1	C1	D0	232	234
D1	236	238
C3	D0	248	250
D1	252	254

# 指定轴参数axis，说明传入的切片在一个轴上
dfmi.loc(axis=0)[:, :, ['C1', 'C3']]
# 指定轴参数axis，说明传入的切片在一个轴上
dfmi.loc(axis=0)[:, :, ['C1', 'C3']]

					lvl0	a	b
			lvl1	bar	foo	bah	foo
A0	B0	C1	D0	9	8	11	10
D1	13	12	15	14
C3	D0	25	24	27	26
D1	29	28	31	30
B1	C1	D0	41	40	43	42
D1	45	44	47	46
C3	D0	57	56	59	58
D1	61	60	63	62
A1	B0	C1	D0	73	72	75	74
D1	77	76	79	78
C3	D0	89	88	91	90
D1	93	92	95	94
B1	C1	D0	105	104	107	106
D1	109	108	111	110
C3	D0	121	120	123	122
D1	125	124	127	126
A2	B0	C1	D0	137	136	139	138
D1	141	140	143	142
C3	D0	153	152	155	154
D1	157	156	159	158
B1	C1	D0	169	168	171	170
D1	173	172	175	174
C3	D0	185	184	187	186
D1	189	188	191	190
A3	B0	C1	D0	201	200	203	202
D1	205	204	207	206
C3	D0	217	216	219	218
D1	221	220	223	222
B1	C1	D0	233	232	235	234
D1	237	236	239	238
C3	D0	249	248	251	250
D1	253	252	255	254

# 可以使用这种指定轴方向的方式赋值
df2 = dfmi.copy()
df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10
df2
# 可以使用这种指定轴方向的方式赋值
df2 = dfmi.copy()
df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10
df2

					lvl0	a	b
			lvl1	bar	foo	bah	foo
A0	B0	C0	D0	1	0	3	2
D1	5	4	7	6
C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	17	16	19	18
D1	21	20	23	22
C3	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
B1	C0	D0	33	32	35	34
D1	37	36	39	38
C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	49	48	51	50
D1	53	52	55	54
C3	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
A1	B0	C0	D0	65	64	67	66
D1	69	68	71	70
C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	81	80	83	82
D1	85	84	87	86
C3	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
B1	C0	D0	97	96	99	98
D1	101	100	103	102
C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	113	112	115	114
D1	117	116	119	118
...	...	...	...	...	...	...	...
A2	B0	C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	145	144	147	146
D1	149	148	151	150
C3	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
B1	C0	D0	161	160	163	162
D1	165	164	167	166
C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	177	176	179	178
D1	181	180	183	182
C3	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
A3	B0	C0	D0	193	192	195	194
D1	197	196	199	198
C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	209	208	211	210
D1	213	212	215	214
C3	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
B1	C0	D0	225	224	227	226
D1	229	228	231	230
C1	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10
C2	D0	241	240	243	242
D1	245	244	247	246
C3	D0	-10	-10	-10	-10
D1	-10	-10	-10	-10

64 rows × 4 columns

# You can use a right-hand-side of an alignable object as well.
# 赋值语句的右侧也可以使用可对齐（alignable）的对象，赋值时会按索引自动对齐
df2 = dfmi.copy()
df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000
df2
# You can use a right-hand-side of an alignable object as well.
# 赋值语句的右侧也可以使用可对齐（alignable）的对象，赋值时会按索引自动对齐
df2 = dfmi.copy()
df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000
df2

			lvl0	a	b
			lvl1	bar	foo	bah	foo
A0	B0	C0	D0	1	0	3	2
D1	5	4	7	6
C1	D0	9000	8000	11000	10000
D1	13000	12000	15000	14000
C2	D0	17	16	19	18
D1	21	20	23	22
C3	D0	25000	24000	27000	26000
D1	29000	28000	31000	30000
B1	C0	D0	33	32	35	34
D1	37	36	39	38
C1	D0	41000	40000	43000	42000
D1	45000	44000	47000	46000
C2	D0	49	48	51	50
D1	53	52	55	54
C3	D0	57000	56000	59000	58000
D1	61000	60000	63000	62000
A1	B0	C0	D0	65	64	67	66
D1	69	68	71	70
C1	D0	73000	72000	75000	74000
D1	77000	76000	79000	78000
C2	D0	81	80	83	82
D1	85	84	87	86
C3	D0	89000	88000	91000	90000
D1	93000	92000	95000	94000
B1	C0	D0	97	96	99	98
D1	101	100	103	102
C1	D0	105000	104000	107000	106000
D1	109000	108000	111000	110000
C2	D0	113	112	115	114
D1	117	116	119	118
...	...	...	...	...	...	...	...
A2	B0	C1	D0	137000	136000	139000	138000
D1	141000	140000	143000	142000
C2	D0	145	144	147	146
D1	149	148	151	150
C3	D0	153000	152000	155000	154000
D1	157000	156000	159000	158000
B1	C0	D0	161	160	163	162
D1	165	164	167	166
C1	D0	169000	168000	171000	170000
D1	173000	172000	175000	174000
C2	D0	177	176	179	178
D1	181	180	183	182
C3	D0	185000	184000	187000	186000
D1	189000	188000	191000	190000
A3	B0	C0	D0	193	192	195	194
D1	197	196	199	198
C1	D0	201000	200000	203000	202000
D1	205000	204000	207000	206000
C2	D0	209	208	211	210
D1	213	212	215	214
C3	D0	217000	216000	219000	218000
D1	221000	220000	223000	222000
B1	C0	D0	225	224	227	226
D1	229	228	231	230
C1	D0	233000	232000	235000	234000
D1	237000	236000	239000	238000
C2	D0	241	240	243	242
D1	245	244	247	246
C3	D0	249000	248000	251000	250000
D1	253000	252000	255000	254000

64 rows × 4 columns

# Cross-section 断面
# xs 方法 另外提供了一个级别level参数 用来选择多重索引中的部分级别
# xs 当提供轴参数时，也可用于列的选择
df

# Cross-section 断面
# xs 方法 另外提供了一个级别level参数 用来选择多重索引中的部分级别
# xs 当提供轴参数时，也可用于列的选择
df

		A	B	C
first	second
bar	one	-1.608162	1.207609	-0.369463
two	-0.007312	-0.272366	1.862253
baz	one	1.048244	-0.530191	-0.118297
two	-0.029907	-0.689641	-0.148326
foo	one	-0.437866	-0.244362	1.147616
two	-1.853398	-1.476252	-1.389965
qux	one	2.026875	0.818493	0.817716
two	0.359521	0.353771	0.787394

df.xs("one", level="second")
df.loc[(slice(None), "one"), :]  # 使用切片得到同样的选择
df.xs("one", level="second")
df.loc[(slice(None), "one"), :]  # 使用切片得到同样的选择

	A	B	C
first
bar	-1.608162	1.207609	-0.369463
baz	1.048244	-0.530191	-0.118297
foo	-0.437866	-0.244362	1.147616
qux	2.026875	0.818493	0.817716

		A	B	C
first	second
bar	one	-1.608162	1.207609	-0.369463
baz	one	1.048244	-0.530191	-0.118297
foo	one	-0.437866	-0.244362	1.147616
qux	one	2.026875	0.818493	0.817716

df = df.T
df.xs('one', level='second', axis=1)  # 在列方向选择
df.loc[:, (slice(None),'one')]  # 使用切片方式
df = df.T
df.xs('one', level='second', axis=1)  # 在列方向选择
df.loc[:, (slice(None),'one')]  # 使用切片方式

first	bar	baz	foo	qux
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

first	bar	baz	foo	qux
second	one	one	one	one
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

# xs 方法使用多重关键字keys，关键字元组
df.xs(('one', 'bar'), level=('second', 'first'), axis=1)  # 关键字元组可以与级层顺序不一致，与给定level顺序一致
df.loc[:, ("bar", "one")]
# df.loc[:, ("one", "bar")]  # 错误，元组元素的顺序与级层顺序一致
# xs 方法使用多重关键字keys，关键字元组
df.xs(('one', 'bar'), level=('second', 'first'), axis=1)  # 关键字元组可以与级层顺序不一致，与给定level顺序一致
df.loc[:, ("bar", "one")]
# df.loc[:, ("one", "bar")]  # 错误，元组元素的顺序与级层顺序一致

first	bar
second	one
A	-1.608162
B	1.207609
C	-0.369463

A   -1.608162
B    1.207609
C   -0.369463
Name: (bar, one), dtype: float64

# drop_level=False 参数可以使xs保留选定的层级，而不是舍弃，这样的话，与切片得到的结果完全相同
df.xs('one', level='second', axis=1, drop_level=False)  # 默认 drop_level=True
df.loc[:, (slice(None) ,"one")]
# drop_level=False 参数可以使xs保留选定的层级，而不是舍弃，这样的话，与切片得到的结果完全相同
df.xs('one', level='second', axis=1, drop_level=False)  # 默认 drop_level=True
df.loc[:, (slice(None) ,"one")]

first	bar	baz	foo	qux
second	one	one	one	one
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

first	bar	baz	foo	qux
second	one	one	one	one
A	-1.608162	1.048244	-0.437866	2.026875
B	1.207609	-0.530191	-0.244362	0.818493
C	-0.369463	-0.118297	1.147616	0.817716

# Advanced reindexing and alignment 高级索引和定位
# level参数加在索引reindex和定位align方法中。可用于通过级层进行广播值。

midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']],
                         labels=[[1,1,0,0],[1,0,1,0]])  # 指定了labels,指定了层级间对应关系
midx
# Advanced reindexing and alignment 高级索引和定位
# level参数加在索引reindex和定位align方法中。可用于通过级层进行广播值。

midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']],
                         labels=[[1,1,0,0],[1,0,1,0]])  # 指定了labels,指定了层级间对应关系
midx

MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
           labels=[[1, 1, 0, 0], [1, 0, 1, 0]])

df = pd.DataFrame(np.random.randn(4,2), index=midx)
df
df = pd.DataFrame(np.random.randn(4,2), index=midx)
df

		0	1
one	y	-1.388542	-1.170054
x	0.240534	-0.656707
zero	y	-0.848351	-1.394871
x	-0.212248	0.051445

# 不同层级索引的广播计算
df2 = df.mean(level=1)
df2
df2 = df.mean(level=0)
df2
# 不同层级索引的广播计算
df2 = df.mean(level=1)
df2
df2 = df.mean(level=0)
df2

	0	1
y	-1.118447	-1.282462
x	0.014143	-0.302631

	0	1
one	-0.574004	-0.913381
zero	-0.530300	-0.671713

# 重构索引
df2.reindex(df.index, level=0)
# 重构索引
df2.reindex(df.index, level=0)

		0	1
one	y	-0.574004	-0.913381
x	-0.574004	-0.913381
zero	y	-0.530300	-0.671713
x	-0.530300	-0.671713

# 定位/对齐
df
df2
df.align(df2, level=0)
df_aligned, df2_aligned = df.align(df2, level=0)  # ??
df_aligned
df2_aligned
# 定位/对齐
df
df2
df.align(df2, level=0)
df_aligned, df2_aligned = df.align(df2, level=0)  # ??
df_aligned
df2_aligned

		0	1
one	y	-1.388542	-1.170054
x	0.240534	-0.656707
zero	y	-0.848351	-1.394871
x	-0.212248	0.051445

	0	1
one	-0.574004	-0.913381
zero	-0.530300	-0.671713

(               0         1
 one  y -1.388542 -1.170054
      x  0.240534 -0.656707
 zero y -0.848351 -1.394871
      x -0.212248  0.051445,                0         1
 one  y -0.574004 -0.913381
      x -0.574004 -0.913381
 zero y -0.530300 -0.671713
      x -0.530300 -0.671713)

		0	1
one	y	-1.388542	-1.170054
x	0.240534	-0.656707
zero	y	-0.848351	-1.394871
x	-0.212248	0.051445

		0	1
one	y	-0.574004	-0.913381
x	-0.574004	-0.913381
zero	y	-0.530300	-0.671713
x	-0.530300	-0.671713

# 交换层级 swaplevel()
df.swaplevel(0, 1, axis=0)
# 交换层级 swaplevel()
df.swaplevel(0, 1, axis=0)

		0	1
y	one	-1.388542	-1.170054
x	one	0.240534	-0.656707
y	zero	-0.848351	-1.394871
x	zero	-0.212248	0.051445

# reorder_levels 概括了 swaplevel 函数， 可以一步交换层级索引
df.reorder_levels([1,0], axis=0)  # 看上去结果与swaplevel一样，传入参数不一样
# reorder_levels 概括了 swaplevel 函数， 可以一步交换层级索引
df.reorder_levels([1,0], axis=0)  # 看上去结果与swaplevel一样，传入参数不一样

		0	1
y	one	-1.388542	-1.170054
x	one	0.240534	-0.656707
y	zero	-0.848351	-1.394871
x	zero	-0.212248	0.051445

# 多重索引排序
# 排序是为了高效的索引和切片。任何索引都可以使用sort_index

tuples
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))
s
# 多重索引排序
# 排序是为了高效的索引和切片。任何索引都可以使用sort_index

tuples
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))
s

[('baz', 'two'),
 ('qux', 'two'),
 ('bar', 'one'),
 ('foo', 'one'),
 ('qux', 'one'),
 ('baz', 'one'),
 ('foo', 'two'),
 ('bar', 'two')]






baz  two    1.365155
qux  two   -1.331225
bar  one   -1.512430
foo  one    0.468294
qux  one   -0.667115
baz  one   -0.502417
foo  two    1.685553
bar  two   -1.611271
dtype: float64

s.sort_index()
s.sort_index(level=1)  # 默认是level=0排序
s.sort_index(level=0)
s.sort_index()
s.sort_index(level=1)  # 默认是level=0排序
s.sort_index(level=0)

bar  one   -1.512430
     two   -1.611271
baz  one   -0.502417
     two    1.365155
foo  one    0.468294
     two    1.685553
qux  one   -0.667115
     two   -1.331225
dtype: float64






bar  one   -1.512430
baz  one   -0.502417
foo  one    0.468294
qux  one   -0.667115
bar  two   -1.611271
baz  two    1.365155
foo  two    1.685553
qux  two   -1.331225
dtype: float64






bar  one   -1.512430
     two   -1.611271
baz  one   -0.502417
     two    1.365155
foo  one    0.468294
     two    1.685553
qux  one   -0.667115
     two   -1.331225
dtype: float64

# level参数除了可以用整型序号，还可以使用层级的names
s.index.set_names(['L1', 'L2'], inplace=True)
s
s.sort_index(level="L1")
s.sort_index(level="L2")
# level参数除了可以用整型序号，还可以使用层级的names
s.index.set_names(['L1', 'L2'], inplace=True)
s
s.sort_index(level="L1")
s.sort_index(level="L2")

L1   L2 
baz  two    1.365155
qux  two   -1.331225
bar  one   -1.512430
foo  one    0.468294
qux  one   -0.667115
baz  one   -0.502417
foo  two    1.685553
bar  two   -1.611271
dtype: float64






L1   L2 
bar  one   -1.512430
     two   -1.611271
baz  one   -0.502417
     two    1.365155
foo  one    0.468294
     two    1.685553
qux  one   -0.667115
     two   -1.331225
dtype: float64






L1   L2 
bar  one   -1.512430
baz  one   -0.502417
foo  one    0.468294
qux  one   -0.667115
bar  two   -1.611271
baz  two    1.365155
foo  two    1.685553
qux  two   -1.331225
dtype: float64

# 可以指定排序的轴方向
df.T
df.T.sort_index(level=1, axis=1)
# 可以指定排序的轴方向
df.T
df.T.sort_index(level=1, axis=1)

			one	zero
	y	x	y	x
0	-1.388542	0.240534	-0.848351	-0.212248
1	-1.170054	-0.656707	-1.394871	0.051445

	zero	one	zero	one
	x	x	y	y
0	-0.212248	0.240534	-0.848351	-1.388542
1	0.051445	-0.656707	-1.394871	-1.170054

# 即使数据没有排序也可以索引，但是这样效率低下。
# 返回值是拷贝
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
                    'joe': ['x', 'x', 'z', 'y'],
                    'jolie': np.random.rand(4)})
dfm
# 即使数据没有排序也可以索引，但是这样效率低下。
# 返回值是拷贝
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
                    'joe': ['x', 'x', 'z', 'y'],
                    'jolie': np.random.rand(4)})
dfm

	jim	joe	jolie
0	0	x	0.844228
1	0	x	0.317508
2	1	z	0.413824
3	1	y	0.074264

dfm = dfm.set_index(["jim", "joe"])
dfm
dfm = dfm.set_index(["jim", "joe"])
dfm

		jolie
jim	joe
0	x	0.844228
x	0.317508
1	z	0.413824
y	0.074264

dfm.loc[(1, "z")]  # 会提示PerformanceWarning
dfm.loc[(1, "z")]  # 会提示PerformanceWarning

d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
  """Entry point for launching an IPython kernel.

		jolie
jim	joe
1	z	0.413824

# dfm.loc[(0,'y'):(1, 'z')]  # 错误 无法定位
dfm.index.is_lexsorted()
dfm.index.lexsort_depth
# dfm.loc[(0,'y'):(1, 'z')]  # 错误 无法定位
dfm.index.is_lexsorted()
dfm.index.lexsort_depth

False






1

dfm = dfm.sort_index()  # 索引排序，默认对所有层级
dfm
dfm = dfm.sort_index()  # 索引排序，默认对所有层级
dfm

		jolie
jim	joe
0	x	0.844228
x	0.317508
1	y	0.074264
z	0.413824

dfm.index.is_lexsorted()
dfm.index.lexsort_depth
dfm.loc[(0,'y'):(1, 'z')]

dfm.index.is_lexsorted()
dfm.index.lexsort_depth
dfm.loc[(0,'y'):(1, 'z')]

True






2

		jolie
jim	joe
1	y	0.074264
z	0.413824

# Take Methods take 方法 (拿、取)
# 与 numpy 的数组类似，pandas 的 Index、Series、DataFrame 也提供 take 方法
# 用来按给定的位置索引 indices 检索给定轴方向上的元素（indices 必须是整数列表或者整数数组，可以是负整数）

# 在性能方面，由于 take 方法只处理范围更窄的输入，它的速度比花式索引（fancy indexing）快得多


index = pd.Index(np.random.randint(0, 1000, 10))
index
# Take Methods take 方法 (拿、取)
# 与 numpy 的数组类似，pandas 的 Index、Series、DataFrame 也提供 take 方法
# 用来按给定的位置索引 indices 检索给定轴方向上的元素（indices 必须是整数列表或者整数数组，可以是负整数）

# 在性能方面，由于 take 方法只处理范围更窄的输入，它的速度比花式索引（fancy indexing）快得多


index = pd.Index(np.random.randint(0, 1000, 10))
index

Int64Index([523, 532, 386, 998, 832, 71, 965, 274, 389, 59], dtype='int64')

positions = [0, 9, 3]
index[positions]
index.take(positions)
positions = [0, 9, 3]
index[positions]
index.take(positions)

Int64Index([523, 59, 998], dtype='int64')






Int64Index([523, 59, 998], dtype='int64')

ser = pd.Series(np.random.randn(10))
ser
ser = pd.Series(np.random.randn(10))
ser

0    0.733196
1    0.975773
2   -0.261602
3   -0.055134
4    0.959253
5    1.189025
6   -0.434102
7    0.653628
8    0.248894
9   -0.203562
dtype: float64

ser.iloc[positions]
ser.take(positions)
ser.iloc[positions]
ser.take(positions)

0    0.733196
9   -0.203562
3   -0.055134
dtype: float64






0    0.733196
9   -0.203562
3   -0.055134
dtype: float64

# 对DataFrame，indices应该是一个一维 的列表或数组，规定了行或列的位置
frm = pd.DataFrame(np.random.randn(5, 3))
frm
# 对DataFrame，indices应该是一个一维 的列表或数组，规定了行或列的位置
frm = pd.DataFrame(np.random.randn(5, 3))
frm

	0	1	2
0	-0.722107	-1.758271	0.580805
1	0.555332	-0.856173	-1.143862
2	-0.636994	1.312340	0.046131
3	-0.154813	0.311931	0.933192
4	-1.277001	-0.144097	-1.871135

frm.take([1, 4, 3])  # 默认取行方向
frm.take([0, 2], axis=1)
frm.take([1, 4, 3])  # 默认取行方向
frm.take([0, 2], axis=1)

	0	1	2
1	0.555332	-0.856173	-1.143862
4	-1.277001	-0.144097	-1.871135
3	-0.154813	0.311931	0.933192

	0	2
0	-0.722107	0.580805
1	0.555332	-1.143862
2	-0.636994	0.046131
3	-0.154813	0.933192
4	-1.277001	-1.871135

# 注意：take方法不要用于布尔indices
arr = np.random.randn(10)
arr
# 注意：take方法不要用于布尔indices
arr = np.random.randn(10)
arr

array([-0.00772525,  0.95419469,  1.80636718, -2.46742236, -0.025503  ,
        0.44203691,  0.48626739, -0.74160374, -0.22453771,  0.8813933 ])

arr.take([False, False, True, True])  # 相当于取了[0,0,1,1]
arr[[0, 1]]
arr.take([False, False, True, True])  # 相当于取了[0,0,1,1]
arr[[0, 1]]

array([-0.00772525, -0.00772525,  0.95419469,  0.95419469])






array([-0.00772525,  0.95419469])

ser = pd.Series(np.random.randn(10))
ser
ser = pd.Series(np.random.randn(10))
ser

0    1.782426
1    0.531882
2   -0.339277
3    0.500497
4   -0.333816
5   -1.713753
6   -0.125252
7   -0.857100
8    0.385080
9    1.247962
dtype: float64

ser.take([False, False, True, True])  # 相当于取了[0,0,1,1]
ser.iloc[[0, 1]]
ser.take([False, False, True, True])  # 相当于取了[0,0,1,1]
ser.iloc[[0, 1]]

0    1.782426
0    1.782426
1    0.531882
1    0.531882
dtype: float64






0    1.782426
1    0.531882
dtype: float64

# Index Types 索引 index 对象
# 其他一些索引对象

# Index Types 索引 index 对象
# 其他一些索引对象

# CategoricalIndex   类别索引（基于 Categorical 分类数据的索引）
# 用于支持重复的索引
from pandas.api.types import CategoricalDtype
# CategoricalIndex   类别索引（基于 Categorical 分类数据的索引）
# 用于支持重复的索引
from pandas.api.types import CategoricalDtype

df = pd.DataFrame({'A': np.arange(6),
                   'B': list('aabbca')})
df
df = pd.DataFrame({'A': np.arange(6),
                   'B': list('aabbca')})
df

	A	B
0	0	a
1	1	a
2	2	b
3	3	b
4	4	c
5	5	a

df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
df
df.dtypes
df.B.cat.categories
df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
df
df.dtypes
df.B.cat.categories

	A	B
0	0	a
1	1	a
2	2	b
3	3	b
4	4	c
5	5	a

A       int32
B    category
dtype: object






Index(['c', 'a', 'b'], dtype='object')

df2 = df.set_index('B')
df2
df2.index
df2 = df.set_index('B')
df2
df2.index

	A
B
a	0
a	1
b	2
b	3
c	4
a	5

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

# 使用 __getitem__/.iloc/.loc 索引时，索引值 必须 在类别（categories）里面，否则操作会抛出异常
df2.loc['a']
df2.loc['a'].index  # 保留了全部的 CategoricalIndex 
df2.sort_index()  # 按照categoies给定的顺序排序
# 使用 __getitem__/.iloc/.loc 索引时，索引值 必须 在类别（categories）里面，否则操作会抛出异常
df2.loc['a']
df2.loc['a'].index  # 保留了全部的 CategoricalIndex 
df2.sort_index()  # 按照categoies给定的顺序排序

	A
B
a	0
a	1
a	5

CategoricalIndex(['a', 'a', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

	A
B
c	4
a	0
a	1
a	5
b	2
b	3

df2.groupby(level=0)
df2.groupby(level=0).sum()
df2.groupby(level=0).sum().index  # 也保留了category
df2.groupby(level=0)
df2.groupby(level=0).sum()
df2.groupby(level=0).sum().index  # 也保留了category

<pandas.core.groupby.DataFrameGroupBy object at 0x00000000117AC710>

	A
B
c	4
a	6
b	5

CategoricalIndex(['c', 'a', 'b'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')

df2.reindex(['a','e'])  # reindex 传入普通列表 返回一个 普通的 index
df2.reindex(['a','e']).index
df2.reindex(pd.Categorical(['a','e'],categories=list('abcde')))  # 指定catgorical index，即使原来的index没有的类别，也可以reindex
df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index
df2.reindex(['a','e'])  # reindex 传入普通列表 返回一个 普通的 index
df2.reindex(['a','e']).index
df2.reindex(pd.Categorical(['a','e'],categories=list('abcde')))  # 指定catgorical index，即使原来的index没有的类别，也可以reindex
df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index

	A
B
a	0.0
a	1.0
a	5.0
e	NaN

Index(['a', 'a', 'a', 'e'], dtype='object', name='B')

	A
B
a	0.0
a	1.0
a	5.0
e	NaN

CategoricalIndex(['a', 'a', 'a', 'e'], categories=['a', 'b', 'c', 'd', 'e'], ordered=False, name='B', dtype='category')

# 注意：变形和比较操作必须有同样的categories，否则报错

# 注意：变形和比较操作必须有同样的categories，否则报错

# Int64Index and RangeIndex
# Int64Index 是pandas基础索引。
# RangeIndex 是 Int64Index 的一个子类，现在作为所有 NDFrame 对象的默认索引。
# Int64Index and RangeIndex
# Int64Index 是pandas基础索引。
# RangeIndex 是 Int64Index 的一个子类，现在作为所有 NDFrame 对象的默认索引。

# Float64Index 当创建索引index时，传入浮点数或者浮点与整数混合值，就默认是Float64Index

indexf = pd.Index([1.5, 2, 3, 4.5, 5])
indexf
# Float64Index 当创建索引index时，传入浮点数或者浮点与整数混合值，就默认是Float64Index

indexf = pd.Index([1.5, 2, 3, 4.5, 5])
indexf

Float64Index([1.5, 2.0, 3.0, 4.5, 5.0], dtype='float64')

sf = pd.Series(range(5), index=indexf)
sf
sf = pd.Series(range(5), index=indexf)
sf

1.5    0
2.0    1
3.0    2
4.5    3
5.0    4
dtype: int64

# [] .loc 基于 label，整数将被转为浮点值
sf[1.5:4.5]
sf[1:4]
sf.loc[3]  # label，不是位置索引
# sf[3.2]  # 错误，传入值必须在labels中
# [] .loc 基于 label，整数将被转为浮点值
sf[1.5:4.5]
sf[1:4]
sf.loc[3]  # label，不是位置索引
# sf[3.2]  # 错误，传入值必须在labels中

1.5    0
2.0    1
3.0    2
4.5    3
dtype: int64






1.5    0
2.0    1
3.0    2
dtype: int64






2

sf.iloc[3]  # 基于位置，传入整数，不能传入浮点数
sf.iloc[3]  # 基于位置，传入整数，不能传入浮点数

# 例子： 有不规则的数据表，其索引类似时间间隔，但数值是浮点型的
dfir1 = pd.DataFrame(np.random.randn(5,2),
                               index=np.arange(5) * 250.0,
                               columns=list('AB'))
dfir1
dfir2 = pd.DataFrame(np.random.randn(6,2),
                               index=np.arange(4,10) * 250.1,
                               columns=list('AB'))
dfir2
dfir = pd.concat([dfir1,dfir2])
dfir
# 例子： 有不规则的数据表，其索引类似时间间隔，但数值是浮点型的
dfir1 = pd.DataFrame(np.random.randn(5,2),
                               index=np.arange(5) * 250.0,
                               columns=list('AB'))
dfir1
dfir2 = pd.DataFrame(np.random.randn(6,2),
                               index=np.arange(4,10) * 250.1,
                               columns=list('AB'))
dfir2
dfir = pd.concat([dfir1,dfir2])
dfir

	A	B
0.0	1.158461	0.595743
250.0	1.457556	0.268541
500.0	-0.437650	-0.299700
750.0	-1.095812	-2.079684
1000.0	0.242220	-0.868812

	A	B
1000.4	-0.858327	-0.364968
1250.5	-1.445806	-2.129608
1500.6	0.799049	1.232102
1750.7	-1.132538	0.283472
2000.8	-1.157884	0.398119
2250.9	-1.330821	-0.563333

	A	B
0.0	1.158461	0.595743
250.0	1.457556	0.268541
500.0	-0.437650	-0.299700
750.0	-1.095812	-2.079684
1000.0	0.242220	-0.868812
1000.4	-0.858327	-0.364968
1250.5	-1.445806	-2.129608
1500.6	0.799049	1.232102
1750.7	-1.132538	0.283472
2000.8	-1.157884	0.398119
2250.9	-1.330821	-0.563333

# 选取第1秒前的数据
dfir[:1000]
# 选取第1秒前的数据
dfir[:1000]

	A	B
0.0	1.158461	0.595743
250.0	1.457556	0.268541
500.0	-0.437650	-0.299700
750.0	-1.095812	-2.079684
1000.0	0.242220	-0.868812

# IntervalIndex  区间索引  (数学上的开闭区间)
df = pd.DataFrame({'A': [1, 2, 3, 4]},
                  index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]))
df
# IntervalIndex  区间索引  (数学上的开闭区间)
df = pd.DataFrame({'A': [1, 2, 3, 4]},
                  index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]))
df

	A
(0, 1]	1
(1, 2]	2
(2, 3]	3
(3, 4]	4

df.loc[2]  # loc 可以是区间的边缘
df.loc[2.5]
df.loc[1.5:2.5]
df.loc[2]  # loc 可以是区间的边缘
df.loc[2.5]
df.loc[1.5:2.5]

A    2
Name: (1, 2], dtype: int64






A    3
Name: (2, 3], dtype: int64

	A
(1, 2]	2
(2, 3]	3

# Interval and IntervalIndex are used by cut and qcut
# cut 和 qcut 会使用 Interval（区间值）和 IntervalIndex（区间索引）：分箱结果的 categories 就是一个 IntervalIndex
c = pd.cut(range(4), bins=2)
c
c.categories
# Interval and IntervalIndex are used by cut and qcut
# cut 和 qcut 会使用 Interval（区间值）和 IntervalIndex（区间索引）：分箱结果的 categories 就是一个 IntervalIndex
c = pd.cut(range(4), bins=2)
c
c.categories

[(-0.003, 1.5], (-0.003, 1.5], (1.5, 3.0], (1.5, 3.0]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]






IntervalIndex([(-0.003, 1.5], (1.5, 3.0]]
              closed='right',
              dtype='interval[float64]')

pd.cut([0, 3, 5, 1], bins=c.categories)  # 允许一个interval类型去bin（分隔）其他数据
pd.cut([0, 3, 5, 1], bins=c.categories)  # 允许一个interval类型去bin（分隔）其他数据

[(-0.003, 1.5], (1.5, 3.0], NaN, (-0.003, 1.5]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]

# Miscellaneous indexing FAQ 杂项 常见问题


# Miscellaneous indexing FAQ 杂项 常见问题

# Integer indexing  整数型索引
# 整型的索引是label，应满足label的要求
# 在 pandas 中，一般认为标签（label）比整数位置更重要。

s = pd.Series(range(5))
s
# s[-1]  # 异常
# s.loc[-1]  # 异常
s.loc[-1:]  # 允许
s.iloc[-1]  # 允许
df = pd.DataFrame(np.random.randn(5, 4))
df
df.loc[-2:]
# df.loc[-2]  # 异常
# Integer indexing  整数型索引
# 整型的索引是label，应满足label的要求
# 在 pandas 中，一般认为标签（label）比整数位置更重要。

s = pd.Series(range(5))
s
# s[-1]  # 异常
# s.loc[-1]  # 异常
s.loc[-1:]  # 允许
s.iloc[-1]  # 允许
df = pd.DataFrame(np.random.randn(5, 4))
df
df.loc[-2:]
# df.loc[-2]  # 异常

0    0
1    1
2    2
3    3
4    4
dtype: int64






0    0
1    1
2    2
3    3
4    4
dtype: int64






4

	0	1	2	3
0	0.021033	0.127054	-0.864734	-1.835828
1	-0.400611	0.594981	-1.758866	-1.059539
2	-0.108597	0.784000	0.306035	-0.695933
3	-0.078048	-1.742895	-0.159740	0.934115
4	-0.524633	0.433224	-0.732334	0.442827

	0	1	2	3
0	0.021033	0.127054	-0.864734	-1.835828
1	-0.400611	0.594981	-1.758866	-1.059539
2	-0.108597	0.784000	0.306035	-0.695933
3	-0.078048	-1.742895	-0.159740	0.934115
4	-0.524633	0.433224	-0.732334	0.442827

# Non-monotonic indexes require exact matches 非单调索引要求精确匹配
# 如果series或Dataframe的索引是单调增或单调减的，则基于标签的切片可以超出索引的范围。
# 就像对一般python列表list的索引切片。
# 可以用is_monotonic_increasing和is_monotonic_decreasing测试单调属性

df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=list(range(5)))
df
# Non-monotonic indexes require exact matches 非单调索引要求精确匹配
# 如果series或Dataframe的索引是单调增或单调减的，则基于标签的切片可以超出索引的范围。
# 就像对一般python列表list的索引切片。
# 可以用is_monotonic_increasing和is_monotonic_decreasing测试单调属性

df = pd.DataFrame(index=[2,3,3,4,5], columns=['data'], data=list(range(5)))
df

	data
2	0
3	1
3	2
4	3
5	4

df.index.is_monotonic_increasing
df.index.is_monotonic_decreasing
df.index.is_monotonic_increasing
df.index.is_monotonic_decreasing

True






False

df.loc[0:4, :]  # 没有0和1行，但是返回了label为2、3、4的行
df.loc[13:15, :]  # 超出界限，返回空
df.loc[0:4, :]  # 没有0和1行，但是返回了label为2、3、4的行
df.loc[13:15, :]  # 超出界限，返回空

	data
2	0
3	1
3	2
4	3

# 非单调索引，切片必须在index内，而且边界的值必须是唯一的

df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=list(range(6)))
df
df.index.is_monotonic_increasing
# 非单调索引，切片必须在index内，而且边界的值必须是唯一的

df = pd.DataFrame(index=[2,3,1,4,3,5], columns=['data'], data=list(range(6)))
df
df.index.is_monotonic_increasing

	data
2	0
3	1
1	2
4	3
3	4
5	5

False

df.loc[2:4, :]
# df.loc[0:4, :]  # 错误没有0标签
# df.loc[2:3, :]  # 错误，边界标签3不是唯一的
df.loc[2:4, :]
# df.loc[0:4, :]  # 错误没有0标签
# df.loc[2:3, :]  # 错误，边界标签3不是唯一的

	data
2	0
3	1
1	2
4	3

# Index.is_monotonic_increasing 和 Index.is_monotonic_decreasing 是属性（使用时不加括号），只检测弱单调（可以有重复值）
# 结合使用 Index.is_unique 属性，可以检测严格单调性

weakly_monotonic = pd.Index(['a', 'b', 'c', 'c'])
weakly_monotonic
weakly_monotonic.is_monotonic_increasing
weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique
# Index.is_monotonic_increasing 和 Index.is_monotonic_decreasing 是属性（使用时不加括号），只检测弱单调（可以有重复值）
# 结合使用 Index.is_unique 属性，可以检测严格单调性

weakly_monotonic = pd.Index(['a', 'b', 'c', 'c'])
weakly_monotonic
weakly_monotonic.is_monotonic_increasing
weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique

Index(['a', 'b', 'c', 'c'], dtype='object')






True






False

# Endpoints are inclusive 端点（边界）包括在内
# 与标准的python切片（不包括右端点值）相比，pandas中的标签切片包含端点值。
# 主要原因是：对于索引中的某个特定标签，往往无法轻易确定它的“后继”（即下一个元素）
# Endpoints are inclusive 端点（边界）包括在内
# 与标准的python切片（不包括右端点值）相比，pandas中的标签切片包含端点值。
# 主要原因是：对于索引中的某个特定标签，往往无法轻易确定它的“后继”（即下一个元素）

s = pd.Series(np.random.randn(6), index=list('abcdef'))
s
s = pd.Series(np.random.randn(6), index=list('abcdef'))
s

a    1.280483
b    1.562738
c    0.904503
d   -0.470785
e   -0.008048
f   -0.413812
dtype: float64

s[2:5]  # 基于整型的索引，与既有标签的不同，不包括右端点
s[2:5]  # 基于整型的索引，与既有标签的不同，不包括右端点

c    0.904503
d   -0.470785
e   -0.008048
dtype: float64

# 如果用标签，不容易取得下一个标签
# s.loc['c':'e'+1]  # 错误
s.loc['c':'e']
# 如果用标签，不容易取得下一个标签
# s.loc['c':'e'+1]  # 错误
s.loc['c':'e']

c    0.904503
d   -0.470785
e   -0.008048
dtype: float64

# Indexing potentially changes underlying Series dtype 
# 索引操作可能会改变 Series 底层的数据类型（dtype）
# The different indexing operation can potentially change the dtype of a Series.
# 不同的索引操作可能会潜在的改变series的类型

series1 = pd.Series([1, 2, 3])
series1.dtype  # int
series1
res = series1.reindex([0, 4])
res.dtype  # float
res
# Indexing potentially changes underlying Series dtype 
# 索引操作可能会改变 Series 底层的数据类型（dtype）
# The different indexing operation can potentially change the dtype of a Series.
# 不同的索引操作可能会潜在的改变series的类型

series1 = pd.Series([1, 2, 3])
series1.dtype  # int
series1
res = series1.reindex([0, 4])
res.dtype  # float
res

dtype('int64')






0    1
1    2
2    3
dtype: int64






dtype('float64')






0    1.0
4    NaN
dtype: float64

series2 = pd.Series([True])
series2.dtype  # 布尔类型
series2
res = series2.reindex_like(series1)
res.dtype  # 'O' 型，即 object 类型（是字母 O，不是数字 0）
res
series2 = pd.Series([True])
series2.dtype  # 布尔类型
series2
res = series2.reindex_like(series1)
res.dtype  # 'O' 型，即 object 类型（是字母 O，不是数字 0）
res

dtype('bool')






0    True
dtype: bool






dtype('O')






0    True
1     NaN
2     NaN
dtype: object

# 由于默认插入NaN，引起了dtype的改变。
# 在使用 numpy.logical_and 等 NumPy ufunc 时，这可能会引发一些问题
# 由于默认插入NaN，引起了dtype的改变。
# 在使用 numpy.logical_and 等 NumPy ufunc 时，这可能会引发一些问题

#  2018-02-22

#  2018-02-22

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：lua 脚本不为null lua脚本使用教程

下一篇：centos6 安装QQ centos6.10安装

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

python 多级索引表格合并 python 多重索引

python 多级索引表格合并 python 多重索引

51CTO博客