``````import pandas as pd
import numpy as np

# 允许多个print输出在一个单元格中
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"``````

## 1. How to create a series from a list, numpy array and dict?

``````a_list = list("abcdefg")
numpy_array = np.arange(1, 10)
dictionary = {"A":  0, "B":1, "C":2, "D":3, "E":5}``````
``a_list``
``['a', 'b', 'c', 'd', 'e', 'f', 'g']``
``numpy_array``
``array([1, 2, 3, 4, 5, 6, 7, 8, 9])``
``dictionary``
``{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 5}``

``````s1 = pd.Series(a_list)
s1``````
``````0    a
1    b
2    c
3    d
4    e
5    f
6    g
dtype: object``````
``````s2 = pd.Series(numpy_array)
s2``````
``````0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int32``````
``````s3 = pd.Series(dictionary)
s3``````
``````A    0
B    1
C    2
D    3
E    5
dtype: int64``````

## 2. How to combine many series to form a dataframe?

``````ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

ser2 = pd.Series(np.arange(26))``````

``````# 方式1

ser_df = pd.DataFrame(ser1, ser2).reset_index()
 index 0 0 0 a 1 1 b 2 2 c 3 3 e 4 4 d

``````# 方式2

ser_df = pd.DataFrame({"col1":ser1, "col2":ser2})
 col1 col2 0 a 0 1 b 1 2 c 2 3 e 3 4 d 4

``````# 方式3

ser_df = pd.concat([ser1, ser2], axis = 1)
 0 1 0 a 0 1 b 1 2 c 2 3 e 3 4 d 4

## 3. How to get the items of series A not present in series B?

``````ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])``````

``ser1[~ser1.isin(ser2)]``
``````0    1
1    2
2    3
dtype: int64``````

## 4. How to get the items not common to both series A and series B?

``````# 模拟数据

ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])``````

``````# 找出AB中各自的独有部分
a_not_b = ser1[~ser1.isin(ser2)]  # 结果为123
b_not_a = ser2[~ser2.isin(ser1)]  # 结果为678``````
``a_not_b``
``````0    1
1    2
2    3
dtype: int64``````
``b_not_a``
``````2    6
3    7
4    8
dtype: int64``````

``pd.concat([a_not_b,b_not_a],ignore_index=True)``
``````0    1
1    2
2    3
3    6
4    7
5    8
dtype: int64``````

``a_not_b.append(b_not_a, ignore_index = True)``
``````0    1
1    2
2    3
3    6
4    7
5    8
dtype: int64``````

``````ser_u = pd.Series(np.union1d(ser1, ser2))  # 求并集
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # 求交集``````
``ser_u  # 并集``
``````0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64``````
``ser_i  # 交集``
``````0    4
1    5
dtype: int64``````

``ser_u[~ser_u.isin(ser_i)]  # 成员判断``
``````0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64``````

## 5. How to get useful infos?

``````state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))
ser``````
``````0      1.251173
1     11.713402
2     15.765179
3      8.737820
4     14.906604
5     12.571094
6     11.105898
7      4.649783
8      9.052521
9     11.275007
10     7.709865
11    12.175817
12     7.082025
13    14.084235
14    13.363604
15     9.477944
16     7.343598
17    15.148663
18     7.809322
19     4.408409
20    18.094908
21    17.708026
22     8.740604
23     5.787821
24    10.922593
dtype: float64``````
``````# 使用describe

ser.describe()``````
``````count    25.000000
mean     10.435437
std       4.253118
min       1.251173
25%       7.709865
50%      10.922593
75%      13.363604
max      18.094908
dtype: float64``````

## 6. How to get frequency counts of unique items of a series?

``````ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser``````
``````0     d
1     h
2     f
3     g
4     f
5     f
6     d
7     e
8     a
9     a
10    a
11    b
12    f
13    h
14    e
15    e
16    e
17    e
18    d
19    h
20    h
21    c
22    f
23    h
24    f
25    b
26    d
27    c
28    b
29    c
dtype: object``````

``ser.value_counts()``
``````f    6
h    5
e    5
d    4
b    3
c    3
a    3
g    1
dtype: int64``````

## 7. How to convert a numpy array to a dataframe of given shape? (L1)

``````# 1到35间有放回的选择35个数据

ser = pd.Series(np.random.randint(1, 10, 35))
ser``````
``````0     4
1     9
2     4
3     8
4     5
5     4
6     8
7     5
8     7
9     5
10    8
11    3
12    1
13    7
14    7
15    1
16    3
17    2
18    8
19    7
20    6
21    3
22    7
23    4
24    8
25    4
26    9
27    9
28    4
29    2
30    3
31    7
32    5
33    6
34    1
dtype: int32``````
``pd.DataFrame(np.array(ser).reshape(7, 5))``
 0 1 2 3 4 0 4 9 4 8 5 1 4 8 5 7 5 2 8 3 1 7 7 3 1 3 2 8 7 4 6 3 7 4 8 5 4 9 9 4 2 6 3 7 5 6 1
``pd.DataFrame(ser.values.reshape(7, 5))``
 0 1 2 3 4 0 4 9 4 8 5 1 4 8 5 7 5 2 8 3 1 7 7 3 1 3 2 8 7 4 6 3 7 4 8 5 4 9 9 4 2 6 3 7 5 6 1

## 8. How to find the positions of numbers that are multiples of 3 from a series?

``````# 模拟数据

np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, 10))
ser``````
``````RandomState(MT19937) at 0x2AEECC26440

0    3
1    1
2    2
3    1
4    4
5    4
6    3
7    1
8    2
9    1
dtype: int32``````

``ser.where(lambda x: x%3 == 0)``
``````0    3.0
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    3.0
7    NaN
8    NaN
9    NaN
dtype: float64``````

``ser.where(lambda x: x%3 == 0).dropna()``
``````0    3.0
6    3.0
dtype: float64``````

## 9. How to extract items at given positions from a series

``````ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]  # 指定位置``````

``ser.loc[pos]``
``````0     a
4     e
8     i
14    o
20    u
dtype: object``````

``ser.take(pos)``
``````0     a
4     e
8     i
14    o
20    u
dtype: object``````

## 10. How to stack two series vertically and horizontally ?

``````ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))``````
``ser1``
``````0    0
1    1
2    2
3    3
4    4
dtype: int64``````
``ser2``
``````0    a
1    b
2    c
3    d
4    e
dtype: object``````

``ser1.append(ser2)``
``````0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object``````
``pd.concat([ser1, ser2], axis = 0)``
``````0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object``````

``pd.concat([ser1, ser2], axis = 1)``
 0 1 0 0 a 1 1 b 2 2 c 3 3 d 4 4 e

``np.stack([ser1,ser2])``
``````array([[0, 1, 2, 3, 4],
['a', 'b', 'c', 'd', 'e']], dtype=object)``````
``np.vstack([ser1,ser2])``
``````array([[0, 1, 2, 3, 4],
['a', 'b', 'c', 'd', 'e']], dtype=object)``````
``np.hstack([ser1,ser2])``
``array([0, 1, 2, 3, 4, 'a', 'b', 'c', 'd', 'e'], dtype=object)``
``np.dstack([ser1,ser2])``
``````array([[[0, 'a'],
[1, 'b'],
[2, 'c'],
[3, 'd'],
[4, 'e']]], dtype=object)``````
``np.concatenate([ser1,ser2],axis=0)``
``array([0, 1, 2, 3, 4, 'a', 'b', 'c', 'd', 'e'], dtype=object)``
``np.append(ser1,ser2)``
``array([0, 1, 2, 3, 4, 'a', 'b', 'c', 'd', 'e'], dtype=object)``

## 11. How to get the positions of items of series A in another series B?

``````ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])``````

``list(ser1[ser1.isin(ser2)].index)``
``[0, 4, 5, 8]``

``[np.where(i == ser1)[0].tolist()[0] for i in ser2]``
``[5, 4, 0, 8]``

``[pd.Index(ser1).get_loc(i) for i in ser2]``
``[5, 4, 0, 8]``
``pd.Index(ser1)  # 创建Int64Index类型的索引数据``
``Int64Index([10, 9, 6, 5, 3, 1, 12, 8, 13], dtype='int64')``
``## 12. How to compute difference of differences between consequtive numbers of a series?``

``ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])``
``ser.diff(periods = 1).tolist()``
``[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]``

``ser.diff(periods = 1).diff(periods = 1).tolist()``
``[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]``

## 13. How to convert a series of date-strings to a timeseries?

``````ser = pd.Series(['01 Jan 2020', '02-02-2021', '20220303',
'2023/04/04', '2020-05-05', '2022-06-06T12:20'])
ser.dtype``````
``dtype('O')``
``````# 1、直接使用pandas.to_datetime

pd.to_datetime(ser)``````
``````0   2020-01-01 00:00:00
1   2021-02-02 00:00:00
2   2022-03-03 00:00:00
3   2023-04-04 00:00:00
4   2020-05-05 00:00:00
5   2022-06-06 12:20:00
dtype: datetime64[ns]``````
``````# 使用python的dateutil.parser解析包

from dateutil.parser import parse
ser.map(lambda x: parse(x))``````
``````0   2020-01-01 00:00:00
1   2021-02-02 00:00:00
2   2022-03-03 00:00:00
3   2023-04-04 00:00:00
4   2020-05-05 00:00:00
5   2022-06-06 12:20:00
dtype: datetime64[ns]``````

## 14. How to filter words that contain atleast 2 vowels from a series?

``ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])``
``````vowels = list("aeiou")

list_ = []

for i in ser:
c = 0  # 计数器
for l in list(i.lower()):  # 将遍历的数据全部转成小写
if l in vowels:  #  如果数据在元音列表中
c += 1  # 计数器加1
if c >= 2:  # 循环完后计数器大于1
list_.append(i)  # 将满足要求的数据添加到列表中
list_``````
``['Apple', 'Orange', 'Money']``
``ser[ser.isin(list_)]``
``````0     Apple
1    Orange
4     Money
dtype: object``````

``````from collections import Counter

mask = ser.map(lambda x: sum([Counter(x.lower()).get(i,0) for i in list("aeiou")]) >= 2)
``````0     Apple
1    Orange
4     Money
dtype: object``````
``[Counter("Apple".lower()).get(i,0) for i in list("aeiou")]``
``[1, 1, 0, 0, 0]``
``[Counter("Python".lower()).get(i,0) for i in list("aeiou")]``
``[0, 0, 0, 1, 0]``

## 15. How to replace missing spaces in a string with the least frequent character?

``my_str = 'dbc deb abed ggade'``

``````ser = pd.Series(list(my_str.replace(" ", "")))
ser``````
``````0     d
1     b
2     c
3     d
4     e
5     b
6     a
7     b
8     e
9     d
10    g
11    g
12    a
13    d
14    e
dtype: object``````

``ser.value_counts()``
``````d    4
b    3
e    3
a    2
g    2
c    1
dtype: int64``````
``````mini = ser.value_counts().index[-1]
mini``````
``'c'``

``my_str.replace(" ", mini)``
``'dbccdebcabedcggade'``

``````from collections import Counter

my_str_ = my_str  # 副本``````

``````Counter_ = Counter(list(my_str_.replace(" ", "")))
Counter_``````
``Counter({'d': 4, 'b': 3, 'c': 1, 'e': 3, 'a': 2, 'g': 2})``

``````mini = min(Counter_, key = Counter_.get)
mini``````
``'c'``
``my_str.replace(" ", mini)``
``'dbccdebcabedcggade'``

## 16. How to change column values when importing csv to a dataframe?

``````df = pd.read_csv(
"housing_preprocessed.csv",
index_col = 0,
skipfooter=1,
converters = {"MEDV": lambda x: "HIGH" if float(x) >= 25 else "LOW"}  # 关键代码
)``````

## 17. How to import only specified columns from a csv file?

``````df = pd.read_csv(file, usecols = [1, 2, 4], skipfooter=1)  # 索引号
df = pd.read_csv(file, usecols = ["CRIM", "ZN", "CHAS"])  # 列名``````

## 18. How to check if a dataframe has any missing values?

``````df.isnull()  # 查看每个位置是否为空值；如果是用True，否则是False

df.isnull().sum()  # 判断每列有多少个空值

df.isnull().values.any()  # 判断数据中是否至少存在一个空值

df.isnull().values.any(axis=0)  # 显示每列是否至少有一个空值

df.isnull().values.any(axis=1)  # 显示每行是否至少存在一个空值``````

## 19. How to replace missing values of multiple numeric columns with the mean?

``````df = pd.DataFrame({"col":[1,2,3,None]})
df``````
 col 0 1.0 1 2.0 2 3.0 3 NaN
``````# 均值

(1 + 2 + 3) / 3``````
``2.0``
``````df[["col"]] = df[["col"]].apply(lambda x: x.fillna(x.mean()))

df``````
 col 0 1.0 1 2.0 2 3.0 3 2.0

## 20. How to change the order of columns of a dataframe?

``````# 模拟数据

df = pd.DataFrame(np.arange(20).reshape(-1, 5),
columns=list('abcde'))
df``````
 a b c d e 0 0 1 2 3 4 1 5 6 7 8 9 2 10 11 12 13 14 3 15 16 17 18 19
``````# 方式1：直接认为指定顺序

df[["c", "b", "a", "d", "e"]]``````
 c b a d e 0 2 1 0 3 4 1 7 6 5 8 9 2 12 11 10 13 14 3 17 16 15 18 19
``````# 方式2：交换两个列的位置
def change_cols(df, col1, col2):
df_columns = df.columns.to_list()
index1 = df_columns.index(col1)
index2 = df_columns.index(col2)

df_columns[index1], df_columns[index2] = col2, col1

return df[df_columns]

# 交换be两列的数据的位置
df = change_cols(df, "b", "e")
df``````
 a e c d b 0 0 4 2 3 1 1 5 9 7 8 6 2 10 14 12 13 11 3 15 19 17 18 16
``````# 方式3：翻转列名

df_columns = df.columns
df_columns``````
``Index(['a', 'e', 'c', 'd', 'b'], dtype='object')``
``````df_columns_reversed = df_columns[::-1]  # 翻转
df_columns_reversed``````
``Index(['b', 'd', 'c', 'e', 'a'], dtype='object')``
``df[df_columns_reversed]  # 翻转后的数据``
 b d c e a 0 1 3 2 4 0 1 6 8 7 9 5 2 11 13 12 14 10 3 16 18 17 19 15

## 21. How to filter every nth row in a dataframe?

``````df = pd.read_csv("iris.csv",usecols=['sepal_length', 'sepal_width', 'petal_length'])
df``````
 sepal_length sepal_width petal_length 0 5.1 3.5 1.4 1 4.9 3.0 1.4 2 4.7 3.2 1.3 3 4.6 3.1 1.5 4 5.0 3.6 1.4 ... ... ... ... 145 6.7 3.0 5.2 146 6.3 2.5 5.0 147 6.5 3.0 5.2 148 6.2 3.4 5.4 149 5.9 3.0 5.1

150 rows × 3 columns

``df.columns``
``Index(['sepal_length', 'sepal_width', 'petal_length'], dtype='object')``

``df[::20]``
 sepal_length sepal_width petal_length 0 5.1 3.5 1.4 20 5.4 3.4 1.7 40 5.0 3.5 1.3 60 5.0 2.0 3.5 80 5.5 2.4 3.8 100 6.3 3.3 6.0 120 6.9 3.2 5.7 140 6.7 3.1 5.6
``````# 等价
df.iloc[::20,:]  # 行方向是间隔20行，全部列``````
 sepal_length sepal_width petal_length 0 5.1 3.5 1.4 20 5.4 3.4 1.7 40 5.0 3.5 1.3 60 5.0 2.0 3.5 80 5.5 2.4 3.8 100 6.3 3.3 6.0 120 6.9 3.2 5.7 140 6.7 3.1 5.6

## 22. How to get the last n rows of a dataframe with row sum > 100?

``````df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))
df1 = df.copy(deep = True)  # 副本

df``````
 0 1 2 3 0 20 10 24 25 1 22 30 24 10 2 32 21 37 21 3 28 21 14 29 4 33 19 36 26 5 39 18 28 25 6 16 35 14 17 7 29 13 21 30 8 22 26 28 29 9 37 28 16 28 10 24 33 37 33 11 30 30 29 39 12 13 19 21 28 13 36 38 34 30 14 23 10 22 21

``````df["sum"] = df.sum(axis=1)
df``````
 0 1 2 3 sum 0 20 10 24 25 79 1 22 30 24 10 86 2 32 21 37 21 111 3 28 21 14 29 92 4 33 19 36 26 114 5 39 18 28 25 110 6 16 35 14 17 82 7 29 13 21 30 93 8 22 26 28 29 105 9 37 28 16 28 109 10 24 33 37 33 127 11 30 30 29 39 128 12 13 19 21 28 81 13 36 38 34 30 138 14 23 10 22 21 76
``(df[df["sum"] > 100].index).to_list()[-2:]  # 最后两行``
``[11, 13]``
``print("The index of the rows that are greater than 100 are {}".format((df[df["sum"] > 100].index).to_list()[-2:]))``
``The index of the rows that are greater than 100 are [11, 13]``
``df.iloc[(df[df["sum"] > 100].index).to_list()[-2:]]  # 最后两行``
 0 1 2 3 sum 11 30 30 29 39 128 13 36 38 34 30 138

``````rowsums = df1.apply(np.sum, axis=1)  # 安装行求和
rowsums``````
``````0      79
1      86
2     111
3      92
4     114
5     110
6      82
7      93
8     105
9     109
10    127
11    128
12     81
13    138
14     76
dtype: int64``````

``np.where(rowsums > 100)``
``(array([ 2,  4,  5,  8,  9, 10, 11, 13], dtype=int64),)``
``np.where(rowsums > 100)[0]``
``array([ 2,  4,  5,  8,  9, 10, 11, 13], dtype=int64)``
``np.where(rowsums > 100)[0][-2:]``
``array([11, 13], dtype=int64)``
``df1.iloc[np.where(rowsums > 100)[0][-2:], :]``
 0 1 2 3 11 30 30 29 39 13 36 38 34 30

## 23. How to find and cap outliers from a series or dataframe column?

``````# 模拟数据
ser = pd.Series(np.logspace(-2, 2, 30))
ser1 = ser.copy(deep = True)  # 副本，深拷贝
ser2 = ser.copy(deep = True)

ser``````
``````0       0.010000
1       0.013738
2       0.018874
3       0.025929
4       0.035622
5       0.048939
6       0.067234
7       0.092367
8       0.126896
9       0.174333
10      0.239503
11      0.329034
12      0.452035
13      0.621017
14      0.853168
15      1.172102
16      1.610262
17      2.212216
18      3.039195
19      4.175319
20      5.736153
21      7.880463
22     10.826367
23     14.873521
24     20.433597
25     28.072162
26     38.566204
27     52.983169
28     72.789538
29    100.000000
dtype: float64``````

``````quantiles = np.quantile(ser, [0.05, 0.95])
quantiles``````
``array([1.60492941e-02, 6.38766722e+01])``

``````ser1[ser1 < quantiles[0]] = quantiles[0]
ser1[ser1 > quantiles[1]] = quantiles[1]

# 等价
# ser.iloc[np.where(ser < quantiles[0])] = quantiles[0]
# ser.iloc[np.where(ser > quantiles[1])] = quantiles[1]``````
``ser1``
``````0      0.016049
1      0.016049
2      0.018874
3      0.025929
4      0.035622
5      0.048939
6      0.067234
7      0.092367
8      0.126896
9      0.174333
10     0.239503
11     0.329034
12     0.452035
13     0.621017
14     0.853168
15     1.172102
16     1.610262
17     2.212216
18     3.039195
19     4.175319
20     5.736153
21     7.880463
22    10.826367
23    14.873521
24    20.433597
25    28.072162
26    38.566204
27    52.983169
28    63.876672
29    63.876672
dtype: float64``````

``````def cap_outliers(ser, low_perc, high_perc):
low, high = ser.quantile([low_perc, high_perc]) # 指定分位数

ser[ser < low] = low  # 小于low部分全部赋值为low
ser[ser > high] = high  # 大于hight部分全部赋值为high
return ser``````
``````capped_ser = cap_outliers(ser2, .05, .95)
capped_ser``````
``````0      0.016049
1      0.016049
2      0.018874
3      0.025929
4      0.035622
5      0.048939
6      0.067234
7      0.092367
8      0.126896
9      0.174333
10     0.239503
11     0.329034
12     0.452035
13     0.621017
14     0.853168
15     1.172102
16     1.610262
17     2.212216
18     3.039195
19     4.175319
20     5.736153
21     7.880463
22    10.826367
23    14.873521
24    20.433597
25    28.072162
26    38.566204
27    52.983169
28    63.876672
29    63.876672
dtype: float64``````

## 24. How to reverse the rows of a dataframe?

``````df = pd.DataFrame(np.arange(15).reshape(3, -1))
df``````
 0 1 2 3 4 0 0 1 2 3 4 1 5 6 7 8 9 2 10 11 12 13 14

``df.index.to_list()[::-1]  # 行索引翻转``
``[2, 1, 0]``
``df.iloc[df.index.to_list()[::-1]]``
 0 1 2 3 4 2 10 11 12 13 14 1 5 6 7 8 9 0 0 1 2 3 4

``df.iloc[::-1, :]``
 0 1 2 3 4 2 10 11 12 13 14 1 5 6 7 8 9 0 0 1 2 3 4
``df.loc[df.index[::-1], :]``
 0 1 2 3 4 2 10 11 12 13 14 1 5 6 7 8 9 0 0 1 2 3 4