# -*- coding: utf-8 -*-
"""
Created on Tue Jun 11 14:27:05 2019
@author: wuxian
"""
list6 = [1, 5, 2, 8, 10, 13, 17, 4, 6]
result = []
for i in list6:
    y = i ** 2 + 1
    result.append(y)
print(result)
# map returns a lazy iterator in Python 3; wrap it in list() to materialize the values
y = list(map(lambda x: x ** 2 + 1, list6))
# A Python list has no apply method; the list(map(...)) call above is the equivalent of the loop
li = [1, 2, 3, 4, 5]
# Add 1 to every element of the sequence
list(map(lambda x: x + 1, li))  # [2, 3, 4, 5, 6]
li
# Return the even numbers in the sequence
list(filter(lambda x: x % 2 == 0, li))  # [2, 4]
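# List comprehensions are an equivalent, often more readable alternative (a minimal sketch):
[x + 1 for x in li]            # [2, 3, 4, 5, 6]
[x for x in li if x % 2 == 0]  # [2, 4]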
import numpy as np
arr4 = np.array([[1,10,100],[2,20,200],[3,30,300]])
print('Original array:\n', arr4)
# Flatten with the default row-major (C-style) order
print('Flattened array:\n', arr4.ravel())
print(arr4.flatten())
print(arr4.reshape(-1))
# Flatten with column-major (Fortran-style) order
print(arr4.ravel(order = 'F'))
print(arr4.flatten(order = 'F'))
print(arr4.reshape(-1, order = 'F'))
arr3 = np.array([[1,5,7],[3,6,1],[2,4,8],[5,8,9],[1,5,9],[8,5,2]])
# Number of rows and columns of the array
print(arr3.shape)
# Change the shape with the reshape method (returns a new array; arr3 itself is unchanged)
print(arr3.reshape(2,9))
# Print the shape of arr3 again
print(arr3.shape)
# Change the shape with the resize method (modifies arr3 in place and returns None)
arr3.resize(2,9)
# Print the shape of arr3 again
print(arr3.shape)
# Assigning through the flattened result: copy versus view
arr4.flatten()[0] = 2000      # flatten returns a copy, so arr4 is unchanged
print('flatten method:\n', arr4)
arr4.ravel()[1] = 1000        # ravel returns a view, so arr4 is modified
print('ravel method:\n', arr4)
arr4.reshape(-1)[2] = 3000    # reshape also returns a view here, so arr4 is modified
print('reshape method:\n', arr4)
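# A quick way to confirm which methods return views of arr4 (a sketch using np.shares_memory):
print(np.shares_memory(arr4, arr4.flatten()))    # False: flatten always copies
print(np.shares_memory(arr4, arr4.ravel()))      # True: ravel returns a view when it can
print(np.shares_memory(arr4, arr4.reshape(-1)))  # True: reshape returns a view when it can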
arr5 = np.array([1,2,3])
print('Vertical concatenation with vstack:\n', np.vstack([arr4,arr5]))
print('Vertical concatenation with row_stack:\n', np.row_stack([arr4,arr5]))
arr6 = np.array([[5],[15],[25]])
print('Horizontal concatenation with hstack:\n', np.hstack([arr4,arr6]))
print('Horizontal concatenation with column_stack:\n', np.column_stack([arr4,arr6]))
print(arr4)
print('Column sums (down the vertical axis):\n', np.sum(arr4, axis = 0))
print('Row sums (across the horizontal axis):\n', np.sum(arr4, axis = 1))
# Addition
math = np.array([98,83,86,92,67,82])
english = np.array([68,74,66,82,75,89])
chinese = np.array([92,83,76,85,87,77])
tot_symbol = math + english + chinese
tot_fun = np.add(np.add(math, english), chinese)
print('Addition with the + operator:\n', tot_symbol)
print('Addition with np.add:\n', tot_fun)
# Division
height = np.array([165,177,158,169,173])
weight = np.array([62,73,59,72,80])
BMI_symbol = weight/(height/100)**2
BMI_fun = np.divide(weight, np.divide(height,100)**2)
print('Division with the / operator:\n', BMI_symbol)
print('Division with np.divide:\n', BMI_fun)
arr7 = np.array([[1,2,10],[10,8,3],[7,6,5]])
arr8 = np.array([[2,2,2],[3,3,3],[4,4,4]])
print('Array arr7:\n', arr7)
print('Array arr8:\n', arr8)
# Remainder
print('Remainder:\n', arr7 % arr8)
# Floor division
print('Floor division:\n', arr7 // arr8)
# Exponentiation
print('Exponentiation:\n', arr7 ** arr8)
# Integer part of the quotient (np.modf returns the fractional and integer parts)
np.modf(arr7/arr8)[1]
# The inputs have the same number of dimensions and matching sizes
arr10 = np.arange(12).reshape(3,4)
arr11 = np.arange(101,113).reshape(3,4)
print('Element-wise sum of two 3x4 arrays:\n', arr10 + arr11)
# The inputs have different numbers of dimensions, but the trailing dimensions match
arr12 = np.arange(60).reshape(5,4,3)
arr10 = np.arange(12).reshape(4,3)
print('Different numbers of dimensions, matching trailing dimensions:\n', arr12 + arr10)
# The inputs have different numbers of dimensions and a mismatched size, but that size is 1
arr12 = np.arange(60).reshape(5,4,3)
arr13 = np.arange(4).reshape(4,1)
print('Different dimensions and sizes, but every mismatched size is 1:\n', arr12 + arr13)
# A missing leading dimension is padded with 1
arr14 = np.array([5,15,25])
print('arr14 is broadcast as if it had shape (1,3):\n', arr10 + arr14)
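# np.broadcast_shapes (NumPy >= 1.20) reports the broadcast result shape without building any
# arrays -- a quick sanity check for the rules illustrated above (a sketch):
print(np.broadcast_shapes((5, 4, 3), (4, 3)))  # (5, 4, 3)
print(np.broadcast_shapes((5, 4, 3), (4, 1)))  # (5, 4, 3)
print(np.broadcast_shapes((4, 3), (3,)))       # (4, 3)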
np.arange(0.9, 7, 7)  # the step (7) exceeds the range, so this yields array([0.9])
from numpy import random
randArray = random.random(size=(2,4))  # a 2x4 array of uniform random numbers on [0, 1)
# Dot product of two one-dimensional arrays
vector_dot = np.dot(np.array([1,2,3]), np.array([4,5,6]))
print('Dot product of the 1-D arrays:\n', vector_dot)
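# Equivalent manual computation (a sketch): multiply element-wise, then sum
print(np.sum(np.array([1,2,3]) * np.array([4,5,6])))  # 32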
# Matrix multiplication of two-dimensional arrays
print('The two 2-D arrays:')
print(arr10)
print(arr11)
arr2d = np.dot(arr10, arr11)
print('Matrix product of the 2-D arrays:\n', arr2d)
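# Since Python 3.5 the @ operator performs the same matrix multiplication (a sketch):
print(arr10 @ arr11)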
# Using diag
arr15 = np.arange(16).reshape(4,-1)
print('A 4x4 matrix:\n', arr15)
print('Main diagonal of the matrix:\n', np.diag(arr15))
print('Diagonal matrix built from a 1-D array:\n', np.diag(np.array([5,15,25])))
# Eigenvalues and eigenvectors of a square matrix
arr16 = np.array([[1,2,5],[3,6,8],[4,7,9]])
print('The 3x3 matrix whose eigenvalues and eigenvectors are computed:\n', arr16)
print('Result of the decomposition:\n', np.linalg.eig(arr16))
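# A quick check of the decomposition (a sketch): A @ v should equal w * v for every eigenpair,
# where w holds the eigenvalues and the columns of v hold the eigenvectors
w, v = np.linalg.eig(arr16)
print(np.allclose(arr16 @ v, v * w))  # True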
# Partial regression coefficients via the normal equations: beta = (X'X)^(-1) X'Y
X = np.array([[1,1,4,3],[1,2,7,6],[1,2,6,6],[1,3,8,7],[1,2,5,8],[1,3,7,5],[1,6,10,12],[1,5,7,7],[1,6,3,4],[1,5,7,8]])
Y = np.array([3.2,3.8,3.7,4.3,4.4,5.2,6.7,4.8,4.2,5.1])
X.shape
Y.shape
X_trans_X_inverse = np.linalg.inv(np.dot(np.transpose(X), X))
np.dot(X_trans_X_inverse, np.transpose(X)).shape
beta = np.dot(np.dot(X_trans_X_inverse, np.transpose(X)), Y)
np.dot(np.dot(X_trans_X_inverse, np.transpose(X)), Y).shape
print('Partial regression coefficients:\n', beta)
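# Cross-check (a sketch): np.linalg.lstsq solves the same least-squares problem without
# explicitly inverting X'X, which is numerically more stable, and should return the same beta
beta_lstsq, *_ = np.linalg.lstsq(X, Y, rcond=None)
print(np.allclose(beta, beta_lstsq))  # True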
# Solving a system of linear equations
A = np.array([[3,2,1],[2,3,1],[1,2,3]])
b = np.array([39,34,26])
X = np.linalg.solve(A, b)
print('Solution of the system of three linear equations:\n', X)
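# Verify the solution (a sketch): A @ X should reproduce b
print(np.allclose(np.dot(A, X), b))  # True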
# Computing norms
arr17 = np.array([1,3,5,7,9,10,-12])
# L1 norm
res1 = np.linalg.norm(arr17, ord = 1)
print('L1 norm of the vector:\n', res1)
# L2 norm
res2 = np.linalg.norm(arr17, ord = 2)
print('L2 norm of the vector:\n', res2)
# Infinity norm
res3 = np.linalg.norm(arr17, ord = np.inf)
print('Infinity norm of the vector:\n', res3)
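# The same norms computed from their definitions (a sketch):
print(np.sum(np.abs(arr17)))        # L1: sum of absolute values
print(np.sqrt(np.sum(arr17 ** 2)))  # L2: square root of the sum of squares
print(np.max(np.abs(arr17)))        # infinity norm: largest absolute value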
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# Generate normally distributed random numbers with different parameters
np.random.seed(1234)
rn1 = np.random.normal(loc = 0, scale = 1, size = 1000)
rn2 = np.random.normal(loc = 0, scale = 2, size = 1000)
rn3 = np.random.normal(loc = 2, scale = 3, size = 1000)
rn4 = np.random.normal(loc = 5, scale = 3, size = 1000)
# Plot the fitted normal density curves
# (sns.distplot is deprecated in newer seaborn releases, but is kept here because its fit
#  argument has no direct equivalent in the replacement displot/kdeplot API)
plt.style.use('ggplot')
sns.distplot(rn1, hist = False, kde = False, fit = stats.norm,
             fit_kws = {'color':'black','label':'u=0,s=1','linestyle':'-'})
sns.distplot(rn2, hist = False, kde = False, fit = stats.norm,
             fit_kws = {'color':'red','label':'u=0,s=2','linestyle':'--'})
sns.distplot(rn3, hist = False, kde = False, fit = stats.norm,
             fit_kws = {'color':'blue','label':'u=2,s=3','linestyle':':'})
sns.distplot(rn4, hist = False, kde = False, fit = stats.norm,
             fit_kws = {'color':'purple','label':'u=5,s=3','linestyle':'-.'})
# Show the legend
plt.legend()
# Show the figure
plt.show()
# Generate exponentially distributed random numbers with different scales
# (note: numpy's scale parameter is 1/lambda)
np.random.seed(1234)
re1 = np.random.exponential(scale = 0.5, size = 1000)
re2 = np.random.exponential(scale = 1, size = 1000)
re3 = np.random.exponential(scale = 1.5, size = 1000)
# Plot the fitted exponential density curves
sns.distplot(re1, hist = False, kde = False, fit = stats.expon,
             fit_kws = {'color':'black','label':'scale=0.5','linestyle':'-'})
sns.distplot(re2, hist = False, kde = False, fit = stats.expon,
             fit_kws = {'color':'red','label':'scale=1','linestyle':'--'})
sns.distplot(re3, hist = False, kde = False, fit = stats.expon,
             fit_kws = {'color':'blue','label':'scale=1.5','linestyle':':'})
# Show the legend
plt.legend()
# Show the figure
plt.show()
random.uniform(0.9, 1, 10)  # ten uniform random draws from [0.9, 1)
# Import the modules
import pandas as pd
import numpy as np
# Construct Series
gdp1 = pd.Series([2.8,3.01,8.99,8.59,5.18])
gdp2 = pd.Series({'北京':2.8,'上海':3.01,'广东':8.99,'江苏':8.59,'浙江':5.18})
gdp3 = pd.Series(np.array((2.8,3.01,8.99,8.59,5.18)))
np.array((2.8,3.01,8.99,8.59,5.18)).shape
print(gdp1)
print(gdp2)
print(gdp3)
# Take the first, fourth and fifth elements of gdp1
print('Series with a positional index:\n', gdp1[[0,3,4]])
# Take the first, fourth and fifth elements of gdp2 by position
print('Series with a labelled index, accessed by position:\n', gdp2.iloc[[0,3,4]])
# Take the GDP values of 上海, 江苏 and 浙江 from gdp2 by label
print('Series with a labelled index, accessed by label:\n', gdp2[['上海','江苏','浙江']])
# Mathematical functions: logarithm
print('Via the numpy function:\n', np.log(gdp1))
# Mean GDP
print('Via the numpy function:\n', np.mean(gdp1))
print('Via the Series method:\n', gdp1.mean())
# Construct DataFrames
df1 = pd.DataFrame([['张三',23,'男'],['李四',27,'女'],['王二',26,'女']])
df2 = pd.DataFrame({'姓名':['张三','李四','王二'],'年龄':[23,27,26],'性别':['男','女','女']})
df3 = pd.DataFrame(np.array([['张三',23,'男'],['李四',27,'女'],['王二',26,'女']]))
print('DataFrame built from a nested list:\n', df1)
print('DataFrame built from a dict:\n', df2)
print('DataFrame built from a 2-D array:\n', df3)
# Read data from a text file
user_income = pd.read_table(r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\data_test01.txt', sep = ',',
                            parse_dates={'birthday':[0,1,2]}, skiprows=2, skipfooter=3,
                            comment='#', encoding='utf8', thousands='&')
user_income
# Read data from a spreadsheet, forcing the first column to be read as strings
child_cloth = pd.read_excel(io = r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\data_test02.xlsx', header = None,
                            names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'], converters = {0:str})
child_cloth
# Read the same spreadsheet without the converters argument
pd.read_excel(io = r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\data_test02.xlsx', header = None,
              names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'])
# Import the module
import pymysql
# Connect to the MySQL database
conn = pymysql.connect(host='localhost', user='root', password='1q2w3e4r',
                       database='test', port=3306, charset='utf8')
# Read the data
user = pd.read_sql('select * from topy', conn)
# Close the connection
conn.close()
# Output the data
user
# Import the third-party module
import pymssql
# Connect to the SQL Server database
connect = pymssql.connect(server = 'localhost', user = '', password = '',
                          database = 'train', charset = 'utf8')
# Read the data
data = pd.read_sql("select * from sec_buildings where direction = '朝南'", con=connect)
# Close the connection
connect.close()
# Output the data
data.head()
# Data type conversion and descriptive statistics
# Read the data
files = open(r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\sec_cars.csv')
sec_cars = pd.read_csv(files)
# Preview the first five rows
sec_cars.head()
# Number of rows and columns
print('Shape of the data set:\n', sec_cars.shape)
# Data type of every variable
print('Data types of the variables:\n', sec_cars.dtypes)
# Convert the registration date of the used cars to datetime
sec_cars.Boarding_time = pd.to_datetime(sec_cars.Boarding_time, format = '%Y年%m月')
# Convert the new-car price to a float (the last character of the string is the unit)
sec_cars.New_price = sec_cars.New_price.str[:-1].astype('float')
# Check the data types again
sec_cars.dtypes
# Descriptive statistics
sec_cars.describe()
# Shape characteristics of the distributions
# Pick out the numeric variables
num_variables = sec_cars.columns[sec_cars.dtypes != 'object'][1:]
# Custom function computing skewness and kurtosis
def skew_kurt(x):
    skewness = x.skew()
    kurtosis = x.kurt()
    # Return the skewness and the kurtosis
    return pd.Series([skewness, kurtosis], index = ['Skew','Kurt'])
# Apply the function column by column
sec_cars[num_variables].apply(func = skew_kurt, axis = 0)
# Descriptive statistics for the categorical variables
sec_cars.describe(include = ['object'])
# Frequency table for a categorical variable
Freq = sec_cars.Discharge.value_counts()
Freq_ratio = Freq/sec_cars.shape[0]
Freq_df = pd.DataFrame({'Freq':Freq, 'Freq_ratio':Freq_ratio})
Freq_df.head()
# Turn the row index back into a regular column
Freq_df.reset_index(inplace = True)
Freq_df.head()
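# The proportions can also be obtained directly from value_counts (a sketch):
print(sec_cars.Discharge.value_counts(normalize = True).head())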
# Read the data
df = pd.read_excel(r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\data_test03.xlsx')
# Data type of every variable
df.columns
print(df.dtypes)
# Convert the birthday variable to datetime
df.birthday = pd.to_datetime(df.birthday, format = '%Y/%m/%d')
# Convert the phone number to a string
df.tel = df.tel.astype('str')
# Add age and job-tenure columns
# (pd.Timestamp.today() gives the current date; pd.datetime has been removed from pandas)
df['age'] = pd.Timestamp.today().year - df.birthday.dt.year
df['workage'] = pd.Timestamp.today().year - df.start_work.dt.year
# Mask the middle four digits of the phone number
df.tel = df.tel.apply(func = lambda x : x.replace(x[3:7], '****'))
# Extract the e-mail domain
df['email_domain'] = df.email.apply(func = lambda x : x.split('@')[1])
# Extract the user's major from the other field
df['profession'] = df.other.str.findall('专业:(.*?),')
# Drop the birthday, start_work and other variables
df.drop(['birthday','start_work','other'], axis = 1, inplace = True)
df.head()
# Common datetime accessors
# (no explicit format is given, so pandas parses both the date-time and the date-only string;
#  on pandas >= 2.0 you may need format='mixed' for strings with different layouts.
#  weekofyear and weekday_name have been removed; isocalendar().week and day_name() replace them)
dates = pd.to_datetime(pd.Series(['1989-8-18 13:14:55','1995-2-16']))
print('Date part:\n', dates.dt.date)
print('Quarter:\n', dates.dt.quarter)
print('Hour:\n', dates.dt.hour)
print('Day of the year:\n', dates.dt.dayofyear)
print('Week of the year:\n', dates.dt.isocalendar().week)
print('Weekday name:\n', dates.dt.day_name())
print('Days in the month:\n', dates.dt.days_in_month)
# Data cleaning
# Read the data
df = pd.read_excel(r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\data_test04.xlsx')
df.columns
df.info()
df.appcategory.value_counts()
df.describe(include=['object'])
# Detect duplicated rows
print('Does the data set contain duplicated rows:\n', any(df.duplicated()))
# Drop the duplicates
df.drop_duplicates(inplace = True)
df
# Read the data
df = pd.read_excel(r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\data_test05.xlsx')
# Detect missing values (the builtin any() would iterate over the column names,
# so .values.any() on the boolean mask is used instead)
print('Does the data set contain missing values:\n', df.isnull().values.any())
df.isnull().any(axis=0).sum()
df.apply(lambda x : np.sum(x.isnull()))
# Deletion: drop the rows that contain missing values
df.dropna()
# Deletion: drop a variable
df.drop('age', axis = 1)
# Imputation: forward fill (fillna(method='ffill') is deprecated in newer pandas)
df.ffill()
# Imputation: backward fill
df.bfill()
# Imputation: fill with a constant
df.fillna(value = 0)
# Imputation: fill with summary statistics
df.fillna(value = {'gender':df.gender.mode()[0], 'age':df.age.mean(), 'income':df.income.median()})
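# dropna, ffill, bfill and fillna above all return new DataFrames and leave df unchanged;
# assign the result to keep it, e.g. (a minimal sketch):
df_filled = df.fillna(value = 0)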
# Read the data
file0 = open(r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\sunspots.csv')
sunspots = pd.read_csv(file0, sep = ',')
sunspots.shape
sunspots.columns
# Outlier detection with the standard-deviation rule
sunspots.counts.median()
sunspots.counts.mode()
xbar = sunspots.counts.mean()
xstd = sunspots.counts.std()
# Number of observations more than two standard deviations from the mean
np.sum(sunspots.counts > xbar + 2 * xstd)
np.sum(sunspots.counts < xbar - 2 * xstd)
print('Standard-deviation rule, any value above the upper limit:\n', any(sunspots.counts > xbar + 2 * xstd))
print('Standard-deviation rule, any value below the lower limit:\n', any(sunspots.counts < xbar - 2 * xstd))
# Outlier detection with the boxplot (IQR) rule
Q1 = sunspots.counts.quantile(q = 0.25)
Q3 = sunspots.counts.quantile(q = 0.75)
IQR = Q3 - Q1
print('Boxplot rule, any value above the upper limit:\n', any(sunspots.counts > Q3 + 1.5 * IQR))
print('Boxplot rule, any value below the lower limit:\n', any(sunspots.counts < Q1 - 1.5 * IQR))
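# Number of observations flagged by the boxplot rule (a sketch):
print((sunspots.counts > Q3 + 1.5 * IQR).sum())
print((sunspots.counts < Q1 - 1.5 * IQR).sum())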
# Import the plotting module
import matplotlib.pyplot as plt
# Set the plotting style
plt.style.use('ggplot')
# Histogram (matplotlib's normed argument has been replaced by density)
sunspots.counts.plot(kind = 'hist', bins = 30, density = True)
# Kernel density estimate
sunspots.counts.plot(kind = 'kde')
# Show the figure
plt.show()
# Treat the outliers by replacement
print('Summary statistics before replacing the outliers:\n', sunspots.counts.describe())
# Upper limit of the boxplot rule
UL = Q3 + 1.5 * IQR
print('Upper critical value for flagging outliers:\n', UL)
# Largest value in the data that is below the upper limit
replace_value = sunspots.counts[sunspots.counts < UL].max()
print('Value used to replace the outliers:\n', replace_value)
# Replace the values above the upper limit (.loc avoids chained-assignment pitfalls)
sunspots.loc[sunspots.counts > UL, 'counts'] = replace_value
print('Summary statistics after replacing the outliers:\n', sunspots.counts.describe())
# Extracting subsets of the data
# Construct the data set
df1 = pd.DataFrame({'name':['张三','李四','王二','丁一','李五'],
                    'gender':['男','女','女','女','男'],
                    'age':[23,26,22,25,27]}, columns = ['name','gender','age'])
df1
# Take the middle three rows (i.e. all the women) and return the name and age columns
df1.iloc[1:4, [0,2]]
df1.loc[1:3, ['name','age']]
# df1.ix[1:3, [0,2]]  # the .ix accessor has been removed from pandas; use .loc or .iloc instead
# Use the employee names as the row labels
df2 = df1.set_index('name')
df2
# Take the middle three rows
df2.iloc[1:4, :]
df2.loc[['李四','王二','丁一'], :]
# df2.ix[1:4, :]  # .ix has been removed from pandas
# Use a filtering condition to take the names and ages of all the men
# df1.iloc[df1.gender == '男',]  # iloc does not accept a boolean Series; use loc instead
df1.loc[df1.gender == '男', ['name','age']]
# df1.ix[df1.gender == '男', ['name','age']]  # .ix has been removed from pandas
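# An equivalent selection with DataFrame.query (a sketch):
print(df1.query("gender == '男'")[['name', 'age']])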
# Read the data
file0 = open(r'C:\Users\wuxian\Desktop\personal_python\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\diamonds.csv')
diamonds = pd.read_csv(file0, sep = ',')
diamonds.columns
diamonds.info()
diamonds.head()
diamonds.price.mean()
# Mean price grouped by a single variable
pd.pivot_table(data = diamonds, index = 'color', values = 'price', margins = True, margins_name = 'Total')
# Contingency table for two grouping variables
# Import the numpy module
import numpy as np
pd.pivot_table(data = diamonds, index = 'clarity', columns = 'cut', values = 'carat',
               aggfunc = np.size, margins = True, margins_name = 'Total')
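# For pure counts, pd.crosstab builds the same contingency table more directly (a sketch):
print(pd.crosstab(diamonds.clarity, diamonds.cut, margins = True))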
# Construct the data sets df1 and df2
df1 = pd.DataFrame({'name':['张三','李四','王二'], 'age':[21,25,22], 'gender':['男','女','男']})
df2 = pd.DataFrame({'name':['丁一','赵五'], 'age':[23,22], 'gender':['女','女']})
# Vertical concatenation of the data sets
pd.concat([df1,df2], keys = ['df1','df2'])
# If the name variable in df2 is called Name instead
df2 = pd.DataFrame({'Name':['丁一','赵五'], 'age':[23,22], 'gender':['女','女']})
# Vertical concatenation again: the mismatched column names produce missing values
pd.concat([df1,df2], keys = ['df1','df2'])
# Construct the data sets
df3 = pd.DataFrame({'id':[1,2,3,4,5],'name':['张三','李四','王二','丁一','赵五'],'age':[27,24,25,23,25],'gender':['男','男','男','女','女']})
df4 = pd.DataFrame({'Id':[1,2,2,4,4,4,5],'kemu':['科目1','科目1','科目2','科目1','科目2','科目3','科目1'],'score':[83,81,87,75,86,74,88]})
df5 = pd.DataFrame({'id':[1,3,5],'name':['张三','王二','赵五'],'income':[13500,18000,15000]})
# Join the three tables
# First join df3 and df4
merge1 = pd.merge(left = df3, right = df4, how = 'left', left_on='id', right_on='Id')
merge1
# Then join the result with df5
merge2 = pd.merge(left = merge1, right = df5, how = 'left')
merge2
# Specify the grouping variables with groupby
grouped = diamonds.groupby(by = ['color','cut'])
# Aggregate the grouped data
result = grouped.aggregate({'color':np.size, 'carat':np.min, 'price':np.mean, 'x':np.max})
# Reorder the columns
result = pd.DataFrame(result, columns=['color','carat','price','x'])
# Rename the columns
result.rename(columns={'color':'counts','carat':'min_weight','price':'avg_price','x':'max_face_width'}, inplace=True)
# Turn the row index back into regular columns
result.reset_index(inplace=True)
result.columns
result.sort_values(by = ['cut'], ascending=False, inplace=True)
result
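# A modern alternative with named aggregation (pandas >= 0.25), shown as a sketch on the same
# diamonds data; it names the output columns directly and avoids the separate rename step:
result_named = (diamonds.groupby(['color', 'cut'])
                .agg(counts = ('carat', 'size'),
                     min_weight = ('carat', 'min'),
                     avg_price = ('price', 'mean'),
                     max_face_width = ('x', 'max'))
                .reset_index())
result_named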