一、飞机客户数据分析预测
1、读取数据
代码如下
import pandas as pd
# Build a per-column overview of the airline data set and save it as CSV.
datafile = "D:\\python_data\\air_data.csv"
resultfile = "D:\\python_data\\air_data_explore.csv"
data = pd.read_csv(datafile, encoding='utf-8')

# describe() over all columns with no extra percentiles requested; after the
# transpose each row of `explore` corresponds to one column of `data`.
explore = data.describe(include='all', percentiles=[]).transpose()

# 'count' is the number of non-null entries, so the difference to the row
# total is the number of missing values in that column.
explore['null'] = len(data) - explore['count']

# Keep only the three statistics of interest and give them Chinese headers.
explore = explore.loc[:, ['null', 'max', 'min']]
explore.columns = ['空值数', '最大值', '最小值']
explore.to_csv(resultfile)
2、绘制图像
各年份会员入会人数#直方图
代码如下
# Extract the year in which each member joined and plot the yearly counts.
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

# FFP_DATE strings are parsed with the 'YYYY/MM/DD' pattern; only the year
# component is kept for the histogram.
ffp = data['FFP_DATE'].apply(lambda x: datetime.strptime(x, '%Y/%m/%d'))
ffp_year = ffp.map(lambda x: x.year)

# Select the SimHei font before drawing so the Chinese labels can be rendered,
# and keep the minus sign displayable with that font.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(8, 5))
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数3129')
plt.show()
# Call close(); the bare name `plt.close` only referenced the function and
# never released the figure.
plt.close()
运行结果
会员性别比例#扇形图
代码如下
# Pie chart of the gender split among members.
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
# and is evaluated once instead of once per gender.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']

fig = plt.figure(figsize=(7, 4))
plt.pie([male, female], labels=['男', '女'],
        colors=['lightskyblue', 'lightcoral'], autopct='%1.1f%%')
plt.title('会员性别比例3129')
plt.show()
plt.close()
运行结果
会员各级别人数#柱状图
代码如下
# Bar chart of the number of members in tiers 4, 5 and 6.
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
# and is evaluated once instead of once per tier.
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]

fig = plt.figure(figsize=(8, 5))
plt.bar(x=range(3), height=[lv_four, lv_five, lv_six],
        width=0.4, alpha=0.8, color='skyblue')
# One tick per bar, labelled with the tier it represents.
plt.xticks(list(range(3)), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数3129')
plt.show()
plt.close()
运行结果
会员年龄分布#箱型图
代码如下
# Box plot of member ages; rows with a missing AGE are dropped first and the
# remaining values are cast to 64-bit integers.
age = data['AGE'].dropna().astype('int64')

fig = plt.figure(figsize=(5, 10))
plt.boxplot(
    age,
    boxprops=dict(facecolor='lightblue'),  # fill colour of the box
    labels=['会员年龄'],
    patch_artist=True,  # draw the box as a filled patch so facecolor applies
)
plt.title('会员年龄分布箱形图3129')
plt.grid(axis='y')
plt.show()
plt.close()
运行结果
会员最后乘机至结束时长分布#箱型图
代码如下
# Reload the airline data and select the three columns used by this and the
# following box plots.
datafile = "D:\\python_data\\air_data.csv"
resultfile = "D:\\python_data\\air_data_explore.csv"
data = pd.read_csv(datafile, encoding='utf-8')
lte = data['LAST_TO_END']
fc = data['FLIGHT_COUNT']
sks = data['SEG_KM_SUM']

# Select the SimHei font before any text is created so the Chinese title and
# tick label can be rendered.
plt.rcParams['font.sans-serif'] = 'SimHei'

# Box plot of the LAST_TO_END column.
fig = plt.figure(figsize=(5, 8))
plt.boxplot(lte,
            patch_artist=True,
            labels=['时长'],
            boxprops={'facecolor': 'lightblue'})
plt.title('会员最后乘机至结束时长分布箱型图3129')
plt.grid(axis='y')
plt.show()
# Call close(); the bare name `plt.close` never released the figure.
plt.close()
运行结果
会员飞行次数分布#箱型图
代码如下
# Box plot of `fc`, the FLIGHT_COUNT column selected earlier in the file.
fig = plt.figure(figsize=(5, 8))
plt.boxplot(fc,
            patch_artist=True,
            labels=['飞行次数'],
            boxprops={'facecolor': 'lightblue'})
plt.title('会员飞行次数分布箱型图3129')
plt.grid(axis='y')
plt.show()
# Call close(); the bare name `plt.close` never released the figure.
plt.close()
运行结果
客户飞行公里数#箱型图
代码如下
# Box plot of `sks`, the SEG_KM_SUM column selected earlier in the file.
fig = plt.figure(figsize=(5, 10))
plt.boxplot(sks,
            patch_artist=True,
            labels=['总飞行公里数'],
            boxprops={'facecolor': 'lightblue'})
plt.title('客户飞行公里数箱型图3129')
plt.grid(axis='y')
plt.show()
# Call close(); the bare name `plt.close` never released the figure.
plt.close()
运行结果
会员兑换积分次数分布#直方图
代码如下
# Histogram of how many times members exchanged points (EXCHANGE_COUNT),
# grouped into five bins.
ec = data['EXCHANGE_COUNT']
fig = plt.figure(figsize=(8, 5))
plt.hist(ec, bins=5, color='#0405aa')
plt.xlabel('兑换次数')
plt.ylabel('会员人数')
plt.title('会员兑换积分次数分布直方图3129')
plt.show()
# Call close(); the bare name `plt.close` never released the figure.
plt.close()
运行结果
客户总累计积分#箱型图
代码如下
# Box plot of the Points_Sum column (total accumulated points per member).
ps = data['Points_Sum']
fig = plt.figure(figsize=(5, 8))
plt.boxplot(ps,
            patch_artist=True,
            labels=['总累计积分'],
            boxprops={'facecolor': 'lightblue'})
plt.title('客户总累计积分箱型图3129')
plt.grid(axis='y')
plt.show()
# Call close(); the bare name `plt.close` never released the figure.
plt.close()
运行结果
3、相关矩阵及热力图
代码如下
# Pearson correlation matrix of selected member attributes, shown as a heat map.
import seaborn as sns

# Take an explicit copy: the two columns added below then go into an
# independent frame rather than a slice of `data`, which avoids pandas'
# SettingWithCopyWarning.
data_corr = data[['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END',
                  'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']].copy()

# Missing ages are replaced by 0 so the column can be cast to integers.
# NOTE(review): a 0 age enters the correlation as a real value -- confirm this
# is intended rather than dropping those rows.
age1 = data['AGE'].fillna(0)
data_corr['AGE'] = age1.astype('int64')
# `ffp_year` is the joining year computed earlier in the file.
data_corr['ffp_year'] = ffp_year

dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n', dt_corr)

plt.subplots(figsize=(10, 10))
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.title('3129')
plt.show()
# Call close(); the bare name `plt.close` never released the figure.
plt.close()
运行结果
客户分群#雷达图
代码如下
# Radar chart comparing the cluster centres on the five features in `labels`.
# NOTE(review): `cluster_center` is not defined in the visible part of the
# file; this assumes it is a DataFrame of cluster centres with a 'ZL' column,
# one row per cluster -- confirm against the clustering step.
# `%matplotlib inline` is a Jupyter magic, not Python; run it in a notebook
# cell if inline rendering is needed.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

labels = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
legen = ['客户群' + str(i + 1) for i in cluster_center.index]  # legend entry per cluster
lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']
kinds = list(cluster_center.iloc[:, 0])

# A radar polygon must end where it starts, so the 'ZL' column is appended
# again before converting the centres to a NumPy array.
cluster_center = pd.concat([cluster_center, cluster_center[['ZL']]], axis=1)
centers = np.array(cluster_center.iloc[:, 0:])

# Split the circle into one angle per feature and repeat the first angle so
# the angle vector closes the same way as the data.
n = len(labels)
angle = np.linspace(0, 2 * np.pi, n, endpoint=False)
angle = np.concatenate((angle, [angle[0]]))

# Font setup so Chinese legend text and minus signs can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, polar=True)

# One closed line per cluster, each with its own line style.
for i, kind in enumerate(kinds):
    ax.plot(angle, centers[i], linestyle=lstype[i], linewidth=2, label=kind)

# Label the axes with the five distinct angles only: `angle` holds a sixth,
# repeated entry, and the number of tick positions must equal the number of
# labels.
ax.set_thetagrids(angle[:-1] * 180 / np.pi, labels)
plt.legend(legen)
plt.show()
# Call close(); the bare name `plt.close` never released the figure.
plt.close()
运行结果
二、电信客户流失分析预测
代码1:读取并简单分析数据
plt.rc("font", family="SimHei", size="12")  # SimHei font so Chinese text can be displayed
# Raw string: the backslashes of the Windows path stay literal, so "\p" and
# "\d" are no longer parsed as (invalid) escape sequences.
data = pd.read_csv(r"D:\python_data\dianxin_kehuliushi.csv")  # load the data set
data.shape  # size of the data set as (rows, columns)
(7043, 21)
data.head() # preview the first rows of the data set
运行结果
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
data.describe() # descriptive statistics (count, mean, std, min, quartiles, max)
| SeniorCitizen | tenure | MonthlyCharges |
count | 7043.000000 | 7043.000000 | 7043.000000 |
mean | 0.162147 | 32.371149 | 64.761692 |
std | 0.368612 | 24.559481 | 30.090047 |
min | 0.000000 | 0.000000 | 18.250000 |
25% | 0.000000 | 9.000000 | 35.500000 |
50% | 0.000000 | 29.000000 | 70.350000 |
75% | 0.000000 | 55.000000 | 89.850000 |
max | 1.000000 | 72.000000 | 118.750000 |
代码2:客户流失数据分析
data['Churn'].value_counts() # number of customers per Churn value (class balance; this is not a missing-value check)
No 5174
Yes 1869
Name: Churn, dtype: int64
#数据集中有5174名用户没流失,有1869名客户流失,数据集不均衡。
data.dtypes # show the data type of every column
customerID object
gender object
SeniorCitizen int64
Partner object
Dependents object
tenure int64
PhoneService object
MultipleLines object
InternetService object
OnlineSecurity object
OnlineBackup object
DeviceProtection object
TechSupport object
StreamingTV object
StreamingMovies object
Contract object
PaperlessBilling object
PaymentMethod object
MonthlyCharges float64
TotalCharges object
Churn object
dtype: object
# TotalCharges (the total amount charged) is read as object dtype and has to
# be converted to float.
# errors="coerce" turns entries that cannot be parsed into NaN. The previous
# errors="ignore" returned the column unchanged, so it stayed object.
# NOTE(review): the describe() output captured below (top value blank,
# freq 11) predates this fix; those blank entries presumably become NaN and
# still need to be dropped or filled -- confirm.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].describe()
count 7043
unique 6531
top
freq 11
Name: TotalCharges, dtype: object
# Encode the target column: 'Yes' -> 1 and 'No' -> 0, to ease later processing.
# The mapped Series is assigned back to the frame. The previous chained
# `data['Churn'].replace(..., inplace=True)` calls modified an intermediate
# object, which pandas warns about and which does not update `data` under
# Copy-on-Write.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})
data['Churn'].describe()
count 7043.000000
mean 0.265370
std 0.441561
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000
Name: Churn, dtype: float64
data.info() # overview of the columns: non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 gender 7043 non-null object
2 SeniorCitizen 7043 non-null int64
3 Partner 7043 non-null object
4 Dependents 7043 non-null object
5 tenure 7043 non-null int64
6 PhoneService 7043 non-null object
7 MultipleLines 7043 non-null object
8 InternetService 7043 non-null object
9 OnlineSecurity 7043 non-null object
10 OnlineBackup 7043 non-null object
11 DeviceProtection 7043 non-null object
12 TechSupport 7043 non-null object
13 StreamingTV 7043 non-null object
14 StreamingMovies 7043 non-null object
15 Contract 7043 non-null object
16 PaperlessBilling 7043 non-null object
17 PaymentMethod 7043 non-null object
18 MonthlyCharges 7043 non-null float64
19 TotalCharges 7043 non-null object
20 Churn 7043 non-null int64
dtypes: float64(1), int64(3), object(17)
memory usage: 1.1+ MB
#数据预览后可以看到,各列均不存在空值(non-null),并且许多特征的数据类型为pandas的object(字符串)类型。
代码3:绘制电信客户性别饼图和绘制客户流失情况饼图
# Font setup so Chinese text and minus signs can be rendered.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False  # a real boolean, not the string 'False'

# Number of customers per gender. The two lookups were previously swapped
# ('Female' stored in `male` and vice versa), which mislabelled the slices.
gender_counts = data['gender'].value_counts()
male = gender_counts['Male']
female = gender_counts['Female']

# Pie chart of the gender split.
fig = plt.figure(figsize=(10, 6))
plt.pie([male, female], labels=['男', '女'],
        colors=['lightskyblue', 'lightcoral'], autopct='%1.1f%%')
plt.title('电信用户性别比例3129', fontsize=15)
plt.show()
plt.close()

# Pie chart of the Churn distribution. These statements were fused onto the
# preceding lines in the original, which made the snippet a syntax error.
churnvalue = data["Churn"].value_counts()
labels = churnvalue.index
plt.figure(figsize=(6, 6))
plt.pie(churnvalue, labels=labels, colors=["blue", "yellow"],
        explode=(0.1, 0), autopct='%1.1f', shadow=True)
plt.title('客户流失情况饼图3129', fontsize=15)
plt.show()  # was a bare `plt.show`, which never displayed the figure
#由图中结果可以看出,流失客户占整体客户的26.5%。
代码4:客户流失影响直方图
# Churn counts broken down by gender, senior-citizen status, partner and
# dependents, drawn as a 2x2 grid of count plots.
# Each plt.subplot(...) call now sits on its own line; in the original they
# were fused onto the preceding plt.title(...) lines, which is a syntax error.
plt.figure(figsize=(10, 10))

plt.subplot(2, 2, 1)
gender = sns.countplot(x='gender', hue='Churn', data=data, palette='Set2')  # palette selects the colour scheme
plt.xlabel('性别')
plt.title('不同性别客户流失直方图3129', fontsize=15)

plt.subplot(2, 2, 2)
seniorcitizen = sns.countplot(x='SeniorCitizen', hue='Churn', data=data, palette='Set2')
plt.xlabel('老年人')
plt.title('老年人客户流失直方图3129', fontsize=15)

plt.subplot(2, 2, 3)
partner = sns.countplot(x='Partner', hue='Churn', data=data, palette='Set2')
plt.xlabel('配偶')
plt.title('是否有配偶客户流失直方图3129', fontsize=15)

plt.subplot(2, 2, 4)
dependents = sns.countplot(x='Dependents', hue='Churn', data=data, palette='Set2')
plt.xlabel('亲属')
plt.title('亲属客户流失直方图3129', fontsize=15)

plt.show()