一、飞机客户数据分析预测

1、读取数据

代码如下

import pandas as pd

# Source dataset and the destination of the exploration summary.
datafile = "D:\\python_data\\air_data.csv"
resultfile = "D:\\python_data\\air_data_explore.csv"

data = pd.read_csv(datafile, encoding='utf-8')

# Transposed describe(): one row per source column, statistics as columns.
explore = data.describe(percentiles=[], include='all').T
# 'count' is the number of non-null entries, so the remainder is the null count.
explore['null'] = len(data) - explore['count']

# Keep only the three figures of interest and give them the report headers.
explore = explore[['null', 'max', 'min']].rename(
    columns={'null': u'空值数', 'max': u'最大值', 'min': u'最小值'}
)
explore.to_csv(resultfile)

2、绘制图像

各年份会员入会人数#直方图

代码如下

# Extract the year in which each member joined and plot a histogram of it.
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

# Vectorised parse of the 'YYYY/MM/DD' join dates (replaces a per-row strptime).
ffp = pd.to_datetime(data['FFP_DATE'], format='%Y/%m/%d')
ffp_year = ffp.dt.year

# Font settings so that the Chinese labels and minus signs render.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(8, 5))
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数3129')
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_数据

会员性别比例#扇形图

代码如下

# Pie chart of the member gender split.
# Count once with Series.value_counts(); the top-level pd.value_counts is deprecated.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']

fig = plt.figure(figsize=(7, 4))
plt.pie(
    [male, female],
    labels=['男', '女'],
    colors=['lightskyblue', 'lightcoral'],
    autopct='%1.1f%%',
)
plt.title('会员性别比例3129')
plt.show()
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_python_02

会员各级别人数#条形图

代码如下

# Bar chart of the number of members in tiers 4, 5 and 6.
# Count once with Series.value_counts(); the top-level pd.value_counts is deprecated.
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]

fig = plt.figure(figsize=(8, 5))
plt.bar(x=range(3), height=[lv_four, lv_five, lv_six], width=0.4, alpha=0.8, color='skyblue')
plt.xticks(range(3), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数3129')
plt.show()
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_数据挖掘作业_03

会员年龄分布#箱型图

代码如下

# Box plot of member ages; rows without an age are left out.
age = data['AGE'].dropna().astype('int64')

fig = plt.figure(figsize=(5, 10))
box_style = {'facecolor': 'lightblue'}
plt.boxplot(age, patch_artist=True, labels=['会员年龄'], boxprops=box_style)
plt.title('会员年龄分布箱形图3129')
plt.grid(axis='y')
plt.show()
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_数据_04

会员最后乘机至结束时长分布#箱型图

代码如下

# Reload the dataset and pull out the three columns plotted in this and the
# following cells.
datafile = "D:\\python_data\\air_data.csv"
resultfile = "D:\\python_data\\air_data_explore.csv"
data = pd.read_csv(datafile, encoding='utf-8')
lte = data['LAST_TO_END']
fc = data['FLIGHT_COUNT']
sks = data['SEG_KM_SUM']

# Configure the font before any text is created.
plt.rcParams['font.sans-serif'] = 'SimHei'

# Box plot of the LAST_TO_END column.
fig = plt.figure(figsize=(5, 8))
plt.boxplot(lte,
            patch_artist=True,
            labels=['时长'],
            boxprops={'facecolor': 'lightblue'})
plt.title('会员最后乘机至结束时长分布箱型图3129')
plt.grid(axis='y')
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_数据挖掘作业_05

会员飞行次数分布#箱型图

代码如下

# Box plot of the FLIGHT_COUNT column (held in `fc`).
fig = plt.figure(figsize=(5, 8))
plt.boxplot(fc,
            patch_artist=True,
            labels=['飞行次数'],
            boxprops={'facecolor': 'lightblue'})
plt.title('会员飞行次数分布箱型图3129')
plt.grid(axis='y')
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_数据挖掘作业_06

客户飞行公里数#箱型图

代码如下

# Box plot of the SEG_KM_SUM column (held in `sks`).
fig = plt.figure(figsize=(5, 10))
plt.boxplot(sks,
            patch_artist=True,
            labels=['总飞行公里数'],
            boxprops={'facecolor': 'lightblue'})
plt.title('客户飞行公里数箱型图3129')
plt.grid(axis='y')
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

 

运行结果

数据挖掘作业 数据挖掘作业代码_直方图_07

会员兑换积分次数分布#直方图

代码如下

# Histogram of the EXCHANGE_COUNT column, split into five bins.
ec = data['EXCHANGE_COUNT']

fig = plt.figure(figsize=(8, 5))
plt.hist(ec, bins=5, color='#0405aa')
plt.xlabel('兑换次数')
plt.ylabel('会员人数')
plt.title('会员兑换积分次数分布直方图3129')
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_数据挖掘作业_08

客户总累计积分#箱型图

代码如下

# Box plot of the Points_Sum column.
ps = data['Points_Sum']

fig = plt.figure(figsize=(5, 8))
plt.boxplot(ps,
            patch_artist=True,
            labels=['总累计积分'],
            boxprops={'facecolor': 'lightblue'})
plt.title('客户总累计积分箱型图3129')
plt.grid(axis='y')
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_直方图_09

3、相关矩阵及热力图

代码如下

# Pearson correlation matrix of selected attributes, drawn as a heat map.
# .copy() makes data_corr an independent frame, so the column assignments
# below do not write into a slice of `data`.
data_corr = data[['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END',
                  'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']].copy()
# NOTE(review): missing ages are replaced by 0 before correlating, which
# affects the AGE coefficients - confirm this is intended.
age1 = data['AGE'].fillna(0)
data_corr['AGE'] = age1.astype('int64')
data_corr['ffp_year'] = ffp_year
dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n', dt_corr)

import seaborn as sns
plt.subplots(figsize=(10, 10))
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.title('3129')
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_数据_10

 

 

 

数据挖掘作业 数据挖掘作业代码_python_11

客户分群#雷达图

代码如下

# Radar chart of the cluster centres, one closed polygon per customer group.
# In a Jupyter notebook, run the `%matplotlib inline` magic in its own cell;
# as an IPython magic it is not valid Python syntax.
import numpy as np
import matplotlib.pyplot as plt

labels = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
legen = ['客户群' + str(i + 1) for i in cluster_center.index]  # legend entry per group
lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']
kinds = list(cluster_center.iloc[:, 0])

# A radar polygon must end where it starts, so the ZL column is appended once
# more. This is done on an array copy rather than by rebinding cluster_center,
# so re-running the cell does not keep adding columns.
centers = np.concatenate(
    (np.array(cluster_center), np.array(cluster_center[['ZL']])), axis=1
)

# Split the circle into one direction per attribute, then repeat the first
# angle so the plotted line closes.
n = len(labels)
tick_angle = np.linspace(0, 2 * np.pi, n, endpoint=False)
angle = np.concatenate((tick_angle, [tick_angle[0]]))

# Font settings so that the Chinese legend and minus signs render.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, polar=True)

# One line per cluster centre, each with its own line style.
for center, style, kind in zip(centers, lstype, kinds):
    ax.plot(angle, center, linestyle=style, linewidth=2, label=kind)

# Label only the n distinct directions; passing the repeated closing angle
# would give one more tick position than there are labels.
ax.set_thetagrids(tick_angle * 180 / np.pi, labels)
plt.legend(legen)
plt.show()
# close must be *called*; the bare attribute reference left the figure open.
plt.close()

运行结果

数据挖掘作业 数据挖掘作业代码_python_12

 

二、电信客户流失分析预测

代码1:读取并简单分析数据

plt.rc("font", family="SimHei", size="12")  # font setting so Chinese text can be displayed
# Raw string: "\p" and "\d" are invalid escape sequences in a normal literal
# (a warning on recent Python versions); the resulting path is unchanged.
data = pd.read_csv(r"D:\python_data\dianxin_kehuliushi.csv")  # load the dataset

data.shape # show the dataset dimensions as (rows, columns)

(7043, 21)

# Preview the first rows of the table.
data.head()

运行结果

 

customerID

gender

SeniorCitizen

Partner

Dependents

tenure

PhoneService

MultipleLines

InternetService

OnlineSecurity

...

DeviceProtection

TechSupport

StreamingTV

StreamingMovies

Contract

PaperlessBilling

PaymentMethod

MonthlyCharges

TotalCharges

Churn

0

7590-VHVEG

Female

0

Yes

No

1

No

No phone service

DSL

No

...

No

No

No

No

Month-to-month

Yes

Electronic check

29.85

29.85

No

1

5575-GNVDE

Male

0

No

No

34

Yes

No

DSL

Yes

...

Yes

No

No

No

One year

No

Mailed check

56.95

1889.5

No

2

3668-QPYBK

Male

0

No

No

2

Yes

No

DSL

Yes

...

No

No

No

No

Month-to-month

Yes

Mailed check

53.85

108.15

Yes

3

7795-CFOCW

Male

0

No

No

45

No

No phone service

DSL

Yes

...

Yes

Yes

No

No

One year

No

Bank transfer (automatic)

42.30

1840.75

No

4

9237-HQITU

Female

0

No

No

2

Yes

No

Fiber optic

No

...

No

No

No

No

Month-to-month

Yes

Electronic check

70.70

151.65

Yes

5 rows × 21 columns

data.describe()  # summary statistics (count, mean, std, quartiles) of the numeric columns

 

SeniorCitizen

tenure

MonthlyCharges

count

7043.000000

7043.000000

7043.000000

mean

0.162147

32.371149

64.761692

std

0.368612

24.559481

30.090047

min

0.000000

0.000000

18.250000

25%

0.000000

9.000000

35.500000

50%

0.000000

29.000000

70.350000

75%

0.000000

55.000000

89.850000

max

1.000000

72.000000

118.750000

代码2:客户流失数据分析

data['Churn'].value_counts()  # number of customers per Churn label (class balance); this does not look for missing values


No     5174
Yes    1869
Name: Churn, dtype: int64

#数据集中有5174名用户没流失,有1869名客户流失,数据集不均衡。

data.dtypes  # list the dtype of every column

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
# TotalCharges (total fee) was read as object dtype and has to become float.
# errors="coerce" turns entries that cannot be parsed (such as blank strings)
# into NaN; the previous element-wise errors="ignore" call left the column as
# object dtype, so no conversion took effect.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors="coerce")
data['TotalCharges'].describe()
count     7043
unique    6531
top           
freq        11
Name: TotalCharges, dtype: object

# Encode the Churn label as an integer: 'Yes' -> 1, 'No' -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection, which is chained
# assignment and does not reliably update `data`.
# The 1/0 keys keep the cell safe to re-run on already-encoded data.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
data['Churn'].describe()
count    7043.000000
mean        0.265370
std         0.441561
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Churn, dtype: float64

data.info()  # per-column non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   int64  
dtypes: float64(1), int64(3), object(17)
memory usage: 1.1+ MB

#在数据预览过后,info() 显示各列均无 NaN 缺失值;但 TotalCharges 列中有 11 个空白字符串(见上文 describe 输出中 top 为空、freq 为 11),它们实际上属于缺失值,需要在类型转换时处理。此外,许多特征的数据类型均为 pandas 的 object 类型。

 代码3:绘制电信客户性别饼图和绘制客户流失情况饼图

# Font settings so that the Chinese labels and minus signs render.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

# Number of customers per gender. The counts are listed in the same order as
# the pie labels below ('男' = Male, '女' = Female); they were swapped before.
gender_counts = data['gender'].value_counts()
male = gender_counts['Male']
female = gender_counts['Female']

# Pie chart of the gender split.
fig = plt.figure(figsize=(10, 6))
plt.pie([male, female], labels=['男', '女'], colors=['lightskyblue', 'lightcoral'], autopct='%1.1f%%')
plt.title('电信用户性别比例3129', fontsize=15)
plt.show()
plt.close()

# Pie chart of the churn split; the first wedge is pulled out of the pie.
churnvalue = data["Churn"].value_counts()
labels = churnvalue.index
plt.figure(figsize=(6, 6))
plt.pie(churnvalue, labels=labels, colors=["blue", "yellow"], explode=(0.1, 0), autopct='%1.1f', shadow=True)
plt.title('客户流失情况饼图3129', fontsize=15)
# show must be *called*; the bare attribute reference displayed nothing.
plt.show()

数据挖掘作业 数据挖掘作业代码_python_13

#由图中结果可以看出,流失客户占整体客户的26.5%。

代码4:客户流失影响直方图

# Churn counts broken down by gender, senior-citizen flag, partner and
# dependents, drawn as a 2x2 grid of count plots.
plt.figure(figsize=(10, 10))

plt.subplot(2, 2, 1)
# palette selects the colour scheme used for the hue groups (here 'Set2').
gender = sns.countplot(x='gender', hue='Churn', data=data, palette='Set2')
plt.xlabel('性别')
plt.title('不同性别客户流失直方图3129', fontsize=15)

plt.subplot(2, 2, 2)
seniorcitizen = sns.countplot(x='SeniorCitizen', hue='Churn', data=data, palette='Set2')
plt.xlabel('老年人')
plt.title('老年人客户流失直方图3129', fontsize=15)

plt.subplot(2, 2, 3)
partner = sns.countplot(x='Partner', hue='Churn', data=data, palette='Set2')
plt.xlabel('配偶')
plt.title('是否有配偶客户流失直方图3129', fontsize=15)

plt.subplot(2, 2, 4)
dependents = sns.countplot(x='Dependents', hue='Churn', data=data, palette='Set2')
plt.xlabel('亲属')
plt.title('亲属客户流失直方图3129', fontsize=15)

plt.show()