使用英国某零售平台交易数据

# 查看特征和类型
# 发票编号、商品名称、简介、数量、日期、单价、用户ID、国家
DESC uk

mydql做数据分析的步骤 mysql数据分析实战_中位数

# 查看前5行
SELECT * FROM uk LIMIT 5;

mydql做数据分析的步骤 mysql数据分析实战_中位数_02


1 缺失值检测

# 总行数、各字段的行数(是否存在缺失值)
SELECT COUNT(*),COUNT(invoiceno),COUNT(stockcode),COUNT(description),
COUNT(quantity),COUNT(invoicedate),COUNT(unitprice),COUNT(customerid),
COUNT(country) FROM uk;
# 共541909行,其中发票编号、描述、customerid存在缺失

2 异常值检测
数值型的数量和单价可能存在异常

【数量和单价】
SELECT MAX(quantity),MIN(quantity),MAX(unitprice),MIN(unitprice)
FROM uk;

SELECT COUNT(*) FROM uk WHERE quantity<=0  # 10000多条
SELECT COUNT(*) FROM uk WHERE unitprice<=0  # 2条异常
# 去掉2条小于0的,将10000条的变为正数

【样本时间范围】-2010-12-01到2011-12-09
SELECT MAX(invoicedate),MIN(invoicedate) FROM uk;

【每月记录数】-没有很少的
SELECT YEAR(invoicedate),MONTH(invoicedate),COUNT(*)
FROM uk
GROUP BY YEAR(invoicedate),MONTH(invoicedate);

3 重复值检测-536641,541909存在重复值

SELECT COUNT(*) FROM (SELECT DISTINCT * FROM uk) t;
SELECT COUNT(*) FROM uk;

4 处理缺失、异常、重复

# quantity<0的变为正
UPDATE uk
SET quantity=quantity*(-1)
WHERE quantity<0

# 删除customerid为空,unitprice<0,quantity=0
DELETE FROM uk 
WHERE CustomerID IS NULL
OR unitprice<=0 OR quantity=0

# 再次查看是否缺失,总记录406789,invoiceNo397884条
SELECT COUNT(invoiceno),COUNT(stockcode),COUNT(description),
COUNT(quantity),COUNT(invoicedate),COUNT(unitprice),COUNT(customerid),
COUNT(country) FROM uk

# 再次查看是否存在重复值-存在
SELECT COUNT(*) FROM (SELECT DISTINCT * FROM uk) t;
SELECT COUNT(*) FROM uk;

# 建新表,去重,删去原表
CREATE TABLE uk2 AS
SELECT DISTINCT * FROM uk;
DROP TABLE uk;

# invoiceNo缺失比例计算-各月比例大致相同,直接删除对订单月分布影响不大
SELECT YEAR(invoicedate) `年`,MONTH(invoicedate) `月`,
SUM(InvoiceNo IS NULL) `缺失数`,COUNT(*) `记录数`,ROUND(SUM(InvoiceNo IS NULL)/COUNT(*),2) `缺失比例`
FROM uk2
GROUP BY YEAR(invoicedate),MONTH(invoicedate)

# 删除缺失
DELETE FROM uk2 WHERE Invoiceno IS NULL;

交易总体状况

SELECT COUNT(DISTINCT invoiceno) `总订单量`,ROUND(SUM(unitprice*quantity),2) `总订单金额`,
COUNT(DISTINCT customerid) `消费者数量`,SUM(quantity) `商品总销量`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT customerid),2) `客单价`,
ROUND(SUM(quantity)/COUNT(DISTINCT invoiceno),2) `连带率`
FROM uk2;

mydql做数据分析的步骤 mysql数据分析实战_时间间隔_03

# 每月订单数、订单金额、消费者数量、商品总量
SELECT DATE_FORMAT(invoicedate,'%Y年%m月') `月份`,
COUNT(DISTINCT invoiceno) `订单数`,
ROUND(SUM(unitprice*quantity),2) `订单金额`,
COUNT(DISTINCT customerid) `消费者数量`,
SUM(quantity) `出售商品总量`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT customerid),2) `客单价`,
ROUND(SUM(quantity)/COUNT(DISTINCT customerid),2) `连带率`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT invoiceno),2) `每单平均金额`,
ROUND(SUM(quantity)/COUNT(DISTINCT invoiceno)) `每单平均数量`
FROM uk2
GROUP BY DATE_FORMAT(invoicedate,'%Y年%m月')
# 10月、11月是订单高峰期

mydql做数据分析的步骤 mysql数据分析实战_时间间隔_04

一、订单量分布

按交易地区

订单top5的国家

SELECT country,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),3) `比例`
FROM uk2 GROUP BY country ORDER BY `订单数` DESC LIMIT 5;

mydql做数据分析的步骤 mysql数据分析实战_中位数_05

按交易时间

一个月内各天订单数及比例

SELECT DAY(invoicedate) `日`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),2) `比例`
FROM uk2
GROUP BY DAY(invoicedate)
ORDER BY `日`;
# 月末订单明显少

mydql做数据分析的步骤 mysql数据分析实战_缺失值_06


星期内订单数及比例

SELECT DAYOFWEEK(invoicedate) `星期`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),2) `比例`
FROM uk2
GROUP BY DAYOFWEEK(invoicedate)
ORDER BY `星期`;  
# 1表示星期日,周六不营业?周四订单量最大

mydql做数据分析的步骤 mysql数据分析实战_时间间隔_07


每小时内订单数及比例

SELECT CONCAT(HOUR(invoicedate),'-',HOUR(invoicedate)+1) `时间`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoicedate) FROM uk2),3) `比例`
FROM uk2
GROUP BY CONCAT(HOUR(invoicedate),'-',HOUR(invoicedate)+1)
ORDER BY HOUR(invoicedate);
# 10到16是集中时间

mydql做数据分析的步骤 mysql数据分析实战_中位数_08

每单金额、数量及商品单价分布

每单消费金额分布

SELECT `金额`,COUNT(*) `数量`
FROM (
    SELECT (CASE 
    WHEN amount<100 THEN '0-100'
    WHEN amount BETWEEN 100 AND 1000 THEN '100-1000'
    WHEN amount BETWEEN 1001 AND 2000 THEN '1000-2000'
    WHEN amount BETWEEN 2001 AND 3000 THEN '2000-3000'
    WHEN amount BETWEEN 3001 AND 4000 THEN '3000-4000'
    WHEN amount BETWEEN 4001 AND 5000 THEN '4000-5000'
    WHEN amount BETWEEN 5001 AND 6000 THEN '5000-6000'
    ELSE '6000以上' END) `金额`
    FROM
	(
	    SELECT invoiceno,SUM(unitprice*quantity) amount
	    FROM uk2 GROUP BY invoiceno
	) temp
) t
GROUP BY `金额` ;

mydql做数据分析的步骤 mysql数据分析实战_缺失值_09

商品单价分布

SELECT `价格`,COUNT(*) `数量`
FROM (
	SELECT CASE
	WHEN unitprice<1 THEN '0-1'
	WHEN unitprice BETWEEN 2 AND 5 THEN '2-5'
	WHEN unitprice BETWEEN 6 AND 10 THEN '6-10'
	WHEN unitprice BETWEEN 11 AND 20 THEN '11-20'
	WHEN unitprice BETWEEN 21 AND 30 THEN '21-30'
	WHEN unitprice BETWEEN 31 AND 50 THEN '31-50'
	WHEN unitprice BETWEEN 51 AND 100 THEN '51-100'
	WHEN unitprice BETWEEN 101 AND 300 THEN '101-300'
	WHEN unitprice BETWEEN 301 AND 500 THEN '301-500'
	WHEN unitprice BETWEEN 501 AND 1000 THEN '501-1000'
	ELSE '1000以上' END `价格`
	FROM uk2
	) temp
GROUP BY `价格`

mydql做数据分析的步骤 mysql数据分析实战_时间间隔_10

# 购买量top20的单价
SELECT unitprice,COUNT(*)`频率` FROM uk2 GROUP BY unitprice ORDER BY `频率` DESC LIMIT 20;

mydql做数据分析的步骤 mysql数据分析实战_mydql做数据分析的步骤_11

3 订购量分布

SELECT `订购量`,COUNT(*) '数量'
FROM
(
    SELECT (CASE 
    WHEN amount<100 THEN '0-100'
    WHEN amount BETWEEN 101 AND 500 THEN '100-500'
    WHEN amount BETWEEN 501 AND 1000 THEN '500-1000'
    WHEN amount BETWEEN 1001 AND 2000 THEN '1000-2000'
    WHEN amount BETWEEN 2001 AND 3000 THEN '2000-3000'
    ELSE '3000以上' END)`订购量`
    FROM
	(
	    SELECT invoiceno,SUM(quantity) amount
	    FROM uk2 GROUP BY invoiceno
	    ) temp
	) t
GROUP BY `订购量`;

mydql做数据分析的步骤 mysql数据分析实战_mydql做数据分析的步骤_12

每单数量的中位数(四分位数)

SELECT AVG(amount) `中位数`
FROM (
	SELECT @index:=@index+1 AS num, amount
	FROM (
	    SELECT invoiceno,SUM(quantity) amount
	    FROM uk2 GROUP BY invoiceno ORDER BY amount
	    ) temp,(SELECT @index:=0) ti
	) t
WHERE num=FLOOR((@index+1)/2) OR num=CEIL((@index+1)/2)
# @index是最后一个index,也就是最大的值
# 下四分位:where num=FLOOR((@index+1)/2),select amount
# 上四分位数:where num=FLOOR(3*(@index+1)/2),select amount

二、每月留存率

2010/12月的消费者为初始,每月的新增消费者在以后月份的留存情况

CREATE VIEW customer_orders AS 
SELECT CustomerID,
IF(SUM(YEAR(invoicedate)='2010')>0,1,0) '2010/12',
IF(SUM(MONTH(invoicedate)='1')>0,1,0) '2011/01',
IF(SUM(MONTH(invoicedate)='2')>0,1,0) '2011/02',
IF(SUM(MONTH(invoicedate)='3')>0,1,0) '2011/03',
IF(SUM(MONTH(invoicedate)='4')>0,1,0) '2011/04',
IF(SUM(MONTH(invoicedate)='5')>0,1,0) '2011/05',
IF(SUM(MONTH(invoicedate)='6')>0,1,0) '2011/06',
IF(SUM(MONTH(invoicedate)='7')>0,1,0) '2011/07',
IF(SUM(MONTH(invoicedate)='8')>0,1,0) '2011/08',
IF(SUM(MONTH(invoicedate)='9')>0,1,0) '2011/09',
IF(SUM(MONTH(invoicedate)='10')>0,1,0) '2011/10',
IF(SUM(MONTH(invoicedate)='11')>0,1,0) '2011/11',
IF(SUM(DATE_FORMAT(invoicedate,'%Y/%m')='2011/12')>0,1,0) '2011/12'
FROM uk2
GROUP BY CustomerID;

SELECT * FROM customer_orders;
# 1表示消费了,0表示没有消费

mydql做数据分析的步骤 mysql数据分析实战_中位数_13


5个月后的留存

CREATE TABLE retention AS 
SELECT '2010/12' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/01`=1) `+1`,
SUM(`2011/01`+`2011/02`=2) `+2`,SUM(`2011/01`+`2011/02`+`2011/03`=3) `+3`,
SUM(`2011/01`+`2011/02`+`2011/03`+`2011/04`=4)`+4`,SUM(`2011/01`+`2011/02`+`2011/03`+`2011/04`+`2011/05`=5)`+5`
FROM  customer_orders WHERE `2010/12`=1
UNION
SELECT '2011/01' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/02`=1) `+1`,
SUM(`2011/02`+`2011/03`=2) `+2`,SUM(`2011/02`+`2011/03`+`2011/04`=3) `+3`,
SUM(`2011/02`+`2011/03`+`2011/04`+`2011/05`=4)`+4`,SUM(`2011/02`+`2011/03`+`2011/04`+`2011/05`+`2011/06`=5)`+5`
FROM  customer_orders WHERE `2010/12`=0 AND `2011/01`=1
UNION
SELECT '2011/02' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/03`=1) `+1`,
SUM(`2011/03`+`2011/04`=2) `+2`,SUM(`2011/03`+`2011/04`+`2011/05`=3) `+3`,
SUM(`2011/03`+`2011/04`+`2011/05`+`2011/06`=4)`+4`,SUM(`2011/03`+`2011/04`+`2011/05`+`2011/06`+`2011/07`=5)`+5`
FROM  customer_orders WHERE `2010/12`=`2011/01`=0 AND `2011/02`=1
UNION
SELECT '2011/03' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/04`=1) `+1`,
SUM(`2011/04`+`2011/05`=2) `+2`,SUM(`2011/04`+`2011/05`+`2011/06`=3) `+3`,
SUM(`2011/04`+`2011/05`+`2011/06`+`2011/07`=4)`+4`,SUM(`2011/04`+`2011/05`+`2011/06`+`2011/07`+`2011/08`=5)`+5`
FROM  customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=0 AND `2011/03`=1
UNION
SELECT '2011/04' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/02`=5) `+1`,
SUM(`2011/05`+`2011/06`=2) `+2`,SUM(`2011/05`+`2011/06`+`2011/07`=3) `+3`,
SUM(`2011/05`+`2011/06`+`2011/07`+`2011/08`=4)`+4`,SUM(`2011/05`+`2011/06`+`2011/07`+`2011/08`+`2011/09`=5)`+5`
FROM  customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=`2011/03`=0 AND `2011/04`=1
UNION
SELECT '2011/05' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/06`=1) `+1`,
SUM(`2011/06`+`2011/07`=2) `+2`,SUM(`2011/06`+`2011/07`+`2011/08`=3) `+3`,
SUM(`2011/06`+`2011/07`+`2011/08`+`2011/09`=4)`+4`,SUM(`2011/06`+`2011/07`+`2011/08`+`2011/09`+`2011/10`=5)`+5`
FROM  customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=`2011/03`=`2011/04`=0 AND `2011/05`=1
UNION
SELECT '2011/06' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/07`=1) `+1`,
SUM(`2011/07`+`2011/08`=2) `+2`,SUM(`2011/07`+`2011/08`+`2011/09`=3) `+3`,
SUM(`2011/07`+`2011/08`+`2011/09`+`2011/10`=4)`+4`,SUM(`2011/07`+`2011/08`+`2011/09`+`2011/10`+`2011/11`=5)`+5`
FROM  customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=`2011/03`=`2011/04`=`2011/05`=0 AND `2011/06`=1

SELECT * FROM retention;

mydql做数据分析的步骤 mysql数据分析实战_缺失值_14

三、复购率

购买次数及其人数

SELECT times `购买次数`,COUNT(*) `人数`
FROM
(
SELECT customerid,COUNT(DISTINCT invoiceno) times
FROM uk2 GROUP BY customerid
) temp
GROUP BY times ORDER BY times;

mydql做数据分析的步骤 mysql数据分析实战_mydql做数据分析的步骤_15

复购率

SELECT ROUND(COUNT(customerid)/ (SELECT COUNT(DISTINCT customerid) FROM uk2),2) `复购率`
FROM
(SELECT customerid FROM uk2 GROUP BY customerid
HAVING COUNT(DISTINCT invoiceno)>1) fugou

mydql做数据分析的步骤 mysql数据分析实战_缺失值_16

复购的间隔

CREATE TABLE fugou AS  # 生成复购表
SELECT temp.*,@num:=IF(customerid<>@c,1,IF(mydate>@date,@num+1,@num)) num,
@c:=customerid,@date:=mydate
FROM (
SELECT uk2.customerid,DATE(invoicedate) mydate
FROM uk2 JOIN (
    SELECT customerid FROM uk2
    GROUP BY customerid
    HAVING COUNT(DISTINCT DATE(invoicedate))>1 # 一张发票也许对应多个时间,但只要在一天就视为一个订单
    )fugou 
ON uk2.`CustomerID`=fugou.customerid
ORDER BY uk2.customerid,mydate
) temp,(SELECT @num:=0,@c:=' ',@date:='2000-01-01') t
GROUP BY customerid,mydate  # 每个顾客每个订单上有多个产品,需要分组
ORDER BY customerid,mydate;

SELECT * FROM fugou;

mydql做数据分析的步骤 mysql数据分析实战_缺失值_17

每个消费者的平均复购时间间隔

SELECT f1.customerid,MAX(f1.num) `下单次数`,1*AVG(DATEDIFF(f1.mydate,f2.mydate)) `平均间隔天数`
FROM fugou f1,fugou f2
WHERE f1.customerid=f2.customerid
AND f1.num-f2.num=1  # 本次,上一次
GROUP BY f1.customerid
ORDER BY `下单次数` DESC,`平均间隔天数` DESC

mydql做数据分析的步骤 mysql数据分析实战_时间间隔_18

新客户首日购买后,第二天(n日内)依旧购买的比例

SELECT ROUND(COUNT(DISTINCT uk2.customerid)/(SELECT COUNT(DISTINCT customerid) FROM uk2),2) `第二天回购比例`
FROM uk2 JOIN (SELECT customerid,MIN(invoicedate) first_buy FROM uk2 GROUP BY customerid) t
ON uk2.`CustomerID`=t.customerid
AND DATEDIFF(uk2.`InvoiceDate`,first_buy)=1  # <n
# DATEDIFF(uk2.`InvoiceDate`,first_buy)<30 AND  uk2.`InvoiceDate`<>first_buy  30天内回购比例:0.22

mydql做数据分析的步骤 mysql数据分析实战_时间间隔_19

2010年12月每日新用户数量,及其在一个月内的复购人数、复购比例

SELECT fd,COUNT(DISTINCT t.customerid) `新用户数`,COUNT(DISTINCT uk2.`CustomerID`) `30天内复购人数`,
ROUND(COUNT(DISTINCT uk2.`CustomerID`)/COUNT(DISTINCT t.customerid),2)`30天内复购比例`
FROM
uk2 RIGHT JOIN
(
SELECT customerid,MIN(DATE(invoicedate)) fd
FROM uk2 WHERE YEAR(invoicedate)=2010 GROUP BY customerid
) t
ON uk2.`CustomerID`=t.customerid
AND DATEDIFF(DATE(uk2.`InvoiceDate`),fd)<=30 AND DATE(uk2.`InvoiceDate`)<>fd
GROUP BY fd;
# 第二个and条件必须使用DATE函数变为日期,t表中是日期类型的

mydql做数据分析的步骤 mysql数据分析实战_缺失值_20

新老顾客标签

CREATE TABLE customer_type AS
SELECT uk2.customerid,DATE(uk2.`InvoiceDate`) d,
(CASE
WHEN fugou.`customerid` IS NULL THEN '新客户'
WHEN fugou.num=1 THEN '新客户'
ELSE '老客户' END) `客户类型`
FROM uk2 LEFT JOIN fugou ON uk2.`CustomerID`=fugou.`customerid`
AND DATE(uk2.`InvoiceDate`)=fugou.`mydate`;

SELECT DATE_FORMAT(d,'%Y-%m') `月份`,`客户类型`,COUNT(*) n
FROM customer_type
GROUP BY DATE_FORMAT(d,'%Y-%m'),`客户类型`

mydql做数据分析的步骤 mysql数据分析实战_时间间隔_21

四、RFM模型

# 设现在时间的2011-12-09
CREATE VIEW RFM AS
SELECT customerid,DATEDIFF('2011-12-09',MAX(invoicedate)) R,
COUNT(DISTINCT invoiceno) F,
ROUND(SUM(quantity*unitprice),2) M
FROM uk2
GROUP BY customerid;

SELECT * FROM RFM LIMIT 5

mydql做数据分析的步骤 mysql数据分析实战_mydql做数据分析的步骤_22


根据均值划分高低两类,客户标签可能不准确。

SELECT *,CASE
WHEN R_score=1 AND F_score=1 AND M_score=1 THEN '重要价值客户'
WHEN R_score=1 AND F_score=1 AND M_score=0 THEN '重要保持客户'
WHEN R_score=0 AND F_score=1 AND M_score=1 THEN '重要挽留客户'
WHEN R_score=1 AND F_score=0 AND M_score=1 THEN '重要价值客户'
WHEN R_score=1 AND F_score=1 AND M_score=0 THEN '一般价值客户'
WHEN R_score=1 AND F_score=0 AND M_score=0 THEN '一般保持客户'
WHEN R_score=0 AND F_score=1 AND M_score=0 THEN '一般客户'
ELSE '流失客户' END `客户类型`
FROM
(
SELECT *,IF(R>(SELECT AVG(R) FROM RFM),0,1) R_score,
IF(F>(SELECT AVG(F) FROM RFM),1,0) F_score,
IF(M>(SELECT AVG(M) FROM RFM),1,0) M_score
FROM RFM
) temp;

mydql做数据分析的步骤 mysql数据分析实战_缺失值_23