目的
数据导入数据库
数据颜色清洗
机身颜色分析
套餐类型分析
实现
一 商品评价信息导入mysql
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import pymysql
# 连接database
conn = pymysql.connect(
host="127.0.0.1",
user="root",
password="123456",
database="test",
charset="utf8mb4")
# 获取一个可以执行SQL语句的光标对象
cursor = conn.cursor()
# 将结果作为字典返回的游标
cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
# 创建评价表(id,网络类型,机身颜色,套餐类型,存储容量,版本类型,评价内容,评价时间)
sql = """
CREATE TABLE if not exists phone_assess(
id INT auto_increment PRIMARY KEY ,
Network_type varchar(30) ,
body_color varchar(30) NOT NULL ,
package_type TEXT NOT NULL,
storage_capacity varchar(30) NOT NULL,
version_type varchar(30) NOT NULL,
evaluation_content Text NOT Null,
evaluation_time Date
)ENGINE=innodb DEFAULT CHARSET=utf8mb4;
"""
cursor.execute(sql) # 执行SQL语句
def execute_sql():
f = open('../file/CSV/手机_评价.csv', 'r', encoding='UTF-8', )
# 跳过第一行
next(f)
line = f.readlines() # 读取一行文件,包括换行符
for i in line:
k = i.split(',')
print((k[0],k[1],k[2],k[3],k[4],k[5]),k[6])
try:
cursor.execute(
"insert into phone_assess"
"(Network_type,body_color,package_type,storage_capacity,version_type,evaluation_content, evaluation_time)"
" values(%s,%s,%s,%s,%s,%s,%s)",
(k[0],k[1],k[2],k[3],k[4],k[5],k[6]))
conn.commit()
except Exception as e:
print(e)
# finally:
# pass
print('正常数据导入操作,请等候(预计10s)')
execute_sql()
cursor.close() # 关闭光标对象
conn.close() # 关闭数据库连接
print('导入操作完成')
商品信息导入Mysql
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymysql
# 连接database
conn = pymysql.connect(
host="127.0.0.1",
user ="root",
password ="123456",
database ="test",
charset ="utf8mb4")
# 获取一个可以执行SQL语句的光标对象
cursor = conn.cursor()
# 将结果作为字典返回的游标
cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
sql = """
CREATE TABLE if not exists phone(
id INT auto_increment PRIMARY KEY ,
shop_name text ,
shop_link varchar(100) NOT NULL ,
title TEXT NOT NULL,
price int NOT NULL,
sale int NOT NULL default 0,
province char(3) NOT Null
)ENGINE=innodb DEFAULT CHARSET=utf8mb4;
"""
cursor.execute(sql) # 执行SQL语句
import re
def execute_sql():
f = open('../file/CSV/智能手机-all-bak.csv', 'r', encoding='UTF-8', )
#跳过第一行
next(f)
line = f.readlines() # 读取一行文件,包括换行符
for i in line:
k = i.split(',')
#print(k[0], k[1], k[2],k[3],k[4],k[5])
if k[4]=='':#销量为空时默认为0
k[4]=0
try:
cursor.execute("insert into phone(shop_name,shop_link,title,price,sale,province) values(%s,%s,%s,%s,%s,%s)",
(k[0], k[1], k[2], k[3], k[4], k[5]))
conn.commit()
except Exception as e:
print(e)
# finally:
# pass
print('正常进行数据导入操作,请等候(预计10s)')
execute_sql()
cursor.close() # 关闭光标对象
conn.close() # 关闭数据库连接
print('导入操作完成')
三 机身颜色清洗规则
#alter table 表名 add column 列名 varchar(20) not null after user1;#增加列表
#delete from phone_assess where size like '%/%';#删除语句
#update phone_assess set body_color = '黑色系' where body_color like '%黑%' ;#更新语句
#常见颜色清洗
update phone_assess set body_color = '红色系' where body_color like '%赤%' ;
update phone_assess set body_color = '红色系' where body_color like '%红%' ;
update phone_assess set body_color = '橙色系' where body_color like '%橙%' ;
update phone_assess set body_color = '黄色系' where body_color like '%黄%' ;
update phone_assess set body_color = '绿色系' where body_color like '%绿%' ;
update phone_assess set body_color = '青色系' where body_color like '%青%' ;
update phone_assess set body_color = '蓝色系' where body_color like '%蓝%' ;
update phone_assess set body_color = '紫色系' where body_color like '%紫%' ;
update phone_assess set body_color = '黑色系' where body_color like '%黑%' ;
update phone_assess set body_color = '白色系' where body_color like '%白%' ;
update phone_assess set body_color = '灰色系' where body_color like '%灰%' ;
update phone_assess set body_color = '黄色系' where body_color like '%金%';
update phone_assess set body_color = '白色系' where body_color like '%银%' ;
update phone_assess set body_color = '红色系' where body_color like '%粉%' ;
update phone_assess set body_color = '黄色系' where body_color like '%棕%' ;
update phone_assess set body_color = '黄色系' where body_color like '%橘%';
update phone_assess set body_color = '白色系' where body_color like '%透%';
update phone_assess set body_color = '黑色系' where body_color like '%曜%';
update phone_assess set body_color = '黑色系' where body_color like '%夜%';
update phone_assess set body_color = '绿色系' where body_color like '%森%';
update phone_assess set body_color = '绿色系' where body_color like '%碧%';
update phone_assess set body_color = '绿色系' where body_color like '%翠%';
update phone_assess set body_color = '绿色系' where body_color like '%林%';
update phone_assess set body_color = '青色系' where body_color like '%幻%';
#杂色系处理
update phone_assess set body_color = '其他系' where body_color like '%亚%';
update phone_assess set body_color = '其他系' where body_color like '%深%';
update phone_assess set body_color = '其他系' where body_color like '%液%';
update phone_assess set body_color = '其他系' where body_color like '%牛%';
update phone_assess set body_color = '其他系' where body_color like '%珊%';
update phone_assess set body_color = '其他系' where body_color like '%天%';
update phone_assess set body_color = '其他系' where body_color like '%空%';
update phone_assess set body_color = '其他系' where body_color like '%樱%';
update phone_assess set body_color = '其他系' where body_color like '%云%';
update phone_assess set body_color = '其他系' where body_color like '%极%';
update phone_assess set body_color = '其他系' where body_color like '%之%';
update phone_assess set body_color = '其他系' where body_color like '%斯%';
update phone_assess set body_color = '其他系' where body_color like '%礼%';
四 颜色清洗
import pymysql
# 连接database
conn = pymysql.connect(
host="127.0.0.1",
user ="root",
password ="123456",
database ="test",
charset ="utf8mb4")
# 获取一个可以执行SQL语句的光标对象
cursor = conn.cursor()
# 将结果作为字典返回的游标
cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
def data_wash():
f = open('../file/TXT/数据清洗.txt', 'r', encoding='UTF-8')
line = f.readlines() # 读取一行文件,包括换行符
for i in line:
try:
cursor.execute(i)
conn.commit()
except Exception as e:
print(e)
print('开始数据清洗(5s)')
data_wash()
print('数据清洗完成')
五 颜色销量关系
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
from sqlalchemy import create_engine
#------------------------------------------------------------------------------
# 第一步:连接数据库
# - 初始化数据库连接,使用pymysql模块
# - MySQL的用户:root, 密码:123456, 端口:3306,数据库:test
# - 查询语句,选出phone_assess表中的所有数据
# - read_sql_query的两个参数: sql语句, 数据库连接
# - 按照颜色分组统计数据
# - 数据转字典,获取每个色系及统计数量
#------------------------------------------------------------------------------
engine = create_engine('mysql+mysqlconnector://root:123456@localhost:3306/test')
sql = ''' select id,body_color from phone_assess; '''
colors = pd.read_sql(sql, engine)
group_Count = colors.groupby('body_color').count()
color_sum=group_Count.to_dict()['id']
#------------------------------------------------------------------------------
# 第二步:绘制柱状图
#------------------------------------------------------------------------------
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号
index=list(color_sum.keys())#获取字典的键值对的前键
data=list(color_sum.values())#获取字典的键值对的后键(值)
# 绘图
plt.figure(figsize=(10,4))
plt.bar(index,data)
plt.xticks(rotation=-0) # X轴标签旋转,避免重叠
plt.grid(linestyle='-.')
plt.ylabel('销售量')
plt.title('机身色系销量关系图')
for a, b in zip(list(index), list(data)):# 显示数字
plt.text(a,b,b, ha='center', va='bottom', size=10) #x轴,y轴,显示数值,水平居中,垂直底部,字体大小
plt.grid(linestyle='-.')#显示栅格线
plt.savefig('../file/PNG/21机身颜色销量关系图.png')#保存图片
plt.show()#显示图片
六 套餐类型
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
from sqlalchemy import create_engine
#------------------------------------------------------------------------------
# 第一步:连接数据库
# - 初始化数据库连接,使用pymysql模块
# - MySQL的用户:root, 密码:123456, 端口:3306,数据库:test
# - 查询语句,选出phone_assess表中的所有数据
# - read_sql_query的两个参数: sql语句, 数据库连接
# - 按照颜色分组统计数据
# - 数据转字典,获取每个色系及统计数量
#------------------------------------------------------------------------------
engine = create_engine('mysql+mysqlconnector://root:123456@localhost:3306/test')
sql = ''' select id,package_type from phone_assess; '''
colors = pd.read_sql(sql, engine)
group_Count = colors.groupby('package_type').count()
color_sum=group_Count.to_dict()['id']
#------------------------------------------------------------------------------
# 第二步:绘制柱状图
#------------------------------------------------------------------------------
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号
index=list(color_sum.keys())#获取字典的键值对的前键
data=list(color_sum.values())#获取字典的键值对的后键(值)
# 绘图
plt.figure(figsize=(10,4))
plt.bar(index,data)
plt.xticks(rotation=-0) # X轴标签旋转,避免重叠
plt.grid(linestyle='-.')
plt.ylabel('销售量')
plt.title('套餐类型销量关系图')
for a, b in zip(list(index), list(data)):# 显示数字
plt.text(a,b,b, ha='center', va='bottom', size=10) #x轴,y轴,显示数值,水平居中,垂直底部,字体大小
plt.grid(linestyle='-.')#显示栅格线
plt.savefig('../file/PNG/22套餐类型销量关系图.png')#保存图片
plt.show()#显示图片