# Part 1: Crawler (一、爬虫部分)
# -*- coding:utf-8 -*-
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib import parse
import time
import pymysql
# Run Chrome in headless mode so the crawler needs no visible browser window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# Pass the options through `options=`: the older `chrome_options=` keyword was
# deprecated by Selenium and later removed.
browser = webdriver.Chrome(options=chrome_options)
# Shared explicit wait with a 10-second timeout, used by parse_page().
wait = WebDriverWait(browser, 10)
def get_url(n, word, pinpai):
    """Build the JD search URL for result page ``n``.

    Args:
        n: 1-based number of the result page to crawl.
        word: Search keyword.
        pinpai: Brand name used for the ``ev=exbrand_<brand>`` filter.

    Returns:
        The complete search URL as a string.
    """
    print('正在爬取第' + str(n) + '页')
    # Result page n maps to the request parameter page = 2n - 1.
    query = {
        'keyword': word,
        'ev': 'exbrand_%s' % pinpai,
        'enc': 'utf-8',
        'page': 2 * n - 1,
    }
    # Encode every parameter (not only the keyword) so that a brand containing
    # spaces, '&', '#' or non-ASCII characters cannot corrupt the query string.
    url = 'https://search.jd.com/Search?' + parse.urlencode(query)
    print(url)
    return url
def parse_page(url, pinpai):
    """Load one search-result page and store every product found on it.

    Each product is passed to ``save_to_mysql`` and printed. A product whose
    markup lacks one of the expected nodes is reported and skipped, so a
    single malformed listing does not abort the whole crawl.

    Args:
        url: Search-result URL to open in the shared ``browser``.
        pinpai: Brand label stored together with every product of this page.
    """
    print('爬取信息并保存中...')
    browser.get(url)
    # Scroll down slowly (100 steps of 100px, 0.1s apart) to trigger the
    # ajax loading of the rest of the list.
    for _ in range(100):
        browser.execute_script('window.scrollBy(0,100)')
        time.sleep(0.1)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList .gl-item')))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    # Every product is an <li class="gl-item"> element.
    for good in soup.find_all('li', class_="gl-item"):
        try:
            product = _extract_product(good, pinpai)
        except (AttributeError, KeyError, TypeError) as error:
            # A missing tag shows up as attribute access / indexing on None.
            print('商品解析失败,已跳过:', error)
            continue
        save_to_mysql(product)
        print(product)


def _extract_product(good, pinpai):
    """Return the row tuple (sku, brand, name, price, store, comments, image, link) for one product element."""
    num = good['data-sku']
    money = good.find('div', class_="p-price").strong.i.string
    commit = good.find('div', class_="p-commit").strong.a.string
    name = good.find('div', class_="p-name p-name-type-2").a.em
    img_box = good.find('div', class_="p-img")
    image = img_box.a.img.get('src')
    detail_addr = img_box.find('a')['href']
    # Some products have no shop name; fall back to a placeholder instead of
    # failing on the missing node.
    store = good.find('div', class_="p-shop").span
    if store is not None and store.a is not None:
        new_store = store.a.string
    else:
        new_store = '没有找到店铺 - -!'
    # The title is split across several text nodes; concatenate them.
    new_name = ''.join(name.strings)
    return (num, pinpai, new_name, money, new_store, commit, image, detail_addr)
def save_to_mysql(result):
    """Insert one product row into the ``information`` table of database ``jd``.

    Prints a success or failure message; a database error is rolled back and
    reported instead of being raised.

    Args:
        result: Sequence whose first eight items are, in order, the values for
            info_num, info_brand, info_name, info_money, info_store,
            info_commit, info_image and info_detail.
    """
    # Keyword arguments: recent PyMySQL releases no longer accept positional
    # connection parameters.
    db = pymysql.connect(host="localhost", user="root", password="", database="jd")
    # Placeholders let the driver escape the scraped values, so quotes in a
    # product name can neither break the statement nor inject SQL.
    sql = (
        "INSERT INTO information"
        "(info_num,info_brand,info_name,info_money,"
        "info_store,info_commit,info_image,info_detail) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
    )
    try:
        cursor = db.cursor()
        cursor.execute(sql, tuple(result[:8]))
        db.commit()
        print('保存成功!')
    except pymysql.MySQLError as error:
        # Undo the partial transaction and report why the insert failed.
        db.rollback()
        print('保存失败!', error)
    finally:
        # Always release the connection, even when an unexpected error escapes.
        db.close()
def main():
    """Prompt for keyword, brand and page count, then crawl the requested pages.

    The three prompts are repeated until the page count lies in 1-100. Any
    exception raised while prompting or crawling is printed rather than
    propagated, and the browser session is shut down in every case.
    """
    try:
        # Re-prompt in a loop instead of calling main() recursively.
        while True:
            word = input('请输入你想要爬取的商品:')
            pinpai = input('请输入你想要爬取的品牌:')
            pages = int(input('请输入你想要抓取的页数(范围是1-100):'))
            # JD serves at most 100 result pages.
            if 1 <= pages <= 100:
                break
            print('请重新输入!')
        for n in range(1, pages + 1):
            url = get_url(n, word, pinpai)
            parse_page(url, pinpai)
        print('爬取完毕!')
    except Exception as error:
        print('出现异常!', error)
    finally:
        # quit() ends the whole driver session; doing it in `finally` also
        # cleans up when the crawl fails part-way.
        browser.quit()


if __name__ == '__main__':
    main()
# Part 2: Visualization and analysis (二、可视化分析部分)
import pandas as pd
import numpy as np
import matplotlib
import re
import matplotlib.pyplot as plt
# Raw data: read columns 1-5 of the GBK-encoded CSV file (first row is the header).
data = pd.read_csv("information.csv", header=0, encoding="gbk", usecols=[1, 2, 3, 4, 5])
print("Number of samples: %d" % len(data))
# fillna() returns a new frame; assign it back so missing values really become 0.
data = data.fillna(0)
#数据预处理
def type(x):
    """Classify a product title as an accessory, a second-hand phone or a new phone.

    NOTE: the name shadows the builtin ``type``; it is kept because the rest
    of the script refers to this function by that name.

    Args:
        x: Product title (string).

    Returns:
        ``'配件-<keyword>'`` for the first accessory keyword found in the
        title, otherwise ``'二手手机'`` if the title contains ``'二手'``,
        otherwise ``'新手机'``.
    """
    accessory_words = ['移动电源', '充电宝', '数据线', '音箱', '麦克风',
                       '耳机', '手机壳', '钢化膜', '保护', '支架']
    # Test every keyword before falling through; returning from inside the
    # first iteration would only ever check the first keyword.
    for element in accessory_words:
        if element in x:
            return '配件-' + element
    if '二手' in x:
        return '二手手机'
    return '新手机'
def type2(x):
    """Derive the phone category from keywords in a product title.

    Args:
        x: Product title (string).

    Returns:
        ``'<keyword>手机'`` for the first of 老人 / 学生 / 商务 / 5G / 智能
        that occurs in the title, or ``'智能手机'`` when none occurs.
    """
    category_words = ['老人', '学生', '商务', '5G', '智能']
    # Scan all keywords; the default applies only after none of them matched.
    for element in category_words:
        if element in x:
            return element + '手机'
    return '智能手机'
def trans(c):
    """Normalize a comment-count label such as ``'1.2万+'`` to a digit string.

    Args:
        c: Comment-count label. A ``'+'`` suffix and the unit ``'万'``
            (x 10000) are understood. Non-string values are converted with
            ``str()`` first, so a column that was parsed as numbers is
            accepted as well.

    Returns:
        The label without ``'+'``; when it contained ``'万'`` the number is
        multiplied by 10000 and returned as an integer string.
    """
    c = str(c).replace('+', '')
    if '万' in c:
        # round() rather than int(): float('1.13') * 10000 evaluates to
        # 11299.999999999998, which truncation would turn into 11299.
        c = str(round(float(c.replace('万', '')) * 10000))
    return c
def check_contain_eng(check_str):
    """Clean a brand name: drop ``(``, ``)``, ``-`` and, for Chinese names, Latin letters.

    Args:
        check_str: Brand name (string).

    Returns:
        The brand with the three punctuation characters removed. If the
        result contains at least one character in U+4E00-U+9FFF, ASCII
        letters are removed as well (e.g. ``'华为(HUAWEI)'`` -> ``'华为'``).
    """
    for symbol in '()-':
        check_str = check_str.replace(symbol, '')
    # Test each character: comparing the whole string against the range is a
    # lexicographic comparison and only looks at the leading character.
    if any('\u4e00' <= ch <= '\u9fff' for ch in check_str):
        check_str = re.sub('[a-zA-Z]', '', check_str)
    return str(check_str)
# Derive the product category from the title.
data["商品类型"] = data["info_name"].apply(type)
# Turn labels such as '1.2万+' into digit strings, then convert the whole
# column to numbers in a single vectorized call.
data["info_commit"] = pd.to_numeric(data["info_commit"].apply(trans))
data["info_brand"] = data["info_brand"].apply(check_contain_eng)
# Derive the phone category from the title.
data["手机类型"] = data["info_name"].apply(type2)
# fillna() returns a new frame; the result must be assigned to take effect.
data = data.fillna(0)
# Convert price and comment count to integers, then split by product category.
data['info_money'] = data['info_money'].astype(int)
data['info_commit'] = data['info_commit'].astype(int)
# .copy() makes the subsets independent frames, so columns can be assigned on
# them later without pandas' chained-assignment warning.
data1 = data[data['商品类型'] == '新手机'].copy()
data2 = data[data['商品类型'] == '二手手机'].copy()
# Share of the total comment count per store (new phones only).
font = {
    'family': 'SimHei',
    'size': 25,
}
matplotlib.rc('font', **font)
plt.rcParams['figure.figsize'] = (20.0, 20.0)
# Aggregate with .sum() directly: passing a {name: func} dict to a
# single-column groupby raises SpecificationError since pandas 1.0.
gb1 = data1.groupby(by=['info_store'], as_index=False)['info_commit'].sum()
# Only stores with more than 2,000,000 comments are drawn.
g1 = gb1[gb1['info_commit'] > 2000000]
plt.pie(g1['info_commit'], labels=g1['info_store'], autopct='%0.1f%%')
plt.title('不同店铺销量分析')
plt.legend(loc='lower right')
plt.show()
# Average selling price per store (new phones only).
# .mean() replaces the dict-renaming agg call, which raises
# SpecificationError since pandas 1.0.
gb1 = data1.groupby(by=['info_store'], as_index=False)['info_money'].mean()
# Only stores whose average price exceeds 8000 are drawn.
g1 = gb1[gb1['info_money'] > 8000]
index = np.arange(g1['info_store'].size)
# Single-letter colour codes are case-sensitive in current Matplotlib: 'r', not 'R'.
plt.barh(index, g1['info_money'], height=0.5, color='r')
plt.yticks(index, g1['info_store'])
plt.xlabel('售价8000元以上店铺平均售价分析')
plt.ylabel('商品售价')
# Show this chart now; otherwise the next plot is drawn onto the same figure.
plt.show()
# Number of buyers (comment count) per price range.
# Work on an independent copy so the column assignments below do not write
# into a filtered view of `data`.
data1 = data1.copy()
data1['info_money'] = data1['info_money'].astype(int)
# Open-ended outer edges classify the rows exactly like min-1 / max+1 did, but
# stay strictly increasing even when every price is above 500 or below 5000,
# and do not fail on empty data.
bins = [-np.inf, 500, 1000, 3000, 5000, np.inf]
labels = ['500及以下', '500到1000', '1000到3000', '3000到5000', '5000以上']
价格分层 = pd.cut(data1['info_money'], bins, labels=labels)
# From here on info_money holds the price-range label instead of the price.
data1['info_money'] = 价格分层
# .sum() replaces the dict-renaming agg call, which raises
# SpecificationError since pandas 1.0.
gb1 = data1.groupby(by=['info_money'], as_index=False)['info_commit'].sum()
plt.pie(gb1['info_commit'], labels=gb1['info_money'], autopct='%.2f%%')
plt.title('不同价格区间购买人数百分比')
plt.show()
# Average selling price of brands whose average is above 3000 (new phones only).
data1 = data[data['商品类型'] == '新手机']
# .mean() replaces the dict-renaming agg call, which raises
# SpecificationError since pandas 1.0.
gb1 = data1.groupby(by=['info_brand'], as_index=False)['info_money'].mean()
g1 = gb1[gb1['info_money'] > 3000]
index = np.arange(g1['info_brand'].size)
# Single-letter colour codes are case-sensitive in current Matplotlib: 'r', not 'R'.
plt.bar(index, g1['info_money'], width=0.35, color='r')
plt.xticks(index, g1['info_brand'])
plt.xlabel('均价3000元以上手机品牌')
plt.ylabel('商品售价')
# Show this chart before starting the next one so the bars do not overlap.
plt.show()
# Average selling price of brands whose average lies strictly between 1000 and 3000.
g2 = gb1[(gb1['info_money'] > 1000) & (gb1['info_money'] < 3000)]
index = np.arange(g2['info_brand'].size)
plt.bar(index, g2['info_money'], width=0.35, color='r')
plt.xticks(index, g2['info_brand'])
plt.xlabel('均价1000-3000元手机品牌')
plt.ylabel('商品售价')
plt.show()
# Scatter plot: relationship between product price and number of buyers.
# Restricted to new phones priced below 15000.
data3 = data[(data['info_money'] < 15000) & (data['商品类型'] == '新手机')]
price_axis = data3['info_money']
buyers_axis = data3['info_commit']
plt.plot(price_axis, buyers_axis, '.', color='blue')
plt.title('商品价格和购买人数关系')
plt.xlabel('商品售价')
plt.ylabel('购买人数')
plt.grid(True)
plt.show()
# Average price per phone category (new phones only).
data1 = data[data['商品类型'] == '新手机']
# .mean() / .sum() replace the dict-renaming agg calls, which raise
# SpecificationError since pandas 1.0.
gb1 = data1.groupby(by=['手机类型'], as_index=False)['info_money'].mean()
index = np.arange(gb1['手机类型'].size)
# Single-letter colour codes are case-sensitive in current Matplotlib: 'r', not 'R'.
plt.bar(index, gb1['info_money'], width=0.35, color='r')
plt.xticks(index, gb1['手机类型'])
plt.xlabel('手机类型')
plt.ylabel('商品平均售价')
# Show the bar chart before drawing the pie so they do not share one figure.
plt.show()
# Share of buyers (comment count) per phone category.
gb1 = data1.groupby(by=['手机类型'], as_index=False)['info_commit'].sum()
# The autopct format string was missing its closing quote (a syntax error).
plt.pie(gb1['info_commit'], labels=gb1['手机类型'], autopct='%0.1f%%')
plt.title('不同手机类型购买人数占比')
plt.legend(loc='lower right')
plt.show()