python爬虫—孔夫子旧书网数据可视化分析

转载

mb5fed6fc050005 2021-09-04 18:40:00

文章标签 firefox html 词云数据图例 文章分类 Python 后端开发

一、选题背景

现如今，购买书籍的渠道有很多，例如京东、淘宝、天猫、当当网、闲鱼……我此次的选题是二手旧书（期刊类）数据的爬取与可视化分析。

二、网络爬虫设计方案

爬虫名称：孔夫子旧书网期刊数据爬取

内容：通过爬虫程序爬取期刊旧书价格，然后进行数据可视化分析。

方案描述：

1、使用 requests 库发送请求访问网页

2、解析网页，爬取数据。这里采用 lxml 的 etree 配合 xpath

3、数据保存，使用 open() 以追加方式写入 Kongfuzi.csv 文件

三、结构特征分析

结构特征：内容导航型

python爬虫—孔夫子旧书网数据可视化分析_数据

结构分析及查找方法：

# Fields extracted for each listing: title (bookname), publisher (publishing_house),
# delivery rate (delivery), price (price), time put on shelf (bookTime_on_shelf)
# and shop name (bookShop).
# Every expression targets the `count`-th <div> under the element whose id is
# "listBox" and ends in text(), so each call selects text nodes of that listing.
# `html` (the parsed page) and `count` (the listing position) are supplied by the
# surrounding crawler code.
bookname = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[1]/a/text()'.format(count))
publishing_house = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[2]/div[1]/div/span[2]/text()'.format(count))
delivery = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[3]/div[2]/span[2]/i/text()'.format(count))
price = html.xpath('//*[@id="listBox"]/div[{}]/div[3]/div[1]/div[2]/span[2]/text()'.format(count))
bookTime_on_shelf = html.xpath('//*[@id="listBox"]/div[{}]/div[3]/div[4]/span[1]/text()'.format(count))
bookShop = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[3]/div[1]/div[3]/a/text()'.format(count))

遍历：

            # Visit up to 50 listing slots on the page; `count` (initialised by the
            # enclosing code) is the 1-based position substituted into each XPath.
            # Each field is queried, then the short inner loop rebinds the name to
            # every match in turn, so the LAST match is what remains afterwards.
            # NOTE(review): if a query matches nothing the inner loop never runs and
            # the name stays bound to the query result itself (presumably an empty
            # list) instead of a string - confirm callers cope with that.
            # NOTE(review): the inner loops reuse the name `i` of the outer loop; the
            # outer iteration still runs 50 times because `i` is never read there.
            for i in range(50):
                bookname = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[1]/a/text()'.format(count))
                for i in bookname:
                    bookname = i
                publishing_house = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[2]/div[1]/div/span[2]/text()'.format(count))
                for i in publishing_house:
                    publishing_house = i
                delivery = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[3]/div[2]/span[2]/i/text()'.format(count))
                for i in delivery:
                    # Remove leading/trailing "%" characters from the delivery-rate text.
                    delivery = i.strip("%")
                price = html.xpath('//*[@id="listBox"]/div[{}]/div[3]/div[1]/div[2]/span[2]/text()'.format(count))
                for i in price:
                    price = i
                bookTime_on_shelf = html.xpath('//*[@id="listBox"]/div[{}]/div[3]/div[4]/span[1]/text()'.format(count))
                for i in bookTime_on_shelf:
                    bookTime_on_shelf = i
                bookShop = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[3]/div[1]/div[3]/a/text()'.format(count))
                for i in bookShop:
                    bookShop = i
                # Advance to the next listing position.
                count += 1

四、网络爬虫设计

数据爬取与采集

代码分析：

 1 import time
 2 import random
 3 import requests
 4 from lxml import etree
 5 import sys
 6 import re
 7 
 8 
 9 USER_AGENTS = [
10                 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
11                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
12                 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
13                 'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
14                 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
15                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
16                 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
17                 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
18                 'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0',
19                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0',
20                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0',
21                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0',
22                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
23                 'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0',
24                 'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0',
25                 'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
26                 'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0',
27                 'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0',
28                 'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0',
29                 'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0',
30                 'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0',
31                 'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0',
32                 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1',
33                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0',
34                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
35                 ]
36 headers = {
37     'User-Agent':random.choice(USER_AGENTS),
38     'Connection':'keep-alive',
39     'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
40     }
41 # 创建Kongfuzi.csv
42 file = open("Kongfuzi.csv", "a")
43 file.write("bookname" + "," + "publishing_house"  + "," + "price" +  "," + "bookTime_on_shelf" +  "," + "bookShop" + '\n')
44 file = file.close()
45 
46 def Kongfuzi(keyword):
47     try:
48         for i in range(0,keyword):
49             url = "https://book.kongfz.com/Cqikan/cat_10002w{}".format(str(i))
50             req = requests.get(url=url,headers=headers)
51             # print(req.text)
52             html = etree.HTML(req.text)
53             count = 1
54 
55             #书名bookname、出版社publishing_house、发货率delivery、价格price、上架时间bookTime_on_shelf、书店bookShop
56             for i in range(50):
57                 bookname = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[1]/a/text()'.format(count))
58                 for i in bookname:
59                     bookname = i
60                 publishing_house = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[2]/div[1]/div/span[2]/text()'.format(count))
61                 for i in publishing_house:
62                     publishing_house = i
63                 delivery = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[3]/div[2]/span[2]/i/text()'.format(count))
64                 for i in delivery:
65                     delivery = i.strip("%")
66                 price = html.xpath('//*[@id="listBox"]/div[{}]/div[3]/div[1]/div[2]/span[2]/text()'.format(count))
67                 for i in price:
68                     price = i
69                 bookTime_on_shelf = html.xpath('//*[@id="listBox"]/div[{}]/div[3]/div[4]/span[1]/text()'.format(count))
70                 for i in bookTime_on_shelf:
71                     bookTime_on_shelf = i
72                 bookShop = html.xpath('//*[@id="listBox"]/div[{}]/div[2]/div[3]/div[1]/div[3]/a/text()'.format(count))
73                 for i in bookShop:
74                     bookShop = i
75                 count += 1
76                 #保存数据
77                 with open("Kongfuzi.csv", "a") as f2:
78                     f2.writelines(bookname + "," + publishing_house + "," + price +  "," + bookTime_on_shelf +  "," + bookShop + '\n')
79                     f2.close()
80 
81                 #显示保存数据
82                 print(bookname,
83                       "出版社：",publishing_house,'\n',
84                       "发货率：",delivery,'%\n',
85                       "价格：",price,'元\n',
86                       "上架时间：",bookTime_on_shelf,'\n',
87                       "书店名：",bookShop)
88                 print('\n')
89     except:
90         print("网络错误")
91 
92 
93 if __name__ == '__main__':
94     keyword = input("爬取几页：")
95     Kongfuzi(int(keyword))

数据的清洗与处理

import pandas as pd
import numpy as np
# xs holds the scraped listings loaded from the crawler's CSV output.
# on_bad_lines='warn' is the current spelling of the removed
# error_bad_lines=False option: malformed rows are reported and skipped.
xs = pd.read_csv(r'D:\Py_project\Kongfuzi.csv', on_bad_lines='warn', encoding='gbk')

# Force prices to numbers so sorting is numeric rather than lexicographic;
# anything unparsable (e.g. a stray repeated header row) becomes NaN and is
# removed by the dropna below.
xs["price"] = pd.to_numeric(xs["price"], errors="coerce")

# Drop rows that repeat a book title
xs = xs.drop_duplicates('bookname')
# Drop rows with any missing value
xs = xs.dropna(axis = 0)

# Sort by price, highest first
xs.sort_values(by=["price"],inplace=True,ascending=[False])
xs.head(20)

　　 python爬虫—孔夫子旧书网数据可视化分析_html_02

# Price ranking chart: the first 20 rows of xs drawn as bars with a line over them
# (xs is expected to be sorted by price already - that happens outside this snippet).
import matplotlib.pyplot as plt
x = xs['bookname'].head(20)
y = xs['price'].head(20)
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus']=False
plt.xticks(rotation=90)
plt.bar(x,y,alpha=0.2, width=0.4, color='b', lw=3,label="price")
# NOTE(review): this line plots the same price series as the bars, yet its legend label is "sell" - confirm the intended label
plt.plot(x,y,'-',color = 'r',label="sell")
plt.legend(loc = "best")# legend
plt.title("价格趋势图")
plt.xlabel("书名",)# x-axis label
plt.ylabel("价格")# y-axis label
plt.show()

python爬虫—孔夫子旧书网数据可视化分析_html_03

python爬虫—孔夫子旧书网数据可视化分析_词云_04

# Horizontal bar chart of the same x (titles) / y (prices) series defined earlier
plt.barh(x,y, alpha=0.2, height=0.4, color='g',label="价格", lw=3)
plt.title("价格水平图")
plt.legend(loc = "best")# legend
plt.xlabel("价格",)# x-axis label
plt.ylabel("书名")# y-axis label
plt.show()

python爬虫—孔夫子旧书网数据可视化分析_词云_05

# Scatter plot of price (y) against the x series defined earlier
plt.scatter(x,y,color='gray',marker='o',s=40,alpha=0.5)
plt.xticks(rotation=90)
plt.title("价格散点图")
# NOTE(review): the other charts label this same x series "书名"; "主题" may be a copy-paste leftover - confirm
plt.xlabel("主题",)# x-axis label
plt.ylabel("价格")# y-axis label
plt.show()

python爬虫—孔夫子旧书网数据可视化分析_firefox_06

# Box plot of the y (price) series defined earlier
plt.boxplot(y)
plt.title("价格盒图")
plt.show()

python爬虫—孔夫子旧书网数据可视化分析_firefox_07

词云：

import pandas as pd
import numpy as np
import wordcloud as wc
from PIL import Image
import matplotlib.pyplot as plt
import random

# Load the mask image as a NumPy array; it is passed to WordCloud as `mask` below
bk = np.array(Image.open(r"C:\Users\X0iaoyan\Downloads\111.jpg"))
mask = bk
# Configure the word cloud
# NOTE(review): the original comments quoted library defaults (white background,
# max font size 200) and described random_state as returning a colour per word;
# none of that can be confirmed from this file - check the wordcloud documentation.
# NOTE(review): width/height may have no effect when a mask is supplied - confirm.
word_cloud = wc.WordCloud(
                       width=1000,  # requested image width
                       height=1000,  # requested image height
                       mask = mask,
                       background_color='black',  # background colour of the image
                       font_path='msyhbd.ttc',  # font file; must be a font on this machine that contains Chinese glyphs
                       max_font_size=400,  # upper bound on the font size
                       random_state=50,  # presumably a fixed seed so the layout is reproducible - TODO confirm
                       )
# Join every book title in xs into one space-separated string and render it
text = xs["bookname"]
text = " ".join(text)
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()

python爬虫—孔夫子旧书网数据可视化分析_词云_08

可视化分析总代码：

 1 import pandas as pd
 2 import numpy as np
 3 # xs为销量排行的表格、zh为综合表排序
 4 xs =  pd.read_csv(r'D:\Py_project\Kongfuzi.csv',error_bad_lines=False,encoding='gbk')
 5 
 6 # 重复值处理
 7 xs = xs.drop_duplicates('bookname')
 8 # Nan处理
 9 xs = xs.dropna(axis = 0)
10 
11 # 根据价格数降序排序
12 xs.sort_values(by=["price"],inplace=True,ascending=[False])
13 xs.head(20)
14 
15 # 价格排行可视化分析
16 import matplotlib.pyplot as plt
17 x = xs['bookname'].head(20)
18 y = xs['price'].head(20)
19 plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
20 plt.rcParams['axes.unicode_minus']=False
21 plt.xticks(rotation=90)
22 plt.bar(x,y,alpha=0.2, width=0.4, color='b', lw=3,label="price")
23 plt.plot(x,y,'-',color = 'r',label="sell")
24 plt.legend(loc = "best")#图例
25 plt.title("价格趋势图")
26 plt.xlabel("书名",)#横坐标名字
27 plt.ylabel("价格")#纵坐标名字
28 plt.show()
29 
30 plt.barh(x,y, alpha=0.2, height=0.4, color='g',label="价格", lw=3)
31 plt.title("价格水平图")
32 plt.legend(loc = "best")#图例
33 plt.xlabel("价格",)#横坐标名字
34 plt.ylabel("书名")#纵坐标名字
35 plt.show()
36 
37 # 散点图
38 plt.scatter(x,y,color='gray',marker='o',s=40,alpha=0.5)
39 plt.xticks(rotation=90)
40 plt.title("价格散点图")
41 plt.xlabel("主题",)#横坐标名字
42 plt.ylabel("价格")#纵坐标名字
43 plt.show()
44 
45 plt.boxplot(y)
46 plt.title("价格盒图")
47 plt.show()
48 
49 
50 import pandas as pd
51 import numpy as np
52 import wordcloud as wc
53 from PIL import Image
54 import matplotlib.pyplot as plt
55 import random
56 
57 bk = np.array(Image.open(r"C:\Users\X0iaoyan\Downloads\111.jpg"))
58 mask = bk
59 # 定义尺寸
60 word_cloud = wc.WordCloud(
61                        width=1000,  # 词云图宽
62                        height=1000,  # 词云图高
63                        mask = mask,
64                        background_color='black',  # 词云图背景颜色，默认为白色
65                        font_path='msyhbd.ttc',  # 词云图 字体（中文需要设定为本机有的中文字体）
66                        max_font_size=400,  # 最大字体，默认为200
67                        random_state=50,  # 为每个单词返回一个PIL颜色
68                        )
69 text = xs["bookname"]
70 text = " ".join(text)
71 word_cloud.generate(text)
72 plt.imshow(word_cloud)
73 plt.show()