Crawling detailed product data for each listing
# coding:utf-8
# Batch notes: this run covers s = 1..501; for s 501..1010 start c at 23,
# for 1001..1520 start c at 43, for 1501..2020 start c at 63.
import requests, re, json, time, random
import pandas as pd
from selenium import webdriver  # selenium==2.48.0 (supports PhantomJS)
from lxml import etree
from openpyxl import load_workbook
def data():
    url_list = []
    c = 1  # JD's "page" parameter: odd numbers, stepping by 2
    daliebiao = []  # one row per product
    # Build the search-result page URLs. "s" is the item offset; this run covers
    # offsets 1..501 (raise the stop value, e.g. to 10000, to crawl in batches:
    # 1..1000 first, then 1000..2000, and so on).
    for i in range(1, 501, 50):
        url = 'https://search.jd.com/Search?keyword=5g%E6%89%8B%E6%9C%BA&suggest=1.def.0.0&wq=5g%E6%89%8B%E6%9C%BA&page=' + str(c) + '&s={}&click=0'.format(i)
        url_list.append(url)
        c = c + 2
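    # For illustration, the first three URLs the loop above generates:
    #   page=1, s=1    -> ...&page=1&s=1&click=0
    #   page=3, s=51   -> ...&page=3&s=51&click=0
    #   page=5, s=101  -> ...&page=5&s=101&click=0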
    # Column headers for the output sheet
    columns = ['名称配置', '价格', '店铺名称', '详情链接(包含评价)', 'id', 'cpu型号', '网络频率', '5g信号', '上市年份', '屏幕尺寸', '入网型号', '像素']
    for url in url_list:
        time.sleep(1)
        driver = webdriver.Chrome(r"C:\Users\V\Desktop\chromedriver.exe")
        driver.get(url=url)
        for i in range(100):  # scroll the window down to trigger lazy loading
            # scrollTo(x, y): x is horizontal, y is vertical
            js = 'window.scrollTo(0,%s)' % (i * 100)
            driver.execute_script(js)
            time.sleep(0.1)
        tree = etree.HTML(driver.page_source)
        book_list = tree.xpath('//ul[@class="gl-warp clearfix"]/li')  # one <li> per listed product
        for book in book_list:
            time.sleep(2)
            xiaoliebiao = []
            name_peizhi = book.xpath('.//a[@target="_blank"]/em/text()')
            name_peizhi = ",".join(name_peizhi)
            price = book.xpath('.//div[@class="p-price"]/strong/i/text()')[0]
            dianpu = book.xpath('.//div[@class="p-shop"]/span/a/text()')
            dianpu = dianpu[0] if dianpu else ''  # some listings carry no shop name
            pingjiaxiangqing_href = book.xpath('.//div[@class="p-commit"]/strong/a/@href')[0]
            pingjiaxiangqing_href = "https:" + pingjiaxiangqing_href
            id = book.xpath('.//@data-sku')[0]  # product id, reused by the comment scripts below
            print(id)
            # Open the detail page in a second driver and read the spec table.
            # These positional XPaths were copied from XPath Helper and are brittle:
            # they break whenever JD reorders the spec sections on the page.
            driver1 = webdriver.Chrome(r"C:\Users\V\Desktop\chromedriver.exe")
            driver1.get(url=pingjiaxiangqing_href)
            time.sleep(2)  # give the page time to render before parsing
            tree1 = etree.HTML(driver1.page_source)
            cpu = tree1.xpath('//*[@id="detail"]/div[2]/div[2]/div[1]/div[3]/dl/dl/dd/text()')
            wangluo_pinlv = tree1.xpath('//*[@id="detail"]/div[2]/div[2]/div[1]/div[8]/dl/dl[2]/dt/text()')
            five_g_wangluo = tree1.xpath('//*[@id="detail"]/div[2]/div[2]/div[1]/div[8]/dl/dl[3]/dd/text()')
            shangshi_year = tree1.xpath('//*[@id="detail"]/div[2]/div[2]/div[1]/div[1]/dl/dl[3]/dd/text()')
            pingmu_chicun = tree1.xpath('//*[@id="detail"]/div[2]/div[2]/div[1]/div[4]/dl/dl[3]/dd/text()')
            ruwang_xinghao = tree1.xpath('//*[@id="detail"]/div[2]/div[2]/div[1]/div[1]/dl/dl[1]/dd[2]/text()')
            xiangsu = tree1.xpath('//*[@id="detail"]/div[2]/div[2]/div[1]/div[5]/dl/dl[2]/dd/text()')
            driver1.quit()
            # Keep only phones priced at 1000 or above
            if float(price) >= 1000.00:
                xiaoliebiao.append(name_peizhi)
                xiaoliebiao.append(price)
                xiaoliebiao.append(dianpu)
                xiaoliebiao.append(pingjiaxiangqing_href)
                xiaoliebiao.append(id)
                xiaoliebiao.append(cpu)
                xiaoliebiao.append(wangluo_pinlv)
                xiaoliebiao.append(five_g_wangluo)
                xiaoliebiao.append(shangshi_year)
                xiaoliebiao.append(pingmu_chicun)
                xiaoliebiao.append(ruwang_xinghao)
                xiaoliebiao.append(xiangsu)
                daliebiao.append(xiaoliebiao)  # the row is complete; add it
                print(xiaoliebiao)
        driver.quit()
    df = pd.DataFrame(data=daliebiao, columns=columns)
    df.to_excel('京东5g手机前10页.xls')
data()
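A note on the fixed sleeps above: a hard-coded time.sleep() either wastes time or is still too short on a slow connection. Below is a minimal sketch of the page-load step using Selenium's explicit waits instead; the CSS selector targets the same gl-warp list the scraper parses, the 15-second timeout is an arbitrary choice, and the scroll loop is still needed afterwards for the lazy-loaded items.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree

driver = webdriver.Chrome(r"C:\Users\V\Desktop\chromedriver.exe")
driver.get('https://search.jd.com/Search?keyword=5g%E6%89%8B%E6%9C%BA&page=1&s=1')  # any of the search URLs built above
# Block until the result list is actually in the DOM, instead of sleeping a
# fixed amount; raises TimeoutException if 15 seconds pass without a match.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'ul.gl-warp.clearfix > li'))
)
tree = etree.HTML(driver.page_source)
driver.quit()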
Crawling every comment for a product by id
import requests, re, json, time, os
import pandas as pd
# author: Jnchin
# Review fields to extract:
ind_dict = {'评论ID': 'id',  # requires Python 3.6+, where dicts keep insertion order
            '评价内容': 'content',
            '评价时间': 'creationTime',
            '评分': 'score',
            '是否为vip(非0就是)': 'plusAvailable',
            '评论图片': 'images',  # a list of URLs
            '评论视频': 'videos',  # a list of URLs
            '产品颜色': 'productColor',
            '产品配置': 'productSize',
            '用户昵称': 'nickname',
            '点赞数': 'usefulVoteCount'}
columns = list(ind_dict.keys())
# progress printing: '\r' rewrites the same console line
def printf(str_):
    print('\r', str_, end='', flush=True)
# Turn one comment dict into a row, following the column order above
def dict2list(dic):
    dic_infos = []
    for col in columns:
        name = ind_dict[col]
        if name not in dic:
            dic_infos.append('None')
            continue
        if name == 'images':  # images come as a list of dicts; keep just the URLs
            dic_infos.append(['https:' + i['imgUrl'] for i in dic[name]])
        elif name == 'videos':  # same special-casing for videos
            dic_infos.append([i['remark'] for i in dic[name]])
        else:
            dic_infos.append(dic[name])
    return dic_infos
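# For illustration, a made-up comment record and the row dict2list builds from
# it, given the ind_dict/columns order above (missing fields become 'None'):
#   {'id': 1, 'content': '不错', 'score': 5}
#   -> [1, '不错', 'None', 5, 'None', 'None', 'None', 'None', 'None', 'None', 'None']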
# Fetch one page of comments (plain requests; no browser driver needed here)
def getApage(pageUrl):
    header = {
        'Host': 'club.jd.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'Referer': 'https://item.jd.com/'
    }
    res = requests.get(pageUrl, headers=header)
    html = res.text
    # Strip the JSONP wrapper so the body parses as JSON
    ind = html.find('{')  # index of the first '{'
    html = html[ind:].replace(');', '')  # drop the trailing ');'
    jsontext = json.loads(html)  # print this variable to see other available fields
    maxPage = jsontext['maxPage']
    commentsInfos = jsontext['comments']
    dic_infos_list = []
    for commentDic in commentsInfos:
        dic_infos_list.append(dict2list(commentDic))
    time.sleep(1)  # throttle between page requests
    return dic_infos_list, maxPage
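# For illustration: the endpoint answers with JSONP, not bare JSON, e.g.
#   fetchJSON_comment98({"maxPage":2,"comments":[...]});
# html.find('{') locates the first brace; slicing from there and dropping the
# trailing ');' leaves {"maxPage":2,"comments":[...]}, which json.loads accepts.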
# Fetch every comment page for one product
def getAllComments(productId):
    page = 0
    all_list = []
    while True:
        pageUrl = ('https://club.jd.com/comment/productPageComments.action?'
                   'callback=fetchJSON_comment98&productId={}&score=0&sortType=5'
                   '&page={}&pageSize=10&isShadowSku=0&fold=1').format(productId, page)
        printf('fetching: product id %s, comment page %s ......' % (str(productId), str(page)))
        try:
            dic_infos_list, maxPage = getApage(pageUrl)
        except Exception:
            printf('productId %s page %s failed!' % (str(productId), str(page)))
            with open('error.txt', 'a') as f:
                f.write(pageUrl + '\n')  # log the URL so it can be re-crawled later
            break  # the original `continue` would retry the same page forever
        printf('page fetched')
        all_list.extend(dic_infos_list)
        if page >= maxPage - 1 or page >= 10:  # cap at 10 pages per product
            break
        page += 1
    text = pd.DataFrame(columns=columns, data=all_list)
    pcFolder = r'productComments/'
    if not os.path.exists(pcFolder):
        os.mkdir(pcFolder)
    text.to_csv(pcFolder + str(productId) + '.csv', encoding='gbk')
if __name__ == '__main__':
    # Entry point: read the product ids out of the sheet produced by the first script
    filename = r'京东5g手机数据.xls'
    data = pd.read_excel(filename)
    id_list = list(data['id'])
    print(len(id_list))
    # Start crawling
    with open('error.txt', 'w') as f:
        f.write('')  # truncate the failure log from any previous run
    pcFolder = r'productComments/'
    if not os.path.exists(pcFolder):
        os.mkdir(pcFolder)  # needed before listdir on a first run
    temp = [i[:-4] for i in os.listdir(pcFolder)]  # ids already crawled: the resume point
    id_list = [j for j in id_list if str(j) not in temp]
    for ind, _id in enumerate(id_list):
        productId = str(_id)
        print('fetching comments for product id %s, progress: %s/%s' % (productId, ind, len(id_list)))
        getAllComments(productId)
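error.txt now holds one URL per failed page. A minimal second-pass sketch that re-crawls those URLs, reusing the getApage and columns defined above (the retry_failed name and the retried.csv filename are my own):

def retry_failed(path='error.txt'):
    with open(path) as f:
        urls = [line.strip() for line in f if line.strip()]
    all_list = []
    for pageUrl in urls:
        try:
            dic_infos_list, _ = getApage(pageUrl)
            all_list.extend(dic_infos_list)
        except Exception:
            print('still failing:', pageUrl)  # leave it for the next pass
    if all_list:
        pd.DataFrame(columns=columns, data=all_list).to_csv('retried.csv', encoding='gbk')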
Crawling just the comment counts by id
import requests, re
import pandas as pd
# Only the libraries this script actually uses; importing every crawler library
# "just in case", as the original did, only hides the real dependencies.
data = pd.read_excel('京东5g手机数据.xls')  # the sheet produced by the first script
id_list = list(data['id'])  # pull the id column into a list
daliebiao = []
columns = ['id', '评价总数']  # keep the id so counts can be joined back to products
headers = {  # request headers; the Referer mimics coming from a product page
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Referer': 'https://item.jd.com/'
}
for id in id_list:  # walk every product id
    xiaoliebiao = []
    id = str(id)
url="https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId="+str(id)+"&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1"
reponse = requests.get(url=url, headers=headers)
tree1=reponse.text #相应回来的代码转成text形式
if len(str(tree1)) <2: #如果响应回来的代码字符串长度小于2说明几乎是没请求到数据 出错了
pinglunshu=0
else:
reg = r'defaultGoodCountStr":"(.{4})'#只取“SessionId=”字符后面4位字符串下面四行是正则表达式的套路形式
reg2 = re.compile(reg)
pinglunshu= str(re.findall(reg2 ,tree1)[0])
print(pinglunshu)
xiaoliebiao.append(pinglunshu)
daliebiao.append(xiaoliebiao)
df=pd.DataFrame(data=daliebiao,columns=columns)
df.to_excel('京东5g手机评价.xls')
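For illustration, what the non-greedy pattern pulls out of a response fragment (the sample string below is made up):

import re
sample = '"commentCount":286438,"defaultGoodCountStr":"2.8万+","goodCount":26000'
print(re.findall(r'defaultGoodCountStr":"(.*?)"', sample))  # ['2.8万+']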