# coding: utf-8
# gevent's monkey patching must run before requests is imported, so that the
# patched, cooperative socket module is the one requests actually uses.
from gevent import monkey; monkey.patch_all()
import gevent

import csv
import random

import requests
from lxml import etree


def get_html(url, data=None):
    """Fetch one page, retrying until a response comes back."""
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8',
    }
    # Randomize the timeout a little so the requests look less uniform.
    timeout = random.choice(range(30, 60))
    while True:
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
            break
        except requests.RequestException as e:
            # Catch the base RequestException: requests.get() raises Timeout
            # and ConnectionError, not HTTPError, so catching only HTTPError
            # would let a timeout kill the retry loop.
            print(e)
    return rep


def get_data(response):
    """Pull the book titles off one Douban tag page and append them to the CSV."""
    tree = etree.HTML(response.text)
    # Each title sits in <div class="info"><h2 class=""><a>title</a></h2>.
    str_name = tree.xpath('//div[@class="info"]/h2[@class=""]/a/text()')
    # The raw text nodes are padded with newlines and spaces: join them,
    # strip the whitespace, then split back into a clean list of titles.
    all_data = ','.join(str_name)
    all_list = all_data.replace('\n', '').replace(' ', '').split(',')
    print(all_list)
    write_data(all_list, 'story.csv')


def write_data(data, name):
    """Append one title per row to the CSV file."""
    with open(name, 'a', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        # Wrap each title in its own one-element list: writerows() treats each
        # element as a row, and a bare string would be split into
        # one-character cells.
        f_csv.writerows([title] for title in data)
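
# The wrapping above matters because the csv module iterates over each row it
# is given, and iterating over a string yields single characters:
#
#     csv.writer(f).writerow('abc')      # writes the row: a,b,c
#     csv.writer(f).writerow(['abc'])    # writes the row: abc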


if __name__ == '__main__':
    # Douban lists 20 books per page; ?start= pages through four tag feeds:
    # 小说 (fiction), 外国文学 (foreign literature), 文学 (literature),
    # and 随笔 (essays).
    for i in range(0, 981, 20):
        url_list = [
            'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=' + str(i),
            'https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6?start=' + str(i),
            'https://book.douban.com/tag/文学?start=' + str(i),
            'https://book.douban.com/tag/%E9%9A%8F%E7%AC%94?start=' + str(i),
        ]
        for url in url_list:
            response = get_html(url)
            # Only the parse/write step runs inside a greenlet here; the fetch
            # above has already blocked, so this loop is effectively serial.
            task = [gevent.spawn(get_data, response)]
            gevent.joinall(task)
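

# A sketch of a fully concurrent variant, not part of the original script:
# because the main loop above fetches each page before spawning a greenlet,
# gevent never gets to overlap the downloads. Spawning one greenlet per URL
# that both fetches and parses lets the monkey-patched requests calls run
# concurrently. _fetch_and_parse and crawl_concurrently are hypothetical
# helper names introduced here for illustration.
def _fetch_and_parse(url):
    """Download then parse one page inside a single greenlet."""
    get_data(get_html(url))


def crawl_concurrently(urls):
    """Fetch and parse all URLs concurrently, e.g. crawl_concurrently(url_list)."""
    tasks = [gevent.spawn(_fetch_and_parse, url) for url in urls]
    gevent.joinall(tasks)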