Today I did a second round of crawling based on the URLs collected yesterday, this time fetching the content of each letter. It should have been a simple task, but the data is genuinely "dirty", so all I managed today was to scrape the letter contents for the full set of roughly thirty thousand URLs.
The scraping was done with BeautifulSoup; while scraping I also did some simple processing of the data and fixed a few bugs. The next step is to clean the scraped data and import it into a database before moving on to further processing (a rough sketch of that step follows the code listing below).
Source code for scraping the letter contents:
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 28 15:14:59 2020
@author: 陈欢
"""
import requests
from bs4 import BeautifulSoup
def ReadFile():
    # Each line of the 'url' file is expected to look like "<id>,<detail URL>";
    # keep only the URL part after the first comma.
    with open('url', 'r', encoding='utf-8-sig') as f:
        Text = f.readlines()
    Text2 = []
    for i in range(len(Text)):
        x = Text[i].split(',', 1)
        Text2.append(x[1])
    return Text2
# Scrape the letter contents
def WriteFile(data):
    # Append one record to data2.csv, fields separated by tabs, one record per line.
    with open('data2.csv', 'a+', encoding='utf-8') as f:
        for i in range(len(data)):
            if i < (len(data) - 1):
                f.write(data[i] + "\t")
            else:
                f.write(data[i] + "\n")
URLAll = ReadFile()
error = []
time = 1
# for i in range(0, 100, 10):  # 0 to 100 with step 10; range always stops one step before the end value
headers = {  # pretend to be a browser
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.75 Chrome/73.0.3683.75 Safari/537.36',
    # paste in the cookie you just grabbed from your own browser session
    'cookie': 'HDJLJSID=39DBD6D5E12B9F0F8834E297FAFC973B; __jsluid_h=e6e550159f01ae9aceff30d191b09911; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216f9edc47471cb-0059c45dfa78d6-c383f64-1049088-16f9edc474895%22%7D; _gscu_564121711=80128103kc5dx617; X-LB=1.1.44.637df82f; _va_ref=%5B%22%22%2C%22%22%2C1580462724%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DM-f5ankfbAnnYIH43aTQ0bvcFij9-hVxwm64pCc6rhCu5DYwg6xEVis-OVjqGinh%26wd%3D%26eqid%3Dd6b151bf000cfb36000000025e1c5d84%22%5D; _va_ses=*; route=74cee48a71a9ef78636a55b3fa493f67; _va_id=b24752d801da28d7.1578917255.10.1580462811.1580450943.',
}
session = requests.Session()
for i in URLAll:
    try:
        print(time)
        time += 1
        print(i)
        url = i.strip()  # drop the trailing newline left over from readlines()
        # url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=10000037"
        print(url)
        response = session.get(url, headers=headers)
        html = response.text  # page source as text
        soup = BeautifulSoup(html, 'lxml')  # parse the page
        LetterPerson = soup.find_all('div', class_="col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted")  # sender
        LetterCount = soup.find_all('div', class_="col-xs-12 col-md-12 column p-2 text-muted mx-2")  # letter content
        AnswerDepartment = soup.find_all('div', class_="col-xs-9 col-sm-7 col-md-5 o-font4 my-2")  # answering department(s) (one or more)
        AnswerCount = soup.find_all('div', class_="col-xs-12 col-md-12 column p-4 text-muted my-3")  # answer content (one per department)
        AnswerTime = soup.find_all('div', class_="col-xs-12 col-sm-3 col-md-3 my-2")  # answer time (one per department)
        IsPlay = soup.find_all('span', class_="font14 offic blod")  # marker for whether the letter has been answered
        YPraise = soup.find_all('a', class_="dex_yes font12")  # number of upvotes
        NPraise = soup.find_all('a', class_="dex_no font12")  # number of downvotes
        print(LetterPerson)
        x = url.split('=', 1)
        url2 = x[1]  # use the originalId in the URL as the record's key
        if len(IsPlay) != 0:
            # marker present: no reply yet, so fill the answer-related fields with placeholders
            data = []
            data.append(url2)
            data.append(LetterPerson[0].text)
            data.append(LetterCount[0].text)
            data.append("0")
            data.append("0")
            data.append("null")
            data.append("null")
            data.append("null")
            data.append("false")
        else:
            # replied: record the praise counts plus department / content / time for every reply
            data = []
            data.append(url2)
            data.append(LetterPerson[0].text)
            data.append(LetterCount[0].text)
            data.append(YPraise[0].text)
            data.append(NPraise[0].text)
            for j in range(len(AnswerDepartment)):
                data.append(AnswerDepartment[j].text)
                data.append(AnswerCount[j].text)
                data.append(AnswerTime[j].text)
            data.append("true")
        # simple cleaning: strip whitespace characters and the fixed labels the page adds
        for j in range(len(data)):
            replace = data[j]
            replace = replace.replace('\r', '')
            replace = replace.replace('\n', '')
            replace = replace.replace('\t', '')
            replace = replace.replace('\xa0', '')
            replace = replace.replace('来信人:', '')
            replace = replace.replace('[官方回答]:', '')
            replace = replace.replace('答复时间:', '')
            data[j] = replace.replace(' ', '')
        # print(data)
        WriteFile(data)
    except IndexError:
        # a page missing an expected element: remember its position and move on
        error.append(time - 1)
        continue
print(error)
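As mentioned above, the next step is to clean the scraped data and import it into a database. Below is a minimal sketch of what that could look like, assuming the tab-separated data2.csv produced by the script above, a local SQLite file letters.db, and a hypothetical table named letter; the real cleaning rules and schema are still to be decided.

# -*- coding: utf-8 -*-
import csv
import sqlite3

# Hypothetical schema: key, sender, content, praise counts and the answered flag.
conn = sqlite3.connect('letters.db')
conn.execute("""
    CREATE TABLE IF NOT EXISTS letter (
        id TEXT PRIMARY KEY,
        person TEXT,
        content TEXT,
        praise_yes TEXT,
        praise_no TEXT,
        answered TEXT
    )
""")

with open('data2.csv', 'r', encoding='utf-8') as f:
    for row in csv.reader(f, delimiter='\t'):
        if len(row) < 6:
            continue  # skip records that are still too "dirty" to load
        # the last field is the true/false answered flag written by the scraper
        conn.execute(
            "INSERT OR REPLACE INTO letter VALUES (?, ?, ?, ?, ?, ?)",
            (row[0], row[1], row[2], row[3], row[4], row[-1]),
        )
conn.commit()
conn.close()

Only the first five fields plus the answered flag are loaded here, because the number of department/answer/time triples varies from letter to letter; a separate answer table keyed by the letter id would be the natural place for those once the data has been cleaned.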