# 二十、Python 分页爬取(百思不得姐信息爬取)
import requests
from lxml import etree
import datetime
# Fetch only the joke text.
def getJokeList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield the text of every joke, walking the listing page by page.

    Args:
        basurl: URL template for a listing page; ``{0}`` is replaced with
            the page number, starting at 1.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Yields:
        Each text node matched by
        ``//*/div[@class="j-r-list-c-desc"]/a/text()``, in the order the
        XPath query returns them, page after page.

    Raises:
        requests.RequestException: If a page cannot be fetched, including
            when the request times out.
    """
    pageNum = 1
    while True:
        # Without a timeout a stalled connection would block the crawl
        # indefinitely.
        response = requests.get(basurl.format(pageNum), timeout=timeout)
        selector = etree.HTML(response.text)
        for joke in selector.xpath('//*/div[@class="j-r-list-c-desc"]/a/text()'):
            yield joke
        # Continue only while the page contains an <a class="pagenxt"> link.
        if not selector.xpath('//a[@class="pagenxt"]'):
            break
        pageNum += 1
# Fetch each joke's text together with its like, dislike, share and comment
# counts.
def getJokeOfAllList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield one record per joke, walking the listing page by page.

    Args:
        basurl: URL template for a listing page; ``{0}`` is replaced with
            the page number, starting at 1.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Yields:
        A 5-tuple ``(joke, like, down, share, comment)``. Each field is the
        first text node matched by the corresponding XPath query below,
        evaluated relative to one ``<li>`` of the ``j-r-list`` list;
        ``share`` additionally has the literal ``"分享??"`` removed.

    Raises:
        IndexError: If a list entry lacks any of the queried elements.
        requests.RequestException: If a page cannot be fetched, including
            when the request times out.

    Once the last page has been consumed, the final page number is printed.
    """
    pageNum = 1
    while True:
        # Without a timeout a stalled connection would block the crawl
        # indefinitely.
        response = requests.get(basurl.format(pageNum), timeout=timeout)
        selector = etree.HTML(response.text)
        # Named `entries` rather than `all` so the builtin is not shadowed.
        entries = selector.xpath('//*/div[@class="j-r-list"]/ul/li')
        for entry in entries:
            joke = entry.xpath(
                'div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]'
                '/a/text()')[0]
            # The trailing spaces inside some class values are part of the
            # exact-match predicates and must be kept.
            like = entry.xpath(
                'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]'
                '/ul/li[@class="j-r-list-tool-l-up"]/span/text()')[0]
            down = entry.xpath(
                'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]'
                '/ul/li[@class="j-r-list-tool-l-down "]/span/text()')[0]
            share = entry.xpath(
                'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-ct"]'
                '/div[@class="j-r-list-tool-ct-share-c"]/span/text()')[0]
            comment = entry.xpath(
                'div[@class="j-r-list-tool"]'
                '/div[@class="j-r-list-tool-r j-r-list-tool-cc"]'
                '/ul/li[@class=" f-tac j-comment j-comment-width j-comment-down-width"]'
                '/a/span[@class="comment-counts"]/text()')[0]
            # NOTE(review): the "??" in this literal looks like mis-encoded
            # characters (possibly non-breaking spaces following the
            # "share" label) -- confirm against the live markup.
            yield joke, like, down, share.replace("分享??", ""), comment
        # Continue only while the page contains an <a class="pagenxt"> link.
        if not selector.xpath('//a[@class="pagenxt"]'):
            break
        pageNum += 1
    print(pageNum)
if __name__ == "__main__":
    # Script entry point: print every record produced by getJokeOfAllList()
    # and write it to basejie.txt as one tab-separated line
    # (joke, like, down, share, comment).
    # To dump only the joke text, iterate getJokeList() instead.
    import io

    # io.open accepts an explicit encoding on both Python 2 and 3, so each
    # record is built as text and encoded exactly once when written. The
    # `with` block closes the file even if the crawl raises part-way.
    with io.open('basejie.txt', 'w', encoding='utf-8') as f:
        for joke, like, down, share, comment in getJokeOfAllList():
            # Joining on a unicode separator keeps the whole line as text
            # instead of mixing encoded bytes with unencoded fields.
            line = u'\t'.join(
                (joke, like, down, share.replace('??', ""), comment))
            print(line)
            f.write(line + u'\n')