python 爬取分页 python分页爬取page

转载

fjfdh 2023-07-01 15:24:42

文章标签 python 爬取分页 python 分页爬取 HTML xml python 文章分类 Python 后端开发

二十、python 分页爬取(百思不得姐信息爬取)

import requests
from lxml import etree
import datetime
 
# Fetch the joke text from every listing page
def getJokeList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield the text of every joke, walking the paginated listing.

    Pages are requested in order, starting at page 1, by substituting the
    page number into ``basurl``.  Iteration stops after the first page that
    contains no ``<a class="pagenxt">`` element.

    Args:
        basurl: URL template; ``{0}`` is replaced by the page number.
        timeout: Seconds to wait for each HTTP response.  Without it a
            stalled connection would block the generator indefinitely.

    Yields:
        Each text node matched by ``div[@class="j-r-list-c-desc"]/a/text()``.
    """
    page_num = 1
    while True:
        response = requests.get(basurl.format(page_num), timeout=timeout)
        selector = etree.HTML(response.text)
        for joke in selector.xpath('//*/div[@class="j-r-list-c-desc"]/a/text()'):
            yield joke
        # Absence of the "next page" anchor is treated as the last page.
        if not selector.xpath('//a[@class="pagenxt"]'):
            break
        page_num += 1
 
def _first_xpath_text(node, path, default=''):
    """Return the first result of ``node.xpath(path)``, or *default* if none."""
    matches = node.xpath(path)
    return matches[0] if matches else default


# Fetch each joke's text with its like, dislike, share and comment counts
def getJokeOfAllList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield ``(joke, like, down, share, comment)`` for every listed joke.

    Pages are requested in order, starting at page 1, by substituting the
    page number into ``basurl``.  After each page its number is printed,
    and iteration stops after the first page that contains no
    ``<a class="pagenxt">`` element.

    List items without a joke text node are skipped; any of the four
    counters that is missing from an item is yielded as ``''``.

    Args:
        basurl: URL template; ``{0}`` is replaced by the page number.
        timeout: Seconds to wait for each HTTP response.  Without it a
            stalled connection would block the generator indefinitely.

    Yields:
        A 5-tuple of strings ``(joke, like, down, share, comment)``.
    """
    # XPath expressions are relative to one <li> of the joke list.
    joke_path = 'div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()'
    like_path = ('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]'
                 '/ul/li[@class="j-r-list-tool-l-up"]/span/text()')
    down_path = ('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]'
                 '/ul/li[@class="j-r-list-tool-l-down "]/span/text()')
    share_path = ('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-ct"]'
                  '/div[@class="j-r-list-tool-ct-share-c"]/span/text()')
    comment_path = ('div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-r j-r-list-tool-cc"]'
                    '/ul/li[@class=" f-tac j-comment j-comment-width  j-comment-down-width"]'
                    '/a/span[@class="comment-counts"]/text()')

    page_num = 1
    while True:
        response = requests.get(basurl.format(page_num), timeout=timeout)
        selector = etree.HTML(response.text)
        for item in selector.xpath('//*/div[@class="j-r-list"]/ul/li'):
            joke = _first_xpath_text(item, joke_path, None)
            if joke is None:
                # Not every <li> is guaranteed to carry a joke; skip those
                # instead of failing on an empty XPath result.
                continue
            like = _first_xpath_text(item, like_path)
            down = _first_xpath_text(item, down_path)
            share = _first_xpath_text(item, share_path)
            comment = _first_xpath_text(item, comment_path)
            # NOTE(review): the "??" in this label may stand for characters
            # that were lost in transcription (e.g. a non-breaking space);
            # confirm against the live page markup.
            yield joke, like, down, share.replace(u"分享??", u""), comment
        has_next = selector.xpath('//a[@class="pagenxt"]')
        # Progress output: the page that was just processed.
        print(page_num)
        # Absence of the "next page" anchor is treated as the last page.
        if not has_next:
            break
        page_num += 1
 
if __name__ == "__main__":
    # io.open accepts ``encoding`` on both Python 2 and Python 3.
    import io

    # Dump every joke with its counters to basejie.txt, echoing each record
    # to stdout.  The context manager closes the file even if the crawl
    # raises partway through.
    with io.open('basejie.txt', 'w', encoding='utf-8') as f:
        for joke, like, down, share, comment in getJokeOfAllList():
            # One tab-separated record per joke; any '??' is stripped from
            # the share field before output.
            line = u'\t'.join(
                [joke, like, down, share.replace(u'??', u''), comment]
            )
            print(line)
            f.write(line + u'\n')

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。