python爬虫学习文档 python爬虫教程

转载

bigrobin 2023-11-08 19:51:46

文章标签 python爬虫学习文档数据 python 命令行 文章分类 Python 后端开发

前言：

本教程所爬取的数据仅用于自己使用，无任何商业用途，若有侵权行为，请联系本人，本人可以删除，另外如果转载，请注明来源链接。

两种方式：

采用scrapy框架的形式。
采用非框架的形式，具体是采用requests和etree。

正题：

首先介绍的是非框架的形式。这种形式便于部署到服务器上，定时批量的爬取数据。
以此网站为例，当然最好用谷歌打开此网站，这样可以更好的上手爬虫。
废话不多说，开始干货了。
用谷歌浏览器打开http://stock.stcn.com/，在页面上单击鼠标右键并选择“检查”，会出现如下图所示的画面

python爬虫学习文档 python爬虫教程_命令行

Windows用户同时按“Ctrl”和“F”，Mac用户同时按“Command”和“F”，调出搜索框，如下图所示。

python爬虫学习文档 python爬虫教程_数据_02

这个搜索框用于输入xpath，就可以定位到本页面的任何位置，得到你所需要爬取的数据，这其实是爬虫任务上最繁琐耗时的事情，而且也会出现所爬页面样式改变的情况（例如该网站前端开发人员由于版本迭代将前端样式改变），你就得重新找到爬取数据的新的xpath。
那么如何得到xpath呢，首先你得知道xpath是什么，参考教程xpath用法，
了解了是什么以及怎么用之后就可以教你怎么得到所爬数据的xpath了，例如要爬取文章的标题，那么首先要点击箭头，然后鼠标移动到文章的标题上去，就可以看到文章所处的位置，从而找到xpath。
箭头以及本人找到的xpath如图所示。

python爬虫学习文档 python爬虫教程_python爬虫学习文档_03

当然下一步你就要进入具体的文章，把文章的内容、作者、发行时间等爬取下来，你在浏览器上点击文章标题就会进入文章的详情页，那代码怎么操作呢，实质上上述操作只是从一个网页进入了另一个网页，还记得我们已经找到了文章的标题，一般文章的标题和文章详情页的链接是在一块的，所以这两个的xpath一般只是后面的部分有少许不同，具体参见代码部分“parse_links函数”。

相关代码如下

def parse_links():
    """Collect (title, absolute_url) pairs from every index page in URL_LIST.

    For each index page, at most the first 10 anchors that sit inside a list
    item of a div whose class is exactly ``box_left`` and that carry both an
    ``href`` and a ``title`` attribute are used.  A page that cannot be
    fetched or parsed is logged and skipped, so one bad page does not stop
    the others.

    Returns:
        list of ``(title, url)`` tuples; empty when nothing could be read.
    """
    link_list = []
    # Select the anchor elements themselves and read both attributes from the
    # same node.  Querying @href and @title as two separate lists pairs them
    # by position only, which goes wrong as soon as one anchor lacks a title.
    anchors_xpath = '//div[@class="box_left"]//li//a[@href and @title]'

    for url in URL_LIST:
        try:
            res = requests.get(url, timeout=20)
            # Do not scrape the body of an HTTP error response.
            res.raise_for_status()
            sel = etree.HTML(res.text)
            for anchor in sel.xpath(anchors_xpath)[:10]:
                link_list.append(
                    (anchor.get('title'), urljoin(url, anchor.get('href')))
                )
        except Exception as e:
            # Best effort per index page: log and move on to the next URL.
            logger.error("get links error: %s", e, exc_info=True)
    return link_list

接下来就是爬取文章详情页的内容，具体参见代码部分"parse_details"

相关代码如下

def parse_details(link_list):
    """Fetch every article page and extract its date, author and body text.

    Each accepted article is also printed to stdout.

    Args:
        link_list: list of ``(title, url)`` tuples, as returned by
            ``parse_links``.

    Returns:
        ``(res_list, ok)``.  ``res_list`` holds one
        ``(title, url, publ_date, author, content)`` tuple per article whose
        publication date is not older than two days.  ``ok`` is False (and
        ``res_list`` is empty) only when ``link_list`` is non-empty and every
        page raised an error.
    """
    publ_date_xpath = '//div[@class="info"]/text()'
    author_xpath = '//div[@class="info"]/span[1]/text()'
    content_xpath = '//div[@class="txt_con"]/p//text()'

    res_list = []
    count = 0  # number of pages that failed with an exception
    # Computed once so that every article is compared to the same cutoff.
    oldest_allowed = datetime.now() - timedelta(days=2)

    for t, l in link_list:
        try:
            res = requests.get(l, timeout=20)
            # An HTTP error page has no date block and would otherwise be
            # skipped silently instead of being counted as a failure.
            res.raise_for_status()
            sel = etree.HTML(res.text)
            publ_date = sel.xpath(publ_date_xpath)
            if not publ_date:
                continue
            publ_date = datetime.strptime(publ_date[0].strip(), '%Y-%m-%d %H:%M')
            if publ_date < oldest_allowed:
                continue
            # The span is expected to read "<label>：<name>" (full-width
            # colon).  A missing span or colon yields a best-effort author
            # instead of discarding the whole article.
            author = ''
            author_nodes = sel.xpath(author_xpath)
            if author_nodes:
                parts = author_nodes[0].split('：')
                author = (parts[1] if len(parts) > 1 else parts[0]).strip()
            content = ' '.join(s.strip() for s in sel.xpath(content_xpath))

        except Exception as e:
            logger.error("failed to parse detail: %s", e, exc_info=True)
            count += 1
        else:
            # Reached only when the try body finished without `continue`.
            res_list.append((t, l, publ_date, author, content))
            print(t, l, publ_date, author, content)
    if link_list and count == len(link_list):
        return [], False
    return res_list, True

还有一些具体的小处理，自己看看代码就好了。

完整代码分享如下：

# !/usr/bin/python    
# -*- coding: utf-8 -*-   
from __future__ import absolute_import, print_function, unicode_literals                                                                                        
import logging 
from datetime import datetime, timedelta              
import requests 
from lxml.html import etree     
# python2 and 3                                                                 
try:      
    from urlparse import urljoin    
except ImportError:      
    from urllib.parse import urljoin     
# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)

# Index pages that parse_links() fetches to look for article links.
URL_LIST = ['http://stock.stcn.com/']
     
def parse_links():
    """Collect (title, absolute_url) pairs from every index page in URL_LIST.

    For each index page, at most the first 10 anchors that sit inside a list
    item of a div whose class is exactly ``box_left`` and that carry both an
    ``href`` and a ``title`` attribute are used.  A page that cannot be
    fetched or parsed is logged and skipped, so one bad page does not stop
    the others.

    Returns:
        list of ``(title, url)`` tuples; empty when nothing could be read.
    """
    link_list = []
    # Select the anchor elements themselves and read both attributes from the
    # same node.  Querying @href and @title as two separate lists pairs them
    # by position only, which goes wrong as soon as one anchor lacks a title.
    anchors_xpath = '//div[@class="box_left"]//li//a[@href and @title]'

    for url in URL_LIST:
        try:
            res = requests.get(url, timeout=20)
            # Do not scrape the body of an HTTP error response.
            res.raise_for_status()
            sel = etree.HTML(res.text)
            for anchor in sel.xpath(anchors_xpath)[:10]:
                link_list.append(
                    (anchor.get('title'), urljoin(url, anchor.get('href')))
                )
        except Exception as e:
            # Best effort per index page: log and move on to the next URL.
            logger.error("get links error: %s", e, exc_info=True)
    return link_list


def parse_details(link_list):
    """Fetch every article page and extract its date, author and body text.

    Each accepted article is also printed to stdout.

    Args:
        link_list: list of ``(title, url)`` tuples, as returned by
            ``parse_links``.

    Returns:
        ``(res_list, ok)``.  ``res_list`` holds one
        ``(title, url, publ_date, author, content)`` tuple per article whose
        publication date is not older than two days.  ``ok`` is False (and
        ``res_list`` is empty) only when ``link_list`` is non-empty and every
        page raised an error.
    """
    publ_date_xpath = '//div[@class="info"]/text()'
    author_xpath = '//div[@class="info"]/span[1]/text()'
    content_xpath = '//div[@class="txt_con"]/p//text()'

    res_list = []
    count = 0  # number of pages that failed with an exception
    # Computed once so that every article is compared to the same cutoff.
    oldest_allowed = datetime.now() - timedelta(days=2)

    for t, l in link_list:
        try:
            res = requests.get(l, timeout=20)
            # An HTTP error page has no date block and would otherwise be
            # skipped silently instead of being counted as a failure.
            res.raise_for_status()
            sel = etree.HTML(res.text)
            publ_date = sel.xpath(publ_date_xpath)
            if not publ_date:
                continue
            publ_date = datetime.strptime(publ_date[0].strip(), '%Y-%m-%d %H:%M')
            if publ_date < oldest_allowed:
                continue
            # The span is expected to read "<label>：<name>" (full-width
            # colon).  A missing span or colon yields a best-effort author
            # instead of discarding the whole article.
            author = ''
            author_nodes = sel.xpath(author_xpath)
            if author_nodes:
                parts = author_nodes[0].split('：')
                author = (parts[1] if len(parts) > 1 else parts[0]).strip()
            content = ' '.join(s.strip() for s in sel.xpath(content_xpath))

        except Exception as e:
            logger.error("failed to parse detail: %s", e, exc_info=True)
            count += 1
        else:
            # Reached only when the try body finished without `continue`.
            res_list.append((t, l, publ_date, author, content))
            print(t, l, publ_date, author, content)
    if link_list and count == len(link_list):
        return [], False
    return res_list, True
     

def process():
    """Run one crawl: collect article links, fetch the details, log the outcome."""
    link_list = parse_links()
    res_list, ret = parse_details(link_list)
    if not ret:
        # parse_details returns False only when every detail page raised an
        # error; report that total failure instead of ending silently.
        logger.error('failed to parse every news detail page')
        return
    if res_list:
        logger.info('get %d news', len(res_list))
        logger.info('successfull save news')


if __name__ == "__main__":
    # Without a configured handler the INFO messages from process() are
    # dropped.  logging.basicConfig writes to stderr, so the article data
    # printed on stdout can still be redirected to a file on its own.
    logging.basicConfig(level=logging.INFO)
    process()

上述代码可以保存到名为“crawl1.py”的文件里，然后在命令行输入“python3 crawl1.py > ttt”，爬虫结果就会自动写入ttt文件中。如果Windows命令行不能使用输出重定向符“>”，也可以直接在命令行输入“python3 crawl1.py”，结果会直接打印在屏幕上。Windows进入命令行的方式是按“Win+R”，输入cmd后回车；然后执行“cd Desktop”进入桌面目录，把crawl1.py文件放到桌面上，就可以运行了。
完整代码亲测可以运行，请放心学习，有问题可以评论
本博客的爬虫结果直接存储到文件中，当然最好可以放到数据库中，这样可以查重，避免重复爬取，如有读者需要数据库存储爬虫数据的可以评论留言，人多就会发怎么样设置数据库，以及怎么样查重避免重复抓取数据。

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。