I. Prerequisites

  • Handling a web page given its URL (only one approach is shown)
from bs4 import BeautifulSoup
from urllib import request

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
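For instance, calling it on the Sina front page and printing the page title (a quick usage sketch; any reachable URL works):

soup = requestOver("https://news.sina.com.cn/")
print(soup.title.string)  # prints the text of the page's <title> tag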
  • A helper that saves a page's title and body to a local .txt file
# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)
  • Some knowledge of regular expressions (strongly recommended; see the quick sketch below)
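As an illustration (a minimal sketch with a made-up sample string), the dynamic part of this post pulls article links out of an API response with re.findall():

import re

# made-up fragment shaped like the JSON returned by the news API used later
sample = '{"url":"https://news.sina.com.cn/c/2020-11-08/doc-xxxxxxx.shtml","title":"demo"}'
links = re.findall(r'"url":"([^"]+)"', sample)
print(links)  # ['https://news.sina.com.cn/c/2020-11-08/doc-xxxxxxx.shtml']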

II. Static Scraping

  1. Static scraping means locating the content you want in the page's HTML source, identifying its tags and attributes, and then extracting it with the crawler.
  2. The idea behind the code below, using Sina News as the example: collect every qualifying link on a page with findAll() and walk through them from the start. After entering a link, if the article content is present, download the title and body and keep looking for further qualifying URLs inside that page; otherwise the page does not qualify and we back out of it.
from bs4 import BeautifulSoup
from urllib import request

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup

# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

# The actual crawl procedure
def crawlAll(url, y):
    soup = requestOver(url)
    for tag in soup.findAll("a", target="_blank"):
        if tag.string is not None:  # the link has a text title
            if len(tag.string) > 8:  # the title is longer than 8 characters
                if ("https://news.sina.com.cn/" in tag.attrs["href"]) or ("http://news.sina.com.cn/" in tag.attrs["href"]):
                    alllist.append(tag.attrs["href"])
                    if tag.attrs["href"] not in collection:
                        collection.add(tag.attrs["href"])
                        try:
                            print(tag.attrs['href'])
                            download(tag.string, tag.attrs['href'], y)
                            y += 1
                        except Exception:
                            print("Failed to crawl news item " + str(y))
                        else:
                            y = crawlAll(tag.attrs['href'], y)  # recurse and keep the counter in sync
    return y

if __name__ == '__main__':
    y = 1
    collection = set()  # used to de-duplicate links
    # seed list of pages to crawl; crawlAll() appends every qualifying link it finds
    alllist = ["https://news.sina.com.cn/"]
    for n in alllist:
        target_url = n
        y = crawlAll(target_url, y)

III. Dynamic Scraping

Scraping dynamic pages is more involved. This section introduces one way to scrape dynamically loaded content; you will need Postman.
The page used in this example is Sina's rolling news page (https://news.sina.com.cn/roll/, the referer seen in the captured request below).
Because the content cannot be scraped with the static approach, we need to capture the network requests the page makes.

  1. Open the page, right-click and choose Inspect, switch to the Network tab, and click through to the next page of results; a new request appears.
  2. Right-click that request, then Copy, then Copy as cURL (bash).
  3. Import it into Postman via Import, Raw text. After importing, click Send to try the request; if content comes back, the capture succeeded.
  4. Click Code, choose Python - Requests, and paste the generated code into PyCharm. Usually changing the page parameter is all it takes to fetch the dynamically loaded pages.
import requests

url = "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2669&k=&num=50&page=2&r=0.3982520273586394&callback=jQuery1112024604807702249287_1604838144359&_=1604838144361"

payload={}
headers = {
  'authority': 'feed.mix.sina.com.cn',
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
  'accept': '*/*',
  'sec-fetch-site': 'same-site',
  'sec-fetch-mode': 'no-cors',
  'sec-fetch-dest': 'script',
  'referer': 'https://news.sina.com.cn/roll/',
  'accept-language': 'zh-CN,zh;q=0.9',
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)
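The body that comes back is JSONP: the JSON payload is wrapped in the jQuery callback named in the URL. Instead of regex string surgery you can strip the wrapper and parse it as JSON. A minimal sketch, continuing from the `response` above and assuming the payload keeps the result -> data list of article entries (each with a url and title) that this endpoint returned at the time, which is worth verifying against the actual response:

import json

text = response.text
# keep only what sits between the first '(' and the last ')' to drop the jQuery...(...) wrapper
payload = text[text.find('(') + 1:text.rfind(')')]
data = json.loads(payload)
for item in data['result']['data']:   # assumed structure: result -> data -> list of articles
    print(item['url'], item['title'])

Parsing the JSON directly avoids the backslash-stripping and regex step used in solve() below.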
  5. The full code is as follows:
import re
from bs4 import BeautifulSoup
from urllib import request
import requests

# Fetch the qualifying article links from one page of the dynamic feed
def solve(page):
    url1 = "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2669&k=&num=50&page="
    url2 = "&r=0.7488014654950375&callback=jQuery1112025760955190502766_1604665024595&_=1604665024597"
    url = url1 + str(page) + url2
    payload = {}
    headers = {
      'authority': 'feed.mix.sina.com.cn',
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
      'accept': '*/*',
      'sec-fetch-site': 'same-site',
      'sec-fetch-mode': 'no-cors',
      'sec-fetch-dest': 'script',
      'referer': 'https://news.sina.com.cn/roll/',
      'accept-language': 'zh-CN,zh;q=0.9',
    }
    response = requests.request("GET", url, headers=headers, data=payload)
    response.encoding = "utf-8"
    # drop the escaping backslashes, squash the JSONP text into one string,
    # and then pull every "url":"..." value out with a regular expression
    l1 = str(response.text.replace("\\", "").split())
    res = re.findall(r'"url":"([^"]+)"', l1)
    return res

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup

# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

if __name__ == '__main__':
    y = 1
    # the feed starts returning errors after page 50, so stop there
    for page in range(1, 51):
        urls = solve(page)
        for each in urls:
            soup = requestOver(each)
            h1 = soup.find("h1", class_="main-title")
            if h1 is None or h1.string is None:  # skip pages without the expected title tag
                continue
            download(h1.string, each, y)
            y += 1

IV. A Side Note

Of course, you could also use Scrapy for static pages and Selenium for dynamic ones; both are well-known crawling frameworks and packages. A rough Selenium sketch follows.
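This is a minimal sketch only, not part of the code above; it assumes Chrome plus a matching chromedriver are installed, and reuses the same Sina rolling-news page:

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()                       # needs chromedriver on the PATH
driver.get("https://news.sina.com.cn/roll/")      # the real browser runs the page's JavaScript
soup = BeautifulSoup(driver.page_source, "lxml")  # parse the rendered HTML as usual
for a in soup.find_all("a", target="_blank"):
    print(a.get("href"))
driver.quit()

The trade-off is that driving a real browser is much slower than calling the JSON API directly.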

V. Statically Scraping China News Service (chinanews.com)

  • For the needs of my experiment, this crawls the finance ("财经") stories from the chinanews.com rolling-news pages
  • Pages that could not be parsed with "lxml" turned out to parse fine with "html.parser"
  • To crawl other chinanews.com channels, just adjust the relevant parameters
  • Because the flow is simple, no rigorous exception handling was added
# -*- coding: utf-8 -*-
import datetime

import requests
from bs4 import BeautifulSoup

# Fetch a page with requests and parse it with BeautifulSoup
def requestOver(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="left_zw")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\eco\\' + title + '.txt'
    with open(filename, 'w', encoding='utf-8', errors='ignore') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

# The actual crawl procedure
def crawlAll(url, y):
    soup = requestOver(url)
    for s in soup.findAll("div", class_="content_list"):
        for tag in s.findAll("li"):
            sp = tag.findAll("a")
            if "财经" in str(sp):  # keep only items whose channel link says "财经" (finance)
                title = sp[1].string
                # the second <a> in the item is the article link; its href is a relative
                # path ending in .shtml, so prepend the site root
                urlAll = "http://www.chinanews.com" + sp[1].get("href")
                try:
                    download(title, urlAll, y)
                except Exception:
                    print("Failed to crawl news item " + str(y))
                else:
                    y += 1
    return y

if __name__ == '__main__':
    y = 1
    url1 = "http://www.chinanews.com/scroll-news/"
    date = "2020/1112"
    url2 = "/news.shtml"
    # walk backwards one day at a time (3650 days, roughly ten years of rolling-news pages)
    for i in range(3650):
        date1 = datetime.datetime.strptime(date, "%Y/%m%d")
        date2 = datetime.timedelta(days=-1)
        date = (date1 + date2).strftime("%Y/%m%d")
        target_url = url1 + date + url2
        print(target_url)
        y = crawlAll(target_url, y)