I. Prerequisites

  • Handling a web page given its URL (only one approach is shown)
from bs4 import BeautifulSoup
from urllib import request

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
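For instance, calling it on the Sina front page and printing the page title (a quick usage sketch; any reachable URL works):

soup = requestOver("https://news.sina.com.cn/")
print(soup.title.string)  # prints the text of the page's <title> tag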
  • A helper that saves a page's title and body to a local .txt file
# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)
  • Some knowledge of regular expressions (strongly recommended; see the quick sketch below)
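As an illustration (a minimal sketch with a made-up sample string), the dynamic part of this post pulls article links out of an API response with re.findall():

import re

# made-up fragment shaped like the JSON returned by the news API used later
sample = '{"url":"https://news.sina.com.cn/c/2020-11-08/doc-xxxxxxx.shtml","title":"demo"}'
links = re.findall(r'"url":"([^"]+)"', sample)
print(links)  # ['https://news.sina.com.cn/c/2020-11-08/doc-xxxxxxx.shtml']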

II. Static Scraping

  1. Static scraping means locating the content you want in the page's HTML source, identifying its tags and attributes, and then extracting it with the crawler.
  2. The idea behind the code below, using Sina News as the example: collect every qualifying link on a page with findAll() and walk through them from the start. After entering a link, if the article content is present, download the title and body and keep looking for further qualifying URLs inside that page; otherwise the page does not qualify and we back out of it.
from bs4 import BeautifulSoup
from urllib import request

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup

# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

# The actual crawl procedure
def crawlAll(url, y):
    soup = requestOver(url)
    for tag in soup.findAll("a", target="_blank"):
        if tag.string is not None:  # the link has a text title
            if len(tag.string) > 8:  # the title is longer than 8 characters
                if ("https://news.sina.com.cn/" in tag.attrs["href"]) or ("http://news.sina.com.cn/" in tag.attrs["href"]):
                    alllist.append(tag.attrs["href"])
                    if tag.attrs["href"] not in collection:
                        collection.add(tag.attrs["href"])
                        try:
                            print(tag.attrs['href'])
                            download(tag.string, tag.attrs['href'], y)
                            y += 1
                        except Exception:
                            print("Failed to crawl news item " + str(y))
                        else:
                            y = crawlAll(tag.attrs['href'], y)  # recurse and keep the counter in sync
    return y

if __name__ == '__main__':
    y = 1
    collection = set()  # used to de-duplicate links
    # seed list of pages to crawl; crawlAll() appends every qualifying link it finds
    alllist = ["https://news.sina.com.cn/"]
    for n in alllist:
        target_url = n
        y = crawlAll(target_url, y)

III. Dynamic Scraping

Scraping dynamic pages is more involved. This section introduces one way to scrape dynamically loaded content; you will need Postman.
The page used in this example is Sina's rolling news page (https://news.sina.com.cn/roll/, the referer seen in the captured request below).
Because the content cannot be scraped with the static approach, we need to capture the network requests the page makes.

  1. Open the page, right-click and choose Inspect, switch to the Network tab, and click through to the next page of results; a new request appears.
  2. Right-click that request, then Copy, then Copy as cURL (bash).
  3. Import it into Postman via Import, Raw text. After importing, click Send to try the request; if content comes back, the capture succeeded.
  4. Click Code, choose Python - Requests, and paste the generated code into PyCharm. Usually changing the page parameter is all it takes to fetch the dynamically loaded pages.
import requests

url = "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2669&k=&num=50&page=2&r=0.3982520273586394&callback=jQuery1112024604807702249287_1604838144359&_=1604838144361"

payload={}
headers = {
  'authority': 'feed.mix.sina.com.cn',
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
  'accept': '*/*',
  'sec-fetch-site': 'same-site',
  'sec-fetch-mode': 'no-cors',
  'sec-fetch-dest': 'script',
  'referer': 'https://news.sina.com.cn/roll/',
  'accept-language': 'zh-CN,zh;q=0.9',
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)
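The body that comes back is JSONP: the JSON payload is wrapped in the jQuery callback named in the URL. Instead of regex string surgery you can strip the wrapper and parse it as JSON. A minimal sketch, continuing from the `response` above and assuming the payload keeps the result -> data list of article entries (each with a url and title) that this endpoint returned at the time, which is worth verifying against the actual response:

import json

text = response.text
# keep only what sits between the first '(' and the last ')' to drop the jQuery...(...) wrapper
payload = text[text.find('(') + 1:text.rfind(')')]
data = json.loads(payload)
for item in data['result']['data']:   # assumed structure: result -> data -> list of articles
    print(item['url'], item['title'])

Parsing the JSON directly avoids the backslash-stripping and regex step used in solve() below.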
  5. The full code is as follows:
import re
from bs4 import BeautifulSoup
from urllib import request
import requests

# Fetch the qualifying article links from one page of the dynamic feed
def solve(page):
    url1 = "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2669&k=&num=50&page="
    url2 = "&r=0.7488014654950375&callback=jQuery1112025760955190502766_1604665024595&_=1604665024597"
    url = url1 + str(page) + url2
    payload = {}
    headers = {
      'authority': 'feed.mix.sina.com.cn',
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
      'accept': '*/*',
      'sec-fetch-site': 'same-site',
      'sec-fetch-mode': 'no-cors',
      'sec-fetch-dest': 'script',
      'referer': 'https://news.sina.com.cn/roll/',
      'accept-language': 'zh-CN,zh;q=0.9',
    }
    response = requests.request("GET", url, headers=headers, data=payload)
    response.encoding = "utf-8"
    # drop the escaping backslashes, squash the JSONP text into one string,
    # and then pull every "url":"..." value out with a regular expression
    l1 = str(response.text.replace("\\", "").split())
    res = re.findall(r'"url":"([^"]+)"', l1)
    return res

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup

# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

if __name__ == '__main__':
    y = 1
    # the feed starts returning errors after page 50, so stop there
    for page in range(1, 51):
        urls = solve(page)
        for each in urls:
            soup = requestOver(each)
            h1 = soup.find("h1", class_="main-title")
            if h1 is None or h1.string is None:  # skip pages without the expected title tag
                continue
            download(h1.string, each, y)
            y += 1

IV. A Side Note

Of course, you could also use Scrapy for static pages and Selenium for dynamic ones; both are well-known crawling frameworks and packages. A rough Selenium sketch follows.
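This is a minimal sketch only, not part of the code above; it assumes Chrome plus a matching chromedriver are installed, and reuses the same Sina rolling-news page:

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()                       # needs chromedriver on the PATH
driver.get("https://news.sina.com.cn/roll/")      # the real browser runs the page's JavaScript
soup = BeautifulSoup(driver.page_source, "lxml")  # parse the rendered HTML as usual
for a in soup.find_all("a", target="_blank"):
    print(a.get("href"))
driver.quit()

The trade-off is that driving a real browser is much slower than calling the JSON API directly.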

V. Statically Scraping China News Service (chinanews.com)

  • For the needs of my experiment, this crawls the finance ("财经") stories from the chinanews.com rolling-news pages
  • Pages that could not be parsed with "lxml" turned out to parse fine with "html.parser"
  • To crawl other chinanews.com channels, just adjust the relevant parameters
  • Because the flow is simple, no rigorous exception handling was added
# -*- coding: utf-8 -*-
import datetime

import requests
from bs4 import BeautifulSoup

# Fetch a page with requests and parse it with BeautifulSoup
def requestOver(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

# Download the title and body of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="left_zw")
    if tag is None:
        return 0
    # strip characters that are not allowed in Windows file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # collect the paragraph text (the file below is written from tag.get_text(),
    # so `content` is only assembled for reference)
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\eco\\' + title + '.txt'
    with open(filename, 'w', encoding='utf-8', errors='ignore') as file_object:
        file_object.write('           ')  # indent the title a little
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)

# The actual crawl procedure
def crawlAll(url, y):
    soup = requestOver(url)
    for s in soup.findAll("div", class_="content_list"):
        for tag in s.findAll("li"):
            sp = tag.findAll("a")
            if "财经" in str(sp):  # keep only items whose channel link says "财经" (finance)
                title = sp[1].string
                # the second <a> in the item is the article link; its href is a relative
                # path ending in .shtml, so prepend the site root
                urlAll = "http://www.chinanews.com" + sp[1].get("href")
                try:
                    download(title, urlAll, y)
                except Exception:
                    print("Failed to crawl news item " + str(y))
                else:
                    y += 1
    return y

if __name__ == '__main__':
    y = 1
    url1 = "http://www.chinanews.com/scroll-news/"
    date = "2020/1112"
    url2 = "/news.shtml"
    # walk backwards one day at a time (3650 days, roughly ten years of rolling-news pages)
    for i in range(3650):
        date1 = datetime.datetime.strptime(date, "%Y/%m%d")
        date2 = datetime.timedelta(days=-1)
        date = (date1 + date2).strftime("%Y/%m%d")
        target_url = url1 + date + url2
        print(target_url)
        y = crawlAll(target_url, y)