1. Prerequisites
- Fetching and parsing a page from its URL (only one approach is shown here; an alternative sketch follows the snippet)
# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
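The same fetch can also be done with the requests library, which is how section 6 below fetches pages (with html.parser as the parser there); a drop-in sketch, assuming requests and lxml are installed:

import requests
from bs4 import BeautifulSoup

def requestOver(url):
    # fetch the page, let requests decode it, and hand the text to BeautifulSoup
    response = requests.get(url)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')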
- A helper that saves a page's title and body text to a local .txt file
# Save the title and body text of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that Windows does not allow in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # content collects the <p> text; note that what is actually written below is tag.get_text()
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('　　')  # full-width spaces as a leading indent
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)
- Basic regular-expression knowledge (good to have); a short example follows.
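For instance, the dynamic-scraping code in section 3 pulls every article URL out of a JSON-like response with a single pattern; a minimal sketch of that kind of extraction (the sample string is made up for illustration):

import re

sample = '{"url":"https://news.sina.com.cn/s/example1.shtml"},{"url":"https://news.sina.com.cn/s/example2.shtml"}'
# findall returns every capture-group match: here, everything between "url":" and the closing quote
urls = re.findall(r'"url":"([^"]+)"', sample)
print(urls)  # ['https://news.sina.com.cn/s/example1.shtml', 'https://news.sina.com.cn/s/example2.shtml']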
2. Static scraping
- Static scraping means locating the content you want in the page's HTML source, identifying its tags and attributes, and then extracting it with the crawler.
- The idea of the code below: using Sina News as the example, findAll() collects every qualifying link on a page and the links are traversed from the start. After entering a page, if the article content exists, its title and body are downloaded and that page is searched again for further qualifying URLs; otherwise the page does not qualify and is skipped.
from bs4 import BeautifulSoup
from urllib import request

# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
# Save the title and body text of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that Windows does not allow in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # content collects the <p> text; what is actually written below is tag.get_text()
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('　　')  # full-width spaces as a leading indent
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)
# The crawl itself
def crawlAll(url, y):
    soup = requestOver(url)
    for tag in soup.findAll("a", target="_blank"):
        if tag.string is not None:          # title is not empty
            if len(tag.string) > 8:         # title longer than 8 characters
                if (("https://news.sina.com.cn/" in tag.attrs["href"]) or ("http://news.sina.com.cn/" in tag.attrs["href"])):
                    alllist.append(tag.attrs["href"])
                    if tag.attrs["href"] not in collection:
                        collection.add(tag.attrs["href"])
                        try:
                            print(tag.attrs['href'])
                            download(tag.string, tag.attrs['href'], y)
                            y += 1
                        except Exception:
                            print("Failed to crawl news item " + str(y))
                else:
                    # not a news.sina.com.cn article link: search that page for qualifying links instead
                    crawlAll(tag.attrs['href'], y)
    return y
if __name__ == '__main__':
    y = 1
    collection = set()                      # used to deduplicate links
    # pages to crawl; crawlAll() appends newly found links to this list while the loop below is running
    alllist = ["https://news.sina.com.cn/"]
    for n in alllist:
        target_url = n
        y = crawlAll(target_url, y)

3. Dynamic scraping
Scraping dynamic pages is more involved; this section introduces one way of scraping dynamically loaded content. It requires Postman.
The page used in this experiment is Sina's rolling news page: https://news.sina.com.cn/roll/
Since it cannot be scraped with the static approach, the page's network traffic has to be captured.
- Open the page, right-click → Inspect, switch to the Network tab, then click to the next page of results; a request shows up in the list.
- Right-click that request → Copy → Copy as cURL (bash).
- Paste it into Postman via Import → Raw text; after importing, click Send to try the request. If content comes back, the capture worked.
- Click Code → Python - Requests to copy the generated code into PyCharm; usually changing the page parameter is all that is needed to fetch the dynamically loaded pages.
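One thing worth knowing about this endpoint: because of the callback parameter in the URL, it returns JSONP, i.e. JSON wrapped in a jQuery...(...) call, which is why the complete code further below falls back to a regular expression instead of json.loads. A minimal sketch of unwrapping it into plain JSON, assuming the whole response body is a single callback(...) expression:

import json
import re

def jsonp_to_json(text):
    # strip the callback wrapper, e.g. jQuery123_456({...});  ->  {...}
    match = re.search(r'\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1) if match else text)

Depending on the API, simply dropping the callback parameter from the URL may also return plain JSON. Either way, the request exported from Postman in the last step looks like this: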
import requests

url = "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2669&k=&num=50&page=2&r=0.3982520273586394&callback=jQuery1112024604807702249287_1604838144359&_=1604838144361"
payload = {}
headers = {
    'authority': 'feed.mix.sina.com.cn',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    'accept': '*/*',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-dest': 'script',
    'referer': 'https://news.sina.com.cn/roll/',
    'accept-language': 'zh-CN,zh;q=0.9',
}

response = requests.request("GET", url, headers=headers, data=payload)
print(response.text)

- The complete code is as follows:
import re
from bs4 import BeautifulSoup
from urllib import request
import requests

# Fetch one page of the dynamically loaded list and return the qualifying article URLs
def solve(page):
    url1 = "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2669&k=&num=50&page="
    url2 = "&r=0.7488014654950375&callback=jQuery1112025760955190502766_1604665024595&_=1604665024597"
    url = url1 + str(page) + url2
    payload = {}
    headers = {
        'authority': 'feed.mix.sina.com.cn',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'accept': '*/*',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-dest': 'script',
        'referer': 'https://news.sina.com.cn/roll/',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    response = requests.request("GET", url, headers=headers, data=payload)
    response.encoding = "utf-8"
    # remove the escaping backslashes, then pull every "url":"..." value out of the raw text
    l1 = str(response.text.replace("\\", "").split())
    res = re.findall(r'"url":"([^"]+)"', l1)
    return res
# Fetch a page with urllib.request and parse it with BeautifulSoup
def requestOver(url):
    req = request.Request(url)
    response = request.urlopen(req)
    soup = BeautifulSoup(response, 'lxml')
    return soup
# Save the title and body text of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="article")
    if tag is None:
        return 0
    # strip characters that Windows does not allow in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # content collects the <p> text; what is actually written below is tag.get_text()
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\society\\' + title + '.txt'
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write('　　')  # full-width spaces as a leading indent
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)
if __name__ == '__main__':
    y = 1
    # the listing breaks beyond roughly 50 pages
    for page in range(50):
        url = solve(page)
        for each in url:
            soup = requestOver(each)
            download(soup.find("h1", class_="main-title").string, each, y)
            y += 1

5. Aside
Of course, you can also use Scrapy for static pages and Selenium for dynamic ones; these are dedicated crawling frameworks and libraries. A minimal Selenium sketch follows.
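With Selenium, a dynamically rendered page is handled by letting a real browser execute the JavaScript first and then reading the resulting DOM; a minimal sketch, assuming Selenium 4 and a matching ChromeDriver on the PATH (the CSS selector simply mirrors the <a target="_blank"> filter used in the static code above):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()                    # needs ChromeDriver available on PATH
driver.get("https://news.sina.com.cn/roll/")   # the rolling-news page from section 3
links = [a.get_attribute("href")
         for a in driver.find_elements(By.CSS_SELECTOR, 'a[target="_blank"]')]
print(len(links))
driver.quit()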
6. Static scraping of chinanews.com (中国新闻网)
- For the needs of an experiment, this part crawls the finance (财经) stories from the rolling news of chinanews.com.
- One observation: pages that could not be parsed with "lxml" could be parsed with "html.parser".
- To crawl other channels of chinanews.com, only the relevant parameters need to be changed (see the sketch right after this list).
- Since the flow is simple, no rigorous exception handling was added.
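Concretely, only two things in the code below are channel-specific: the channel label that each link is matched against and the folder the articles are saved to. A small sketch of pulling them out as constants (the "体育" label and the folder name are illustrative assumptions, not values from the original code):

# hypothetical channel settings; the values are illustrative assumptions
CHANNEL_KEYWORD = "体育"                                    # crawlAll() would test: if CHANNEL_KEYWORD in str(sp)
SAVE_DIR = r'E:\code\python\spider_news\chinanews\sports'   # download() would build: SAVE_DIR + '\\' + title + '.txt'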
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from urllib import request
import datetime

# Fetch a page with requests and parse it with BeautifulSoup (html.parser)
def requestOver(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
# Save the title and body text of an article page into a .txt file
def download(title, url, y):
    soup = requestOver(url)
    tag = soup.find('div', class_="left_zw")
    if tag is None:
        return 0
    # strip characters that Windows does not allow in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    # content collects the <p> text; what is actually written below is tag.get_text()
    content = ""
    for p in tag.findAll('p'):
        if p.string is not None:
            content = content + p.string
    filename = r'E:\code\python\spider_news\sina_news\eco\\' + title + '.txt'
    with open(filename, 'w', encoding='utf-8', errors='ignore') as file_object:
        file_object.write('　　')  # full-width spaces as a leading indent
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling news item', y, ':', title)
# The crawl itself
def crawlAll(url, y):
    soup = requestOver(url)
    for s in soup.findAll("div", class_="content_list"):
        for tag in s.findAll("li"):
            sp = tag.findAll("a")
            if "财经" in str(sp):  # keep only links whose channel label is 财经 (finance)
                title = list(sp)[1].string
                # slice the article path out of the second <a> tag's HTML and rebuild the absolute URL
                urlAll = "http://www.chinanews.com" + str(list(sp)[1])[9:str(list(sp)[1]).find("shtml") + 5]
                try:
                    download(title, urlAll, y)
                except Exception:
                    print("Failed to crawl news item " + str(y))
                else:
                    y += 1
    return y
if __name__ == '__main__':
    y = 1
    url1 = "http://www.chinanews.com/scroll-news/"
    date = "2020/1112"
    url2 = "/news.shtml"
    # walk backwards one day at a time through roughly ten years of scroll pages
    for i in range(3650):
        date1 = datetime.datetime.strptime(date, "%Y/%m%d")
        date2 = datetime.timedelta(days=-1)
        date = (date1 + date2).strftime("%Y/%m%d")
        target_url = url1 + date + url2
        print(target_url)
        y = crawlAll(target_url, y)