文章目录
- 问题
- 解决过程
- 1.查找json请求
- 2.分析发布时间和标题
- 3.分析正文页的json请求
- 4.获取正文数据
- 答案
- 相关问题补充
# 动态网页
当用户请求的是一个动态网页时,服务器要做更多的工作才能把用户请求的信息发送回去,服务器一般按照以下步骤进行工作:
1、服务器端接受请求。
2、Web服务器从服务器硬盘指定的位置或内存中读取动态网页文件。
3、执行网页文件的程序代码,将含有程序代码的动态网页转化为标准的静态页面(如HTML)。
4、Web服务器将生成的静态页面代码发送给请求浏览器。
动态网页对应的网页实体是在执行程序过程中动态生成的,页面内容和html源码不一样,所以用requests只能拿到渲染前的html源码,无法获得动态生成的页面内容。
要实现动态网页的爬取有两种方法:
1.根据采集的数据,手动分析异步请求的规律
2.selenium模拟渲染过程
首先介绍第一种方法
问题
爬取网页每一条新闻的标题、发布时间、正文内容
解决过程
- 查找列表页json请求(使用network查询面板)
- 用json库和关键字分析出标题、发布时间
- 分析正文页json请求(使用network查询面板)
- 分析出每个正文页的json请求地址和正文内容,并请求对应的正文数据
1.查找json请求
2.分析发布时间和标题
def list_page():
    """Fetch the CBIRC news list page (itemId=915, page 1, 18 rows).

    Returns:
        str: the raw JSON response text; the caller parses it with json.loads.
    """
    headers = {
        'Accept': '*/*',
        'Referer': 'http://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=914&itemId=915&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%8A%A8%E6%80%81',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    }
    # timeout added: requests has no default timeout, so a stalled server
    # would otherwise hang the crawler forever. verify=False kept from the
    # original (the endpoint is plain http here anyway).
    response = requests.get(
        'http://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=915,pageIndex=1,pageSize=18.json',
        headers=headers,
        verify=False,
        timeout=10,
    )
    return response.text
# Parse the list-page JSON string and pull docId / title / publish date per row.
res = json.loads(list_page())
sq = res["data"]["rows"]
lens = len(sq)
print(sq[0]["docId"])
count = 1
for row in sq:
    dic = {}
    print(row["docId"])
    dic["doctitle"] = row["docSubtitle"]
    # publishDate arrives as "YYYY-MM-DD HH:MM:SS" text; convert to datetime.
    dic["docdate"] = datetime.datetime.strptime(row["publishDate"], "%Y-%m-%d %H:%M:%S")
可以用postman快速获取python代码,见方法
3.分析正文页的json请求
值得一提的是,用正文内容查找url时查不到,可能中文被加密了,也可能这些中文不是连续存储的;遇到正文页链接和列表页id关联性不强时,可能是找错了请求链接,当然也存在反爬虫的可能。
最后还是查询了首页日期才找到全文的json数据,这里的正文内容也以html源代码的形式保存,可以保存在一个html网页里分析解析方法
# Dump the article-body HTML to a file for offline inspection of its structure.
# encoding is given explicitly: the payload is Chinese HTML, and the platform
# default codec (e.g. gbk on Windows) could raise UnicodeEncodeError.
with open('1.html', 'w+', encoding='utf-8') as f:
    f.write(response.json()['data']['docClob'])
4.获取正文数据
import requests
from scrapy import Selector
def detail_page(id: int):
    """Fetch one article's JSON by docId and extract its plain-text body.

    The body is embedded HTML; the text of every <p><span> node found in
    the response is concatenated into one string.

    Args:
        id: the docId taken from the list page.

    Returns:
        tuple[str, str]: (concatenated body text, request URL).
    """
    url = "http://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectByDocId/data_docId={}.json".format(id)
    headers = {
        'Accept': '*/*',
        'Referer': 'http://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1016432&itemId=915&generaltype=0',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    # timeout prevents an unresponsive server from blocking the crawl;
    # the empty `data` payload of the original GET was dropped (unused).
    response = requests.get(url, headers=headers, timeout=10)
    selector = Selector(text=response.text)
    paragraphs = []
    for span in selector.xpath("//p/span"):
        # renamed from `str` so the builtin is not shadowed
        text = span.xpath("./text()").get()
        if text is not None:
            paragraphs.append(text)
    return ''.join(paragraphs), url
答案
import requests
import json
import datetime
from scrapy import Selector
import pymysql
from scrapy import Selector
from pymysql.cursors import DictCursor
def mysql_conn(host='localhost', user='root', passwd='123454321',
               database='practice', port=3306):
    """Open a MySQL connection plus a DictCursor.

    Connection settings default to the original hard-coded development
    values but can now be overridden per call (backward compatible).

    Returns:
        tuple: (connection, cursor) — caller is responsible for closing both.
    """
    # NOTE(review): credentials embedded in source are a security smell —
    # move them to environment variables or a config file outside of
    # local development.
    _conn = pymysql.connect(
        host=host,
        user=user,
        passwd=passwd,
        database=database,
        port=port,
        charset='utf8mb4'
    )
    _cur = _conn.cursor(DictCursor)
    return _conn, _cur
class MysqlORM(object):
    """Minimal helper wrapping INSERT/UPDATE statements on a pymysql cursor.

    Values are always passed through pymysql's %(name)s parameter binding,
    but table and column NAMES are interpolated into the SQL text directly
    (SQL cannot parameterize identifiers) — only use trusted table names
    and dict keys.
    """

    def __init__(self, conn, cur):
        self.conn = conn  # open connection, used for commit()
        self.cur = cur    # cursor created from that connection

    def insert_one(self, table: str, data: dict):
        """Insert ``data`` (column -> value) into ``table``.

        Uses INSERT IGNORE, so duplicate-key rows are skipped silently.

        Returns:
            the cursor's lastrowid for the inserted row.
        """
        names = ','.join(data.keys())
        placeholders = ','.join('%({})s'.format(k) for k in data.keys())
        sql = f'insert ignore into {table}({names}) values({placeholders})'
        self.cur.execute(sql, data)
        self.conn.commit()
        rowid = self.cur.lastrowid
        print(f'{table} 插入一条数据 {rowid}')
        return rowid

    def update_one(self, table: str, data: dict, fixed: list):
        """Update rows of ``table`` matched by the ``fixed`` key columns.

        ``data`` holds all column values; keys listed in ``fixed`` form the
        WHERE clause, every other key becomes part of the SET clause.
        """
        # typo fixed: was `fileds`
        fields = [f'{name}=%({name})s' for name in data.keys() if name not in fixed]
        where_phrase = [f'{name}=%({name})s' for name in fixed]
        where = ' and '.join(where_phrase)
        update_sql = f'update {table} set {",".join(fields)} where {where}'
        self.cur.execute(update_sql, data)
        self.conn.commit()
        # original message repeated the table name twice; trimmed
        print(f'{table} 更新一条数据成功')
##获取列表页面
def list_page():
    """Fetch the CBIRC news list page (itemId=915, page 1, 18 rows).

    Returns:
        str: the raw JSON response text; the caller parses it with json.loads.
    """
    headers = {
        'Accept': '*/*',
        'Referer': 'http://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=914&itemId=915&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%8A%A8%E6%80%81',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    }
    # timeout added: requests has no default timeout, so a stalled server
    # would otherwise hang the crawler forever. verify=False kept from the
    # original (the endpoint is plain http here anyway).
    response = requests.get(
        'http://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=915,pageIndex=1,pageSize=18.json',
        headers=headers,
        verify=False,
        timeout=10,
    )
    return response.text
#此处返回的是字符串
#获取正文页面
def detail_page(id: int):
    """Fetch one article's JSON by docId and extract its plain-text body.

    The body is embedded HTML; the text of every <p><span> node found in
    the response is concatenated into one string.

    Args:
        id: the docId taken from the list page.

    Returns:
        tuple[str, str]: (concatenated body text, request URL).
    """
    url = "http://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectByDocId/data_docId={}.json".format(id)
    headers = {
        'Accept': '*/*',
        'Referer': 'http://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1016432&itemId=915&generaltype=0',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    # timeout prevents an unresponsive server from blocking the crawl;
    # the empty `data` payload of the original GET was dropped (unused).
    response = requests.get(url, headers=headers, timeout=10)
    selector = Selector(text=response.text)
    paragraphs = []
    for span in selector.xpath("//p/span"):
        # renamed from `str` so the builtin is not shadowed
        text = span.xpath("./text()").get()
        if text is not None:
            paragraphs.append(text)
    return ''.join(paragraphs), url
if __name__=="__main__":
conn, cur = mysql_conn()
mysql_client = MysqlORM(conn, cur)
res=list_page()
# 加载为json数据才能用关键字索引
res = json.loads(res)
print(res["data"]["rows"][0]["docId"])
lens=len(res["data"]["rows"])
sq=res["data"]["rows"]
count=1
for i in range(lens):
count = count + 1
dic={}
print(sq[i]["docId"])
dic["doctitle"] = sq[i]["docSubtitle"]
dic["docdate"] = datetime.datetime.strptime(sq[i]["publishDate"],"%Y-%m-%d %H:%M:%S")
dic["collectdate"]=datetime.datetime.now()
dic["content"],dic['url'] = detail_page(sq[i]["docId"])
mysql_client.insert_one('govern', dic)
print(count)
print(dic)
相关问题补充
1.从str获取datetime
# Demo: converting between str and datetime.
import datetime

# Parse an English-style date string into a datetime object.
tim = datetime.datetime.strptime("22 October 2021", "%d %B %Y")
# Current timestamp.
tod = datetime.datetime.now()
# Format the parsed datetime back out as "YYYY-MM-DD".
str1 = tim.strftime("%Y-%m-%d")
- 数据存储在数据库
import datetime
import requests
import pymysql
from scrapy import Selector
from pymysql.cursors import DictCursor
def mysql_conn(host='localhost', user='root', passwd='123454321',
               database='practice', port=3306):
    """Open a MySQL connection plus a DictCursor.

    Connection settings default to the original hard-coded development
    values but can now be overridden per call (backward compatible).

    Returns:
        tuple: (connection, cursor) — caller is responsible for closing both.
    """
    # NOTE(review): credentials embedded in source are a security smell —
    # move them to environment variables or a config file outside of
    # local development.
    _conn = pymysql.connect(
        host=host,
        user=user,
        passwd=passwd,
        database=database,
        port=port,
        charset='utf8mb4'
    )
    _cur = _conn.cursor(DictCursor)
    return _conn, _cur
class MysqlORM(object):
    """Minimal helper wrapping INSERT/UPDATE statements on a pymysql cursor.

    Values are always passed through pymysql's %(name)s parameter binding,
    but table and column NAMES are interpolated into the SQL text directly
    (SQL cannot parameterize identifiers) — only use trusted table names
    and dict keys.
    """

    def __init__(self, conn, cur):
        self.conn = conn  # open connection, used for commit()
        self.cur = cur    # cursor created from that connection

    def insert_one(self, table: str, data: dict):
        """Insert ``data`` (column -> value) into ``table``.

        Uses INSERT IGNORE, so duplicate-key rows are skipped silently.

        Returns:
            the cursor's lastrowid for the inserted row.
        """
        names = ','.join(data.keys())
        placeholders = ','.join('%({})s'.format(k) for k in data.keys())
        sql = f'insert ignore into {table}({names}) values({placeholders})'
        self.cur.execute(sql, data)
        self.conn.commit()
        rowid = self.cur.lastrowid
        print(f'{table} 插入一条数据 {rowid}')
        return rowid

    def update_one(self, table: str, data: dict, fixed: list):
        """Update rows of ``table`` matched by the ``fixed`` key columns.

        ``data`` holds all column values; keys listed in ``fixed`` form the
        WHERE clause, every other key becomes part of the SET clause.
        """
        # typo fixed: was `fileds`
        fields = [f'{name}=%({name})s' for name in data.keys() if name not in fixed]
        where_phrase = [f'{name}=%({name})s' for name in fixed]
        where = ' and '.join(where_phrase)
        update_sql = f'update {table} set {",".join(fields)} where {where}'
        self.cur.execute(update_sql, data)
        self.conn.commit()
        # original message repeated the table name twice; trimmed
        print(f'{table} 更新一条数据成功')
# Usage example: insert one record into the `disease` table.
if __name__=='__main__':
    conn, cur = mysql_conn()
    mysql_client = MysqlORM(conn, cur)
    # dis_data is a dict mapping column name -> value; 'disease' is the
    # target table name in the database.
    # NOTE(review): dis_data is not defined in this snippet — it must be
    # built before this line runs.
    mysql_client.insert_one('disease', dis_data)
此文引用了刘祥龙
老师的有道云笔记和代码