一: requests模块的基本使用:

  • response.text存在编码问题,原因是requests底层会自己推导编码,然后进行解码,如果推导的和实际编码不一致就会出现编码问题,需要提前指定:response.encoding = "utf-8"
  • response.content获取的是二进制类型,需要自行转换:response.content.decode('gbk')
import requests

url = "https://www.baidu.com"
response = requests.get(url)

# requests guesses the page encoding from the headers; when the guess does
# not match the page's real encoding, response.text is garbled.  Pin it
# explicitly before using response.text.
response.encoding = "utf-8"

# Option 1: response.text — str, decoded with response.encoding
# print(response.text)

# Option 2: response.content — bytes; decode() defaults to utf-8
print(response.content.decode())

# Give open() an explicit encoding; otherwise Python uses the platform
# default, which can fail on non-ASCII content (e.g. on Windows).
with open("baidu.html", "w", encoding="utf-8") as f:
    f.write(response.text)

1:response响应的其他常用属性和方法:

import requests

response = requests.get("http://www.baidu.com")

# Commonly inspected attributes, response-side and request-side in pairs.
attributes = [
    response.url,                # 1: URL of the response
    response.request.url,        # 2: URL that was requested
    response.cookies,            # 3: cookies set by the server
    response.request._cookies,   # 4: cookies sent with the request
    response.headers,            # 5: response headers
    response.request.headers,    # 6: request headers
]
for attribute in attributes:
    print(attribute)

# 7: parse the JSON body into a dict (raises for non-JSON responses)
print(response.json())
http://www.baidu.com/
http://www.baidu.com/
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
<RequestsCookieJar[]>
{'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Wed, 18 Nov 2020 15:10:45 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:57 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
{'User-Agent': 'python-requests/2.25.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}

2:练习—抓取一张网络上的图片:

import requests

image_url = "https://img.alicdn.com/tfs/TB1MaLKRXXXXXaWXFXXXXXXXXXX-480-260.png"

# Images are binary data: keep the raw bytes from response.content and
# write them out without any decoding step.
image_bytes = requests.get(image_url).content

with open("图片1.png", "wb") as f:
    f.write(image_bytes)

3:携带请求头发送请求:

import requests

target_url = "https://www.baidu.com"

# Send a real browser User-Agent instead of the default requests one.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
}

page = requests.get(target_url, headers=request_headers)

# Write the raw bytes so no text-encoding handling is needed.
with open("百度.html", "wb") as f:
    f.write(page.content)

4:发送带参数请求:

"""
两种方案: 
第一种:直接在url地址中携带
第二种:构建请求参数字典
"""

url = "https://www.baidu.com/s?"

params_dict = {
    "wd": "python"
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36',
}

response = requests.get(url, headers=headers, params= params_dict)

print(response.content.decode())

with open("python.html", 'wb') as f:
    f.write(response.content)

二: requests模块的深入处理:

  • 发送post请求,requests.post(self.url, headers=self.headers, data=self.parames),注意里面是data。
  • 反向代理: 浏览器不知道目标服务器的ip,但是知道nginx的ip, 由nginx转发目标服务器。
  • 正向代理:为了伪装客户端的ip地址,先将请求发送给代理服务器,再由代理服务器转发给目标服务器,目标服务器很难识别客户端的地址信息。

1: 金山翻译爬虫:

import json
import requests

class JinshanSpider(object):
    """Translate a word through the iciba (Kingsoft) translation AJAX API.

    word: the text to translate; source/target languages are auto-detected.
    """

    def __init__(self, word=None):
        # BUGFIX: the original read ``self.word = word,`` — the trailing
        # comma silently turned the word into a one-element tuple.
        self.word = word
        self.url = "http://fy.iciba.com/ajax.php?a=fy"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'}
        # Form body: 'f'/'t' = from/to language (auto-detected), 'w' = word.
        self.parames = {
            'f': 'auto',
            't': 'auto',
            'w': self.word
        }

    def send_request(self):
        """POST the form data and return the decoded response body (str)."""
        response = requests.post(self.url, headers=self.headers, data=self.parames)
        return response.content.decode()

    def get_message(self, response):
        """Parse the JSON response string and return the translated text."""
        response_dict = json.loads(response)
        return response_dict['content']['out']

    def run(self):
        """Send the request, parse the response, return the translation."""
        response = self.send_request()
        return self.get_message(response)

if __name__ == '__main__':
    # Translate a sample word and show the result.
    spider = JinshanSpider('牛')
    print(spider.run())

2: 代理:

  • 透明代理
  • 匿名代理
  • 高匿代理
import requests

url = "https://www.baidu.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36',
}
# BUGFIX: the target URL is https, but the original dict only mapped the
# "http" scheme, so requests silently bypassed the proxy.  Each URL scheme
# needs its own entry in the proxies mapping.
proxies = {
    "http": "http://123.57.84.116:8118",
    "https": "http://123.57.84.116:8118",
}
response = requests.get(url, headers=headers, proxies=proxies)
print(response.content.decode())

3:cookie

案例:模拟码云登录:

import requests

profile_url = "https://gitee.com/ren_shan_wen"

# Approach 1: pass the cookie string directly inside the request headers.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36',
    'Cookie': '里面是cookie'
}

page = requests.get(profile_url, headers=request_headers)

with open("github.html", "wb") as f:
    f.write(page.content)
import requests

url = "https://gitee.com/ren_shan_wen"
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36',
}

cookie_str = '里面是cookie'

# Build a cookie dict from the "name=value; name=value" header string.
# BUGFIX: the original split each pair with split("=") and took index [1],
# which truncates any cookie value containing "=" (common for base64-style
# values).  partition("=") splits on the first "=" only and also tolerates
# a pair with no "=" instead of raising IndexError.
cookie_dict = {
    name: value
    for name, _, value in (pair.partition("=") for pair in cookie_str.split("; "))
}
response = requests.get(url, headers=headers, cookies=cookie_dict)

with open("github.html", "wb") as f:
    f.write(response.content)

上面代码有问题:cookie如果过期,那么需要手动更换。

思路:
1:首先,向登录界面,发送请求,获取登录页面中的token值。
2:携带上次请求的token值,向目标地址发送登录请求,此时GitHub后台向前端发送的cookie信息会被我们代码中的Session模块保存起来。
3:向个人中心页面发送请求,保存页面中心页面。(Session对象中的Cookie信息起作用了)

import requests
import re

"""
Flow:
1. GET https://github.com/login and read the response
2. extract the authenticity_token value with a regex
3. build the login form body
4. POST to https://github.com/session; on success GitHub sets session
   cookies, which the requests.Session object stores automatically
5. GET the profile page https://github.com/TmacChenQian and save it
"""

# 1. GET the login page.
start_url = "https://github.com/login"

# 1.1 A Session keeps cookies across requests — required for this flow.
session = requests.session()
# 1.2 Spoof a desktop browser User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
}
response = session.get(start_url, headers=headers)

# 1.3 Decode the body so the regex can run on a str.
response_str = response.content.decode()

# 2. Extract the CSRF token.  ``(.*?)`` is a non-greedy capture group and
# group(1) returns just the captured value.
# BUGFIX: the original chained .group(1) directly onto re.search(), which
# raises AttributeError on None if the page layout changes; fail with a
# clear error instead.
match = re.search(r'name="authenticity_token" value="(.*?)" />', response_str)
if match is None:
    raise RuntimeError("authenticity_token not found on the login page")
token = match.group(1)
print(token)

# 3. Build the login form body.
# SECURITY NOTE(review): real credentials are hard-coded here — move them
# to environment variables or a config file kept out of version control.
post_body = {
    "commit": "Sign in",
    "authenticity_token": token,
    "ga_id": "",
    "login": "279752917@qq.com",
    "password": "XIAOxiaozicq520",
    "webauthn-support": "supported",
    "webauthn-iuvpaa-support": "supported",
}

# 4. POST the credentials; on success the session cookies are stored on
# the session object and reused by later requests.
login_url = "https://github.com/session"
session.post(login_url, headers=headers, data=post_body)

# 5. GET the profile page — the saved session cookies authenticate us.
profile_url = "https://github.com/TmacChenQian"
profile_response = session.get(profile_url, headers=headers)

# 5.1 Save the page.  Write the raw bytes: text mode without an explicit
# encoding used the platform default and could raise UnicodeEncodeError.
with open("github2.html", "wb") as f:
    f.write(profile_response.content)

# <title>TmacChenQian (Ai1en)</title> in the saved file marks success.