今日成果:爬取百度贴吧

原创

云҉淡҉风҉轻҉ 2021-07-21 17:32:46 ©著作权

文章标签 分享 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者云҉淡҉风҉轻҉的原创作品，请联系作者获取转载授权，否则将追究法律责任

'''
第一页
https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0
# 第二页
https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50
# 第三页
https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100
'''

from urllib.parse import urlencode
# 导入解析模块
from urllib.request import Request,urlopen
# Request 请求 , urlopen 打开
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
    }
    request = Request(url,headers=headers)
    response = urlopen(request)
    # print(response.read().decode("gbk","ignore"))
    # 返回的是二进制数据
    return response.read()

def save_html(filename,html_bytes):
    with open(filename,'wb') as f:
        # 使用 wb 进行存储数据
        f.write(html_bytes)

def main():
    base_url = 'https://tieba.baidu.com/f?ie=utf-8&{}'
    # 贴吧 url
    content = input("请输入要进行查询的内容:")
    # 要进行查询的内容
    num = int(input("请输入要下载的页数:"))
    for pn in range(num):
        print("正在下载第{}页".format(pn + 1))
        args = {
            'kw':content,
            # 内容
            'pn':pn * 50
            # 页码
        }
        args = urlencode(args)
        # 进行转码
        html_bytes = get_html(base_url.format(args))
        # 传递拼接后的 base_url 给 get_html 函数
        filename = "第" + str(pn+1) + "页.html"
        # 下载到本地的文件名称
        save_html(filename,html_bytes)

if __name__ == '__main__':
    main()

urlopen方法


#coding=gbk
from urllib.request import urlopen
# urlopen 打开网页使用
url = 'https://www.baidu.com/'
# 要进行访问的 URL
response = urlopen(url)
# 发送请求
print(response.getcode())
# 获取 HTTP 响应码  200
print(response.geturl())
# 获取访问的网址信息 https://www.baidu.com/
print(response.info())
# 获取服务器响应的HTTP请求头
info = response.read()
# 读取内容
print(info.decode())
# 打印内容

Request 方法


request = Request(url = url,headers = headers)
# 带着 ua 去请求信息
print(request.get_header("User-agent"))
# 获取请求头信息
response = urlopen(request)
info = response.read()

安装 fake_useragent 第三方库
查看 headers 信息

from fake_useragent import UserAgent
ua = UserAgent()
print(ua.chrome)
# 打印谷歌浏览器
# print(dir(ua))
print(ua.ie)
# 打印 IE 浏览器
print(ua.firefox)
# 打印火狐浏览器

使用 get 请求


from urllib.request import urlopen,Request
from urllib.parse import quote
# urlopen 打开网页使用
# Request 请求对象,带有 ua
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
url = 'https://www.baidu.com/s?wd={}'.format(quote('瀚阳的小驿站'))
# 要进行访问的 URL , 使用 quote 进行转码
# 使用 urllib.parse 中的 urlencode 和 quote 方法 进行转码
# quote('瀚阳的小驿站') 进行转码 '%E7%80%9A%E9%98%B3%E7%9A%84%E5%B0%8F%E9%A9%BF%E7%AB%99'
request = Request(url,headers = headers)
# 进行请求
response = urlopen(request)
print(response.read().decode())

from urllib.request import urlopen,Request
from urllib.parse import urlencode
from fake_useragent import UserAgent
# urlopen 打开网页使用
# Request 请求对象,带有 ua
# 使用 urlencode 对字典元素进行转换编码
args = {
    'wd':"瀚阳的小驿站",
    "ie":"utf-8"
}

headers = {
'User-Agent': UserAgent().random
}

url = 'https://www.baidu.com/s?wd={}'.format(urlencode(args))

print(url)
request = Request(url , headers = headers)
response = urlopen(request)
info = response.read()
print(info.decode())