urllib 是 Python 内置的 HTTP 请求库,包含多个用于处理 URL 和进行网络请求的模块。

主要子模块

1. urllib.request - 打开和读取 URL

import urllib.request
import urllib.parse  # fix: urlencode below needs this; it was missing in the original

# Basic GET request: urlopen returns an http.client.HTTPResponse.
response = urllib.request.urlopen('https://httpbin.org/get')
print(response.read().decode('utf-8'))

# GET request with query parameters: urlencode builds "key1=value1&key2=value2".
params = urllib.parse.urlencode({'key1': 'value1', 'key2': 'value2'})
url = f'https://httpbin.org/get?{params}'
response = urllib.request.urlopen(url)

2. urllib.parse - 解析 URL

import urllib.parse

# Split a URL into its structural components.
url = 'https://www.example.com/path/to/page?name=value&key=value#fragment'
parsed = urllib.parse.urlparse(url)
for label, component in [('Scheme', parsed.scheme),
                         ('Netloc', parsed.netloc),
                         ('Path', parsed.path),
                         ('Query', parsed.query)]:
    print(f"{label}: {component}")

# Percent-encode and decode arbitrary text for safe use inside a URL.
encoded = urllib.parse.quote('hello world!')
print(encoded)  # hello%20world%21

decoded = urllib.parse.unquote('hello%20world%21')
print(decoded)  # hello world!

# Encode a mapping as an application/x-www-form-urlencoded query string.
params = urllib.parse.urlencode({'q': 'python tutorial', 'page': 1})
print(params)  # q=python+tutorial&page=1

3. urllib.error - 异常处理

import urllib.request
import urllib.error

# HTTPError is a subclass of URLError, so it must be caught first;
# it carries the HTTP status code, while URLError covers lower-level failures.
try:
    response = urllib.request.urlopen('https://httpbin.org/status/404')
except urllib.error.HTTPError as err:
    print(f'HTTP Error: {err.code} - {err.reason}')
except urllib.error.URLError as err:
    print(f'URL Error: {err.reason}')

4. urllib.robotparser - 解析 robots.txt

import urllib.robotparser

# The robots.txt URL can be passed straight to the constructor
# (equivalent to calling set_url afterwards); read() fetches and parses it.
rp = urllib.robotparser.RobotFileParser('https://www.example.com/robots.txt')
rp.read()

# Ask whether the 'MyBot' user agent may crawl the given URL.
can_fetch = rp.can_fetch('MyBot', 'https://www.example.com/admin')
print(f"Can fetch: {can_fetch}")

实用示例

1. 发送 POST 请求

import urllib.request
import urllib.parse

# Form fields, urlencoded and converted to bytes as Request requires.
form = {
    'name': 'John Doe',
    'email': 'john@example.com',
}
data = urllib.parse.urlencode(form).encode('utf-8')

# Supplying data= turns the request into a POST.
req = urllib.request.Request('https://httpbin.org/post', data=data)
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))

2. 添加请求头

import urllib.request

# Attach custom headers one by one with Request.add_header
# (equivalent to passing a headers= dict to the constructor).
req = urllib.request.Request('https://httpbin.org/get')
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
)
req.add_header('Accept', 'application/json')

response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))

3. 使用代理

import urllib.request

# Map URL schemes to proxy endpoints; install_opener makes the opener
# the default for every subsequent urlopen call in this process.
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080',
}
opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
urllib.request.install_opener(opener)

response = urllib.request.urlopen('https://httpbin.org/ip')
print(response.read().decode('utf-8'))

4. 处理 Cookie

import urllib.request
import http.cookiejar

# An opener built with an HTTPCookieProcessor stores and resends cookies
# automatically through the attached CookieJar.
cookie_jar = http.cookiejar.CookieJar()
cookie_processor = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(cookie_processor)
urllib.request.install_opener(opener)

# Cookies set by the server are captured transparently.
response = urllib.request.urlopen('https://httpbin.org/cookies/set?name=value')
print(response.read().decode('utf-8'))

5. 下载文件

import urllib.request

def download_file(url, filename):
    """Download *url* to the local path *filename*, reporting the outcome.

    Best-effort: failures are reported on stdout instead of being raised.
    """
    try:
        urllib.request.urlretrieve(url, filename)
        # fix: the success message was missing the {filename} placeholder
        print(f"文件下载成功: {filename}")
    except Exception as e:
        print(f"下载失败: {e}")

# Example usage: fetches a sample JPEG over the network and saves it as image.jpg.
download_file('https://httpbin.org/image/jpeg', 'image.jpg')

高级用法

自定义 Opener

import urllib.request
import urllib.parse

# HTTPHandler(debuglevel=1) prints the raw request/response exchange,
# which is useful when debugging HTTP traffic.
debug_handler = urllib.request.HTTPHandler(debuglevel=1)
opener = urllib.request.build_opener(debug_handler)

# POST through the custom opener directly — no install_opener needed.
payload = urllib.parse.urlencode({'key': 'value'}).encode()
req = urllib.request.Request('https://httpbin.org/post', data=payload)
response = opener.open(req)
print(response.read().decode())

处理重定向

import urllib.request

class NoRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that suppresses automatic redirect following.

    Returning the response object instead of issuing a new request hands
    the raw 3xx response straight back to the caller.
    """

    def http_error_302(self, req, fp, code, msg, headers):
        # Hand back the 3xx response unchanged rather than following Location.
        return fp

    # Every other redirect status gets the same no-op treatment.
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

# Disable redirects: the opener now returns the 3xx response itself,
# so response.status is the redirect code rather than the final page's.
opener = urllib.request.build_opener(NoRedirectHandler())
response = opener.open('https://httpbin.org/redirect/1')
print(f"状态码: {response.status}")

与 requests 库对比

虽然 urllib 是标准库,但第三方库 requests 通常更易用:

# Using urllib (standard library)
import urllib.request
import urllib.parse

data = urllib.parse.urlencode({'key': 'value'}).encode()
req = urllib.request.Request('https://httpbin.org/post', data=data)
response = urllib.request.urlopen(req)

# Using requests (third-party, more concise)
import requests
response = requests.post('https://httpbin.org/post', data={'key': 'value'})

总结

urllib 模块提供了:

  • 完整的 HTTP 客户端功能
  • URL 解析和操作工具
  • 错误处理机制
  • robots.txt 解析
  • 是 Python 标准库的一部分,无需额外安装

对于简单的 HTTP 请求,urllib 足够使用。但对于复杂的应用,建议考虑使用更友好的 requests 库。