urllib is Python's built-in HTTP request library. It bundles several modules for working with URLs and making network requests.
Main submodules
1. urllib.request - Opening and reading URLs
import urllib.request
import urllib.parse
# Basic GET request
response = urllib.request.urlopen('https://httpbin.org/get')
print(response.read().decode('utf-8'))
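As a side note (a minimal sketch, not part of the original example): urlopen also accepts a timeout in seconds, and the response can be used as a context manager so the connection is closed automatically.
# GET request with a timeout; the with-block closes the response automatically
with urllib.request.urlopen('https://httpbin.org/get', timeout=10) as response:
    print(response.status)                    # numeric status code, e.g. 200
    print(response.headers['Content-Type'])   # headers behave like a mapping
    body = response.read().decode('utf-8')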
# GET request with query parameters
params = urllib.parse.urlencode({'key1': 'value1', 'key2': 'value2'})
url = f'https://httpbin.org/get?{params}'
response = urllib.request.urlopen(url)
2. urllib.parse - Parsing URLs
import urllib.parse
# Parse a URL
url = 'https://www.example.com/path/to/page?name=value&key=value#fragment'
parsed = urllib.parse.urlparse(url)
print(f"Scheme: {parsed.scheme}")
print(f"Netloc: {parsed.netloc}")
print(f"Path: {parsed.path}")
print(f"Query: {parsed.query}")
# URL encoding and decoding
encoded = urllib.parse.quote('hello world!')
print(encoded) # hello%20world%21
decoded = urllib.parse.unquote('hello%20world%21')
print(decoded) # hello world!
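Two related details, shown here as a brief sketch: by default quote leaves '/' unescaped (controlled by the safe parameter), while quote_plus encodes spaces as '+' the same way urlencode does.
print(urllib.parse.quote('/path with spaces/'))           # /path%20with%20spaces/
print(urllib.parse.quote('/path with spaces/', safe=''))  # %2Fpath%20with%20spaces%2F
print(urllib.parse.quote_plus('hello world!'))            # hello+world%21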
# Encode query parameters
params = urllib.parse.urlencode({'q': 'python tutorial', 'page': 1})
print(params)  # q=python+tutorial&page=1
3. urllib.error - Exception handling
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('https://httpbin.org/status/404')
except urllib.error.HTTPError as e:
    print(f'HTTP Error: {e.code} - {e.reason}')
except urllib.error.URLError as e:
    print(f'URL Error: {e.reason}')
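It is worth knowing that an HTTPError is itself a file-like response object, so the error body and headers can also be inspected; a minimal sketch:
try:
    urllib.request.urlopen('https://httpbin.org/status/404')
except urllib.error.HTTPError as e:
    print(e.headers.get('Content-Type'))   # headers of the error response
    print(e.read().decode('utf-8'))        # body sent along with the error status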
4. urllib.robotparser - Parsing robots.txt
import urllib.robotparser
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://www.example.com/robots.txt')
rp.read()
# Check whether a URL may be crawled
can_fetch = rp.can_fetch('MyBot', 'https://www.example.com/admin')
print(f"Can fetch: {can_fetch}")实用示例
1. Sending a POST request
import urllib.request
import urllib.parse
# Form data
data = urllib.parse.urlencode({
    'name': 'John Doe',
    'email': 'john@example.com'
}).encode('utf-8')
req = urllib.request.Request('https://httpbin.org/post', data=data)
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
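Form encoding is not the only option. The same Request class can send a JSON body if the Content-Type header is set explicitly; a hedged sketch (httpbin.org/post simply echoes back whatever it receives):
import json
import urllib.request

payload = json.dumps({'name': 'John Doe'}).encode('utf-8')
req = urllib.request.Request(
    'https://httpbin.org/post',
    data=payload,
    headers={'Content-Type': 'application/json'},
    method='POST',
)
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))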
2. Adding request headers
import urllib.request
url = 'https://httpbin.org/get'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'application/json'
}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
3. Using a proxy
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)
response = urllib.request.urlopen('https://httpbin.org/ip')
print(response.read().decode('utf-8'))
4. Handling cookies
import urllib.request
import http.cookiejar
# Create a CookieJar
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
urllib.request.install_opener(opener)
# Send a request; cookies are handled automatically
response = urllib.request.urlopen('https://httpbin.org/cookies/set?name=value')
print(response.read().decode('utf-8'))
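After the request, the CookieJar holds whatever cookies the server set, and it can be iterated directly; a small illustrative addition:
# The jar is iterable; each entry is an http.cookiejar.Cookie object
for cookie in cookie_jar:
    print(f"{cookie.name} = {cookie.value}")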
5. Downloading files
import urllib.request
def download_file(url, filename):
    try:
        urllib.request.urlretrieve(url, filename)
        print(f"File downloaded successfully: {filename}")
    except Exception as e:
        print(f"Download failed: {e}")
# Usage example
download_file('https://httpbin.org/image/jpeg', 'image.jpg')
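urlretrieve also accepts a reporthook callback, which it calls with the block count, block size, and total size, so a rough progress indicator can be added; a sketch (total_size is -1 when the server reports no Content-Length):
def show_progress(block_num, block_size, total_size):
    # Called repeatedly by urlretrieve as blocks arrive
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(100, downloaded * 100 // total_size)
        print(f"\rDownloading: {percent}%", end='')

urllib.request.urlretrieve('https://httpbin.org/image/jpeg', 'image.jpg', show_progress)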
Advanced usage
Custom openers
import urllib.request
import urllib.parse
# Create a custom handler
handler = urllib.request.HTTPHandler(debuglevel=1)  # enable HTTP debug output
opener = urllib.request.build_opener(handler)
# Send the request
data = urllib.parse.urlencode({'key': 'value'}).encode()
req = urllib.request.Request('https://httpbin.org/post', data=data)
response = opener.open(req)
print(response.read().decode())
Handling redirects
import urllib.request
class NoRedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        return fp
    http_error_301 = http_error_303 = http_error_307 = http_error_302
# Disable automatic redirects
opener = urllib.request.build_opener(NoRedirectHandler())
response = opener.open('https://httpbin.org/redirect/1')
print(f"状态码: {response.status}")与 requests 库对比
Although urllib is part of the standard library, the third-party requests library is usually easier to use:
# Using urllib
import urllib.request
import urllib.parse
data = urllib.parse.urlencode({'key': 'value'}).encode()
req = urllib.request.Request('https://httpbin.org/post', data=data)
response = urllib.request.urlopen(req)
# Using requests (more concise)
import requests
response = requests.post('https://httpbin.org/post', data={'key': 'value'})
Summary
The urllib package provides:
- Complete HTTP client functionality
- URL parsing and manipulation tools
- Error handling mechanisms
- robots.txt parsing
- All as part of the Python standard library, with nothing extra to install
For simple HTTP requests, urllib is perfectly adequate. For more complex applications, the friendlier requests library is usually the better choice.