Table of Contents
- 1. Modifying the page-crawling logic
- 2. Caching approaches
- 2.1 Disk cache
- 2.2 Database cache
In 爬虫学习系列02-常见的下载和抽取网页的方法 (part 02 of this crawler series, on common ways to download and extract web pages), we downloaded pages and then extracted the data we needed. But what if we now want to extract different data from those same pages? Do we have to download them all over again? For a large site, re-crawling every page can take a very long time, so it is worth caching the pages we have already crawled so that each page is downloaded only once.
1. Modifying the page-crawling logic
To avoid passing a pile of parameters on every download, we rewrite the download function from the previous part as a Downloader class: the settings are configured once in the constructor and then reused across all subsequent downloads (see the usage sketch after the class).
import re
import random
import time
import urllib.request
import urllib.error
from datetime import datetime
from urllib import parse, robotparser


class Downloader:
    # per PEP 8, class names should be nouns in CapWords style

    def __init__(self, delay=5, user_agent='wswp', proxies=None,
                 num_retries=1, cache=None):
        self.throttle = Throttle(delay)  # per-domain download delay
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        """Invoked when an instance is called like a function, e.g. D(url)"""
        result = None
        # check the cache before downloading
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # this URL is not yet in the cache
                pass
            else:
                # the else branch runs only when no KeyError was raised
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # the cached result was a server error, so re-download
                    result = None
        if result is None:
            # nothing usable in the cache, so download the page now;
            # throttle first to avoid overloading the site
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print('Downloading: {}'.format(url))
        request = urllib.request.Request(url, headers=headers)
        opener = urllib.request.build_opener()
        if proxy:
            proxy_params = {parse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib.request.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            # decode to str so regexes can be applied to the result later
            html = response.read().decode('utf-8', errors='ignore')
            code = response.code
        except urllib.error.URLError as e:
            print('Download error: {}'.format(e.reason))
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy,
                                         num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}


class Throttle:
    """Throttle downloading by sleeping between requests to the same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if this domain was accessed recently
        """
        domain = parse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
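Because the class defines __call__, an instance can be invoked like a function: construct it once, then reuse it for every download. A minimal usage sketch (the delay value is illustrative):

D = Downloader(delay=3, user_agent='wswp', num_retries=1)
html = D('http://example.webscraping.com')  # throttled, retried on 5XX errors
print(html[:100])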
The link crawler from the previous part needs only minor changes:
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                 user_agent='wswp', proxies=None, num_retries=1,
                 scrape_callback=None, cache=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen, and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check that the URL passes the robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether this link has already been crawled
                    if link not in seen:
                        seen[link] = depth + 1
                        # check that the link is within the same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to the queue
                            crawl_queue.append(link)
            # check whether the download maximum has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)


def normalize(seed_url, link):
    """Normalize this URL by removing the hash and adding the domain
    """
    link, _ = parse.urldefrag(link)  # remove hash to avoid duplicates
    return parse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URLs belong to the same domain
    """
    return parse.urlparse(url1).netloc == parse.urlparse(url2).netloc


def get_robots(url):
    """Initialize the robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(parse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
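A quick way to try the crawler (the regex and depth here are illustrative values, not required settings):

if __name__ == '__main__':
    # follow only index and view pages, at most one level deep
    link_crawler('http://example.webscraping.com', '/(index|view)',
                 delay=3, max_depth=1)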
2. Caching approaches
With the change above, each page is downloaded at most once per run, but the cache only lives inside the running script: once the script finishes, the cached data is gone. To reuse downloads across runs, the cache has to be persisted somewhere.
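Note that Downloader relies only on the mapping protocol: cache[url] should return a cached result or raise KeyError, and cache[url] = result should store one. A plain dict therefore already works as an in-memory cache, which makes the limitation concrete:

cache = {}  # an ordinary dict satisfies the cache interface
D = Downloader(delay=3, cache=cache)
D('http://example.webscraping.com')  # downloaded and stored in the dict
D('http://example.webscraping.com')  # served from the dict, no new request
# once the process exits, `cache` is gone, so we need a persistent backend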
2.1 Disk cache
The idea is to store each downloaded page in the file system. To do this we must map URLs safely to cross-platform filenames, which means knowing which characters are illegal in filenames on the various operating systems. To keep file paths valid on any file system, we restrict them to digits, letters, and a few basic symbols, and replace every other character with an underscore, as in the following code.
import re

url = 'http://example.webscraping.com/default/view/#$@Australia-1'
# replace every character outside /0-9a-zA-Z-.,;_ and space with an underscore
file_name = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', url)
print(file_name)
# output:
# http_//example.webscraping.com/default/view/___Australia-1
URLs that end with / also need attention: the url_to_path method in the listing below appends index.html to them, so they map to a regular file rather than a directory.
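A quick sketch of that mapping, mirroring the path handling inside url_to_path below:

from urllib import parse

url = 'http://example.webscraping.com/default/view/'
components = parse.urlsplit(url)
path = components.path or '/index.html'
if path.endswith('/'):
    path += 'index.html'
print(components.netloc + path)
# example.webscraping.com/default/view/index.html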
import os
import re
import shutil
import zlib
from datetime import datetime, timedelta
from urllib import parse

# prefer the faster C implementation on Python 2;
# Python 3's pickle already uses it automatically
try:
    import cPickle as pickle
except ImportError:
    import pickle

from link_crawler import link_crawler


class DiskCache:
    """
    Dictionary interface that stores cached
    values in the file system rather than in memory.
    The file path is derived from the URL by url_to_path.

    >>> cache = DiskCache()
    >>> url = 'http://example.webscraping.com'
    >>> result = {'html': '...'}
    >>> cache[url] = result
    >>> cache[url]['html'] == result['html']
    True
    >>> cache = DiskCache(expires=timedelta())
    >>> cache[url] = result
    >>> cache[url]
    Traceback (most recent call last):
     ...
    KeyError: 'http://example.webscraping.com has expired'
    >>> cache.clear()
    """

    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        """
        cache_dir: the root level folder for the cache
        expires: timedelta of amount of time before a cache entry is considered expired
        compress: whether to compress data in the cache
        """
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        """Load data from disk for this URL
        (called when cache[url] is evaluated)
        """
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
            if self.compress:
                data = zlib.decompress(data)
            result, timestamp = pickle.loads(data)
            if self.has_expired(timestamp):
                raise KeyError(url + ' has expired')
            return result
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for this URL
        (called when cache[url] = result is executed)
        """
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            # compress with zlib before storing to save disk space
            data = zlib.compress(data)
        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        """Remove the value at this key and any empty parent sub-directories
        """
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def url_to_path(self, url):
        """Create a file system path for this URL
        """
        components = parse.urlsplit(url)
        # when the path is empty, map it to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict the maximum length of each path segment
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '/(index|view)', cache=DiskCache())
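The zlib step is worth it because HTML is highly redundant. A small, self-contained check of the idea (the payload here is made up):

import pickle
import zlib

result = {'html': '<html>' + '<p>sample row</p>' * 500 + '</html>'}
raw = pickle.dumps((result, None))
print(len(raw), len(zlib.compress(raw)))  # the compressed form is much smaller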
Advantages: easy to implement, no extra modules to install, and the cached results can be inspected directly on disk.
Disadvantages: it is constrained by the local file system. Some distinct URLs map to the same filename after sanitizing (one way around this is to use a hash of the URL as the filename, sketched below), and file systems also cap the total number of files that can be stored.
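A minimal sketch of that hash-based naming, assuming an MD5 digest of the URL is acceptable as the key; url_to_hash_path is a hypothetical helper, not part of the book's code:

import hashlib
import os

def url_to_hash_path(cache_dir, url):
    # hypothetical helper: name the file after the URL's MD5 digest so that
    # distinct URLs can no longer collide after character sanitizing
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    # fan out into sub-directories to keep any single folder small
    return os.path.join(cache_dir, digest[:2], digest)

print(url_to_hash_path('cache', 'http://example.webscraping.com/default/view/Australia-1'))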
2.2 Database cache
To avoid the limitations of disk-based storage, a better approach is to put the cached data in a database.
Here we use MongoDB, a NoSQL (Not Only SQL) database, because it scales well: when the data grows too large for a single server, it can be spread across several, whereas a relational database that depends on multi-table queries is much harder to distribute.
import zlib
from datetime import datetime, timedelta

from pymongo import MongoClient
from bson.binary import Binary

# prefer the faster C implementation on Python 2;
# Python 3's pickle already uses it automatically
try:
    import cPickle as pickle
except ImportError:
    import pickle


class MongoCache:
    """
    Wrapper around MongoDB to cache downloads

    >>> cache = MongoCache()
    >>> cache.clear()
    >>> url = 'http://example.webscraping.com'
    >>> result = {'html': '...'}
    >>> cache[url] = result
    >>> cache[url]['html'] == result['html']
    True
    >>> cache = MongoCache(expires=timedelta())
    >>> cache[url] = result
    >>> # expired records are purged about every 60 seconds,
    >>> # see http://docs.mongodb.org/manual/core/index-ttl/
    >>> import time; time.sleep(60)
    >>> cache[url]
    Traceback (most recent call last):
     ...
    KeyError: 'http://example.webscraping.com does not exist'
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        client: mongo database client
        expires: timedelta of amount of time before a cache entry is considered expired
        """
        # if a client object is not passed,
        # try connecting to mongodb at the default localhost port
        self.client = MongoClient('localhost', 27017) if client is None else client
        # create a collection to store the cached webpages,
        # which is the equivalent of a table in a relational database
        self.db = self.client.cache
        # a TTL index lets MongoDB expire old records automatically
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        else:
            return True

    def __getitem__(self, url):
        """Load the value at this URL
        """
        record = self.db.webpage.find_one({'_id': url})
        if record:
            # the result was pickled and zlib-compressed before storage
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save the value for this URL
        """
        # compress the pickled result and wrap it in a BSON Binary
        record = {'result': Binary(zlib.compress(pickle.dumps(result))),
                  'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()
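Assuming a MongoDB server is listening on the default localhost:27017, this cache plugs into the crawler the same way DiskCache did:

from link_crawler import link_crawler  # the crawler module shown above

if __name__ == '__main__':
    # requires a running MongoDB server on localhost:27017
    link_crawler('http://example.webscraping.com/', '/(index|view)',
                 cache=MongoCache())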
References
[1] Web Scraping with Python (《用python写web爬虫》)