这是一个比较简单的爬虫,用于获取网站上的作品信息,可供参考。爬取日志使用标准库 logging 记录,只需一次 basicConfig 配置即可把日志写入文件,方便查看爬虫的运行情况,非常方便好用!
代码写得比较啰嗦、比较粗糙,但可以获取详情页的内容,包括标题、描述,并下载图片和附件文件。
爬取日志使用 logging 库,基本配置及各级别的用法如下:
# Basic logging configuration: every record is written to access.log with a
# timestamp, logger name, level name and originating module.
logging.basicConfig(
    filename='access.log',
    format='%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S %p',
    level=logging.DEBUG,  # numeric value 10: record everything
)

# One call per severity level; the trailing number is the level's numeric value.
logging.debug('调试信息')  # 10
logging.info('正常信息')  # 20
logging.warning('警告信息')  # 30
logging.error('报错信息')  # 40
logging.critical('严重错误信息')  # 50
附源码参考:
#红点奖作品爬取
#公众号:Python与SEO学习
import requests,os,re
import random,time
import logging
from lxml import etree
# Basic logging configuration: every record is written to access.log with a
# timestamp, logger name, level name and originating module.
logging.basicConfig(
    filename='access.log',
    format='%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S %p',
    level=logging.DEBUG,  # numeric value 10: record everything
)
def get_ua():
    """Return one User-Agent string chosen at random from a fixed pool."""
    candidates = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    )
    return random.choice(candidates)
def get_result(pagenum):
    """Fetch one page of the JSON search listing and crawl every entry on it.

    For each entry the detail-page URL is appended to ``<pagenum>.txt`` and
    the detail page is then processed with ``get_detail``.

    Args:
        pagenum: Page number inserted into the search URL.
    """
    list_url = f"https://www.red-dot.org/search/search.json?solr%5Bfilter%5D%5B%5D=meta_categories%3A%2F11%2F&solr%5Bfilter%5D%5B%5D=year%3A2024&solr%5Bpage%5D={pagenum}"
    docs = get_resp(list_url).json()['result']['docs']
    print(docs)
    for doc in docs:
        # The title doubles as the output directory name in get_detail.
        title = f"{doc['title']}_{doc['meta_first']}_{doc['meta_fourth']}"
        detail_url = f"https://www.red-dot.org{doc['url']}"
        # Record the URL before crawling it so the page file lists every
        # entry that was attempted.
        with open(f'{pagenum}.txt', 'a+', encoding='utf-8') as page_file:
            page_file.write(f'{detail_url}\n')
        get_detail(title, detail_url)
        # Pause between detail pages.
        time.sleep(2)
def get_detail(title, url):
    """Download one detail page: its text, its images and its download file.

    A directory named after the sanitised title is created in the current
    working directory. The page text is stored there as ``<title>.txt``,
    images are saved through ``get_imgs`` and the linked file through
    ``download_file``.

    Args:
        title: Raw title of the entry; characters that are illegal in file
            names are replaced with underscores.
        url: Absolute URL of the detail page.
    """
    # Characters that are not allowed in Windows file names.
    pattern = r"[\/\\\:\*\?\"\<\>\|]"
    title = re.sub(pattern, "_", title)
    path = f'{title}/'
    os.makedirs(path, exist_ok=True)
    logging.info(f'生成{title}文件目录成功!')

    print(f'正在获取{url}详情页内容信息..')
    logging.info(f'正在获取{url}详情页内容信息..')
    response = get_resp(url)
    html = response.content.decode('utf-8')
    tree = etree.HTML(html)

    # Heading text and description text come from two differently classed divs.
    hs = tree.xpath('//div[@class="col-12"]//text()')
    h = ''.join(hs)
    print(h)
    texts = tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//text()')
    text = '\n'.join(texts)
    print(text)
    with open(f'{path}{title}.txt', 'w', encoding='utf-8') as f:
        f.write(f'{h}\n\n{text}')
    print(f"保存{title}.txt文件成功!")
    logging.info(f"保存{title}.txt文件成功!")

    imgs = tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//img/@src')
    print(imgs)
    get_imgs(path, imgs)

    # A page may have no download link; indexing [0] unconditionally would
    # raise IndexError and abort the whole crawl, so skip the download instead.
    download_links = tree.xpath('//a[@class="download-link"]/@href')
    if download_links:
        downhref = download_links[0]
        print(downhref)
        download_file(path, title, downhref)
    else:
        print(f'{url}--未找到下载链接,跳过文件下载')
        logging.warning(f'{url}--未找到下载链接,跳过文件下载')

    print(f'获取{url}详情页内容信息成功!')
    logging.info(f'获取{url}详情页内容信息成功!')
def get_imgs(path, imgs):
    """Download every image in *imgs* into the directory *path*.

    Args:
        path: Target directory, ending with a path separator.
        imgs: Image ``src`` values; each is appended to the site root to
            form the download URL.
    """
    for index, src in enumerate(imgs, start=1):
        # Prefix the running number to the last path segment of the src.
        filename = f"{index}-{src.split('/')[-1]}"
        image_url = f'https://www.red-dot.org{src}'
        print(f">>开始下载图片:{filename}")
        logging.info(f">>开始下载图片:{filename}")
        reply = get_resp(image_url)
        with open(f'{path}{filename}', 'wb') as out:
            out.write(reply.content)
        print(f"下载图片:{filename}完毕!")
        logging.info(f"下载图片:{filename}完毕!")
        # Pause between images.
        time.sleep(1)
def download_file(path, title, downhref):
    """Download the file behind *downhref* and save it as ``<title>.zip``.

    Args:
        path: Target directory, ending with a path separator.
        title: Sanitised title, used as the base name of the saved file.
        downhref: URL of the file to download.
    """
    downname = f'{title}.zip'
    print(f">>开始下载图片文件:{downname}")
    # The log message now matches the console message; it previously said
    # "图片" (image) for what is the bundled file download.
    logging.info(f">>开始下载图片文件:{downname}")
    r = get_resp(downhref)
    with open(f'{path}{downname}', 'wb') as f:
        f.write(r.content)
    print(f"下载图片文件:{downname}完毕!")
    logging.info(f"下载图片文件:{downname}完毕!")
    # Pause after the download.
    time.sleep(2)
# Up to 3 retries, i.e. 4 attempts in total.
def get_resp(url):
    """Fetch *url*, retrying on request errors with a growing delay.

    The delay before retry number ``n`` is ``2 * n`` seconds.

    Args:
        url: URL to fetch.

    Returns:
        The response object returned by ``get_response``.

    Raises:
        requests.exceptions.RequestException: If every attempt fails. The
            last error is re-raised rather than returning ``None``, which
            callers would only trip over later as an AttributeError.
    """
    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        try:
            return get_response(url, time=10)
        except requests.exceptions.RequestException:
            if attempt == max_attempts:
                # No retries left: do not announce or wait for another
                # attempt, surface the failure instead.
                print(f">> 获取网页出错,已重试{max_attempts - 1}次,放弃获取")
                logging.error(f">> {url}---获取网页出错,已重试{max_attempts - 1}次,放弃获取")
                raise
            delay = attempt * 2
            print(f">> 获取网页出错,{delay}S后将重试获取第:{attempt} 次")
            logging.error(f">> {url}---获取网页出错,{delay}S后将重试获取第:{attempt} 次")
            time.sleep(delay)
def get_response(url, time):
    """Send a GET request with a randomly chosen User-Agent header.

    Args:
        url: URL to request.
        time: Timeout in seconds passed to ``requests.get``. The name shadows
            the ``time`` module inside this function; it is kept because
            callers pass it by keyword.

    Returns:
        The ``requests`` response object.
    """
    # get_ua() already returns one complete User-Agent string. Passing it to
    # random.choice again would pick a single character out of that string.
    headers = {
        'User-Agent': get_ua(),
    }
    return requests.get(url=url, headers=headers, timeout=time)
def main():
    """Crawl listing pages 1 through 10, pausing after each page."""
    first_page, last_page = 1, 10
    for pagenum in range(first_page, last_page + 1):
        notice = f'>> 正在获取第{pagenum}页json列表页内容信息..'
        print(notice)
        logging.info(notice)
        get_result(pagenum)
        # Pause between listing pages.
        time.sleep(6)


if __name__ == "__main__":
    main()
改进版:
#红点奖作品爬取
import requests,os,re
import random,time
import logging
from lxml import etree
# Basic logging configuration: every record is written to access.log with a
# timestamp, logger name, level name and originating module.
logging.basicConfig(
    filename='access.log',
    format='%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S %p',
    level=logging.DEBUG,  # numeric value 10: record everything
)
def get_ua():
    """Return one User-Agent string chosen at random from a fixed pool."""
    candidates = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    )
    return random.choice(candidates)
def get_result(pagenum, resume_page=86):
    """Fetch one page of the JSON search listing and crawl every entry on it.

    For each entry the detail-page URL is appended to ``<pagenum>.txt`` and
    the detail page is processed with ``get_detail``. A failure on one detail
    page is logged and appended to ``fail.txt`` (one entry per line) without
    stopping the rest of the page.

    Args:
        pagenum: Page number inserted into the search URL.
        resume_page: Page on which an interrupted crawl is resumed. On this
            page, URLs already listed in ``<pagenum>.txt`` are skipped.
            Defaults to 86, the value that was previously hard-coded.
    """
    list_url = f"https://www.red-dot.org/search/search.json?solr%5Bfilter%5D%5B%5D=meta_categories%3A%2F11%2F&solr%5Bfilter%5D%5B%5D=year%3A2024&solr%5Bpage%5D={pagenum}"
    response = get_resp(list_url)
    results = response.json()['result']['docs']
    print(results)

    # Load the already crawled URLs once instead of re-reading the page file
    # for every entry.
    resuming = pagenum == resume_page
    crawled = set()
    if resuming:
        try:
            crawled = set(quurls(pagenum))
        except FileNotFoundError:
            # Nothing was recorded for this page yet, so nothing to skip.
            crawled = set()

    for result in results:
        # The title doubles as the output directory name in get_detail.
        title = f"{result['title']}_{result['meta_first']}_{result['meta_fourth']}"
        url = f"https://www.red-dot.org{result['url']}"

        if resuming and url in crawled:
            print(f"{url}--已经采集,跳过!")
            logging.info(f"{url}--已经采集,跳过!")
            continue

        with open(f'{pagenum}.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{url}\n')
        if resuming:
            # Mirror the file contents so a repeated URL on this page is
            # skipped, as it was when the file was re-read per entry.
            crawled.add(url)

        try:
            get_detail(title, url)
        except Exception as e:
            # Crawl-loop boundary: record the failure and move on.
            print(f"{e}--{url}")
            logging.error(f"{e}--{url}")
            with open('fail.txt', 'a+', encoding='utf-8') as f:
                # The trailing newline keeps one failure per line; without
                # it, consecutive entries run together.
                f.write(f"{e}--{url}\n")
        # Pause between detail pages.
        time.sleep(2)
def get_detail(title, url):
    """Download one detail page: its text, its images and its download file.

    A directory named after the sanitised title is created in the current
    working directory. The page text is stored there as ``<title>.txt``,
    images are saved through ``get_imgs`` and the linked file through
    ``download_file``.

    Args:
        title: Raw title of the entry; it is sanitised before being used as
            a directory and file name.
        url: Absolute URL of the detail page.

    Raises:
        IndexError: If the page contains no ``a.download-link`` element.
    """
    # Characters replaced in file names. The extra '-' and '’' must sit
    # inside the brackets: written after the closing bracket they become a
    # literal suffix, and the pattern then matches almost nothing.
    pattern = r"[\/\\\:\*\?\"\<\>\|\-’]"
    title = re.sub(pattern, "_", title)
    title = title.replace("\n", " ")
    path = f'{title}/'
    os.makedirs(path, exist_ok=True)
    logging.info(f'生成{title}文件目录成功!')

    print(f'正在获取{url}详情页内容信息..')
    logging.info(f'正在获取{url}详情页内容信息..')
    response = get_resp(url)
    html = response.content.decode('utf-8')
    tree = etree.HTML(html)

    # Heading text and description text come from two differently classed divs.
    hs = tree.xpath('//div[@class="col-12"]//text()')
    h = ''.join(hs)
    print(h)
    texts = tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//text()')
    text = '\n'.join(texts)
    print(text)
    with open(f'{path}{title}.txt', 'w', encoding='utf-8') as f:
        f.write(f'{h}\n\n{text}')
    print(f"保存{title}.txt文件成功!")
    logging.info(f"保存{title}.txt文件成功!")

    imgs = tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//img/@src')
    print(imgs)
    get_imgs(path, imgs)

    # Indexing [0] raises IndexError when there is no download link; the
    # error is left to propagate to the caller.
    downhref = tree.xpath('//a[@class="download-link"]/@href')[0]
    print(downhref)
    download_file(path, title, downhref)

    print(f'获取{url}详情页内容信息成功!')
    logging.info(f'获取{url}详情页内容信息成功!')
def get_imgs(path, imgs):
    """Download every image in *imgs* into the directory *path*.

    Args:
        path: Target directory, ending with a path separator.
        imgs: Image ``src`` values; each is appended to the site root to
            form the download URL.
    """
    for index, src in enumerate(imgs, start=1):
        # Prefix the running number to the last path segment of the src.
        filename = f"{index}-{src.split('/')[-1]}"
        image_url = f'https://www.red-dot.org{src}'
        print(f">>开始下载图片:{filename}")
        logging.info(f">>开始下载图片:{filename}")
        reply = get_resp(image_url)
        with open(f'{path}{filename}', 'wb') as out:
            out.write(reply.content)
        print(f"下载图片:{filename}完毕!")
        logging.info(f"下载图片:{filename}完毕!")
        # Pause between images.
        time.sleep(1)
def download_file(path, title, downhref):
    """Download the file behind *downhref* and save it as ``<title>.zip``.

    Args:
        path: Target directory, ending with a path separator.
        title: Sanitised title, used as the base name of the saved file.
        downhref: URL of the file to download.
    """
    downname = f'{title}.zip'
    print(f">>开始下载图片文件:{downname}")
    # The log message now matches the console message; it previously said
    # "图片" (image) for what is the bundled file download.
    logging.info(f">>开始下载图片文件:{downname}")
    r = get_resp(downhref)
    with open(f'{path}{downname}', 'wb') as f:
        f.write(r.content)
    print(f"下载图片文件:{downname}完毕!")
    logging.info(f"下载图片文件:{downname}完毕!")
    # Pause after the download.
    time.sleep(2)
# Up to 3 retries, i.e. 4 attempts in total.
def get_resp(url):
    """Fetch *url*, retrying on request errors with a growing delay.

    The delay before retry number ``n`` is ``2 * n`` seconds.

    Args:
        url: URL to fetch.

    Returns:
        The response object returned by ``get_response``.

    Raises:
        requests.exceptions.RequestException: If every attempt fails. The
            last error is re-raised rather than returning ``None``, which
            callers would only trip over later as an AttributeError.
    """
    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        try:
            return get_response(url, time=10)
        except requests.exceptions.RequestException:
            if attempt == max_attempts:
                # No retries left: do not announce or wait for another
                # attempt, surface the failure instead.
                print(f">> 获取网页出错,已重试{max_attempts - 1}次,放弃获取")
                logging.error(f">> {url}---获取网页出错,已重试{max_attempts - 1}次,放弃获取")
                raise
            delay = attempt * 2
            print(f">> 获取网页出错,{delay}S后将重试获取第:{attempt} 次")
            logging.error(f">> {url}---获取网页出错,{delay}S后将重试获取第:{attempt} 次")
            time.sleep(delay)
def get_response(url, time):
    """Send a GET request with a randomly chosen User-Agent header.

    Args:
        url: URL to request.
        time: Timeout in seconds passed to ``requests.get``. The name shadows
            the ``time`` module inside this function; it is kept because
            callers pass it by keyword.

    Returns:
        The ``requests`` response object.
    """
    # get_ua() already returns one complete User-Agent string. Passing it to
    # random.choice again would pick a single character out of that string.
    headers = {
        'User-Agent': get_ua(),
    }
    return requests.get(url=url, headers=headers, timeout=time)
def quurls(page):
    """Return the URLs already recorded in ``<page>.txt``.

    Args:
        page: Page number; the file ``<page>.txt`` in the current working
            directory is read.

    Returns:
        A list with one whitespace-stripped entry per line of the file, or
        an empty list when the file does not exist yet.
    """
    try:
        with open(f'{page}.txt', 'r', encoding='utf-8') as f:
            urls = [line.strip() for line in f]
    except FileNotFoundError:
        # Nothing has been crawled for this page so far.
        urls = []
    print(urls)
    return urls
def main():
    """Crawl listing pages 1 through 10, pausing after each page."""
    first_page, last_page = 1, 10
    for pagenum in range(first_page, last_page + 1):
        notice = f'>> 正在获取第{pagenum}页json列表页内容信息..'
        print(notice)
        logging.info(notice)
        get_result(pagenum)
        # Pause between listing pages.
        time.sleep(6)


if __name__ == "__main__":
    main()
·················END·················
你好,我是二大爷,
革命老区外出进城务工人员,
互联网非早期非专业站长,
喜好python,写作,阅读,英语
不入流程序,自媒体,seo . . .
关注我的都变秃了
说错了,都变强了!
不信你试试
扫码关注最新动态
公众号ID:eryeji