Python py9: Collecting py9 Blog Activity

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import datetime

import requests

url_name_str = '''朱子超 https://www.cnblogs.com/heroknot/
赵嘉豪 https://www.cnblogs.com/zhoajiahao/
巩景云 https://www.cnblogs.com/gongjingyun123--/
李琦 https://www.cnblogs.com/1naonao/
潘立府 https://www.cnblogs.com/plf-Jack/
胡凯琴 https://www.cnblogs.com/863652104kai/
雷俊 https://www.cnblogs.com/lucky75/
刘闯 https://www.cnblogs.com/miaowugulu/
毛毅智 https://www.cnblogs.com/acate/
葛林丽 https://www.cnblogs.com/geyatou322/
朱缘应 https://www.cnblogs.com/zhuyuanying123--/
雷鸣 https://www.cnblogs.com/leimingqq2/
赵刚 https://www.cnblogs.com/zhaogang0104/
吴锡 https://www.cnblogs.com/ZDQ1/
张岩 https://www.cnblogs.com/zuihoudebieli/
高化焱 https://www.cnblogs.com/gaohuayan/
孔凡平 https://www.cnblogs.com/WilliamKong94/
王强 https://www.cnblogs.com/bruce123/
杨文益 https://www.cnblogs.com/pythonywy/
伍开日 https://www.cnblogs.com/clarence203/
朱竹平 https://www.cnblogs.com/Hades123/
周瑞星 https://www.cnblogs.com/zrx19960128/
许长义 https://www.cnblogs.com/xcyandwxl/
储皖浏 https://www.cnblogs.com/chuwanliu/
陈石 https://www.cnblogs.com/chencharry/
徐浩 https://www.cnblogs.com/einsam/
吴奇宇 https://www.cnblogs.com/blog5434/
张天承 https://www.cnblogs.com/bladecheng/
赵志强 https://www.cnblogs.com/wsxiaoyao/
朱健 https://www.cnblogs.com/masterjian924/
魏义军 https://www.cnblogs.com/Dr-wei/
曹降祥 https://www.cnblogs.com/fengxuemuyangren/
陈跃春 https://www.cnblogs.com/chenych/
黄云 https://www.cnblogs.com/yellowcloud/
段力钢 https://www.cnblogs.com/raynduan/
刘金 https://www.cnblogs.com/itboy-newking/
'''


def get_name_url_dict():
    """Read the name/URL roster, creating the file on first run."""
    if not os.path.exists('博客地址.txt'):
        with open('博客地址.txt', 'w', encoding='utf8') as fw:
            fw.write(url_name_str)
            fw.flush()
            print('写入文件成功...')  # roster file written successfully
    with open('博客地址.txt', 'r', encoding='utf8') as fr:
        name_urls = fr.readlines()
    name_url_dict = dict()
    for name_url in name_urls:
        name_url_split = name_url.split()
        name = name_url_split[0]
        url = name_url_split[1]
        name_url_dict[name] = url
    print(f'同学数:{len(name_url_dict)}')  # number of students
    return name_url_dict


def request_next_url_data(next_url, url_list):
    """Recursively fetch follow-up pages of a blog's post list."""
    next_response = requests.get(next_url)
    next_data = next_response.text
    # NOTE: the HTML tags inside these patterns were swallowed when the post
    # was rendered online; the closing-tag parts are reconstructed here and
    # may differ slightly from the author's originals.
    next_url_list = re.findall(r'href="(.*?)">(.*?)</a>', next_data)
    url_list.extend(next_url_list)
    re_next_url = re.findall(r'href="(.*?)">下一页', next_data)
    if re_next_url:
        re_next_url = re_next_url[0]
        request_next_url_data(re_next_url, url_list)
    return url_list


def for_every_name_urls(name_url_dict):
    """Crawl every student's blog and collect their post links."""
    s_sum = ''
    for name, home_url in name_url_dict.items():
        # Record the student's home-page line.
        s_sum = f'{s_sum}{name} {home_url}\n'
        print(name, home_url)
        # Fetch the first page of the blog.
        response = requests.get(home_url)
        data = response.text
        url_list = re.findall(r'href="(.*?)">(.*?)</a>', data)
        # Check whether a "next page" (下一页) link exists; the [^;] guard is
        # kept from the original, the href part is reconstructed.
        next_url = re.findall(r'[^;]href="(.*?)">下一页', data)
        if next_url:
            next_url = next_url[0]
            url_list = request_next_url_data(next_url, url_list)
        # Deduplicate, keeping only links to the blog's own posts.
        url_set = set()
        for url in url_list:
            if url[0].startswith(f'{home_url}p/') and url[0].endswith('html'):
                url_set.add(url)
        print(url_set)
        for url in url_set:
            s = f'{name} {url[0]} {url[1]}'
            s_sum = f'{s_sum}{s}\n'
        s_sum = f'{s_sum}\n'
    return s_sum


def save_file(s_sum):
    """Write the summary to a text file named after today's date."""
    day_time = str(datetime.datetime.now()).split(' ')[0]
    with open(f'{day_time}-py9博客情况汇总.txt', 'w', encoding='utf8') as f:
        f.write(s_sum)


if __name__ == '__main__':
    name_url_dict = get_name_url_dict()
    s_sum = for_every_name_urls(name_url_dict)
    print(s_sum)
    save_file(s_sum)
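The two regular expressions are the fragile part of the script: the HTML tags inside the original string literals were stripped when the post was rendered, so the patterns above are reconstructions. Below is a minimal sanity-check sketch, assuming cnblogs list pages emit roughly one anchor tag per line; the HTML fragment is made up for illustration, not captured from a real blog.

import re

# Hypothetical fragment of a cnblogs post-list page (not real markup).
sample = (
    '<a class="postTitle2" href="https://www.cnblogs.com/heroknot/p/1234567.html">第一次作业</a>\n'
    '<a href="https://www.cnblogs.com/heroknot/default.html?page=2">下一页</a>\n'
)

# (url, title) pairs; '.' does not cross the newline, which keeps the
# non-greedy groups from spanning two different links.
print(re.findall(r'href="(.*?)">(.*?)</a>', sample))
# [('https://www.cnblogs.com/heroknot/p/1234567.html', '第一次作业'),
#  ('https://www.cnblogs.com/heroknot/default.html?page=2', '下一页')]

# The "next page" URL only.
print(re.findall(r'href="(.*?)">下一页', sample))
# ['https://www.cnblogs.com/heroknot/default.html?page=2']

Note that this one-tag-per-line assumption is also what keeps the patterns usable in the script itself: on a single long line the non-greedy groups could capture across neighboring anchors.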
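Running the script writes a file named YYYY-MM-DD-py9博客情况汇总.txt, dated with the current day. Judging from for_every_name_urls, each student's section is their name and home page on one line, followed by one line per deduplicated post (name, post URL, post title, in no particular order since they come from a set), then a blank line. The sample below is a hypothetical illustration of the layout, not real crawl output:

朱子超 https://www.cnblogs.com/heroknot/
朱子超 https://www.cnblogs.com/heroknot/p/1234567.html 第一次作业
朱子超 https://www.cnblogs.com/heroknot/p/1234568.html 第二次作业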