# 刚接触 Python，试一下爬虫。拿自己的 Blog 开刀。
import pprint

import requests
from bs4 import BeautifulSoup

# Blog homepage (kept as a module-level constant for reference).
url = "https://www.cnblogs.com/zyqgold/"


def download_all_htmls(pages=7):
    """Download the HTML text of each blog list page.

    Args:
        pages: number of paginated list pages to fetch (default 7,
            matching the original hard-coded page count).

    Returns:
        A list of HTML strings, one per page, in page order.

    Raises:
        Exception: if any page responds with a non-200 status code.
        requests.RequestException: on network failure or timeout.
    """
    htmls = []
    for page in range(1, pages + 1):
        page_url = f"https://www.cnblogs.com/zyqgold/default.html?page={page}"
        # Timeout so a stalled server cannot hang the scraper forever.
        r = requests.get(page_url, timeout=10)
        if r.status_code != 200:
            # Include the URL and status so failures are diagnosable.
            raise Exception(f"error: HTTP {r.status_code} for {page_url}")
        htmls.append(r.text)
    return htmls


def parse_single_html(html):
    """Extract article entries from one list page's HTML.

    Args:
        html: raw HTML of a blog list page.

    Returns:
        A list of dicts with keys ``name`` (article title text) and
        ``link`` (article URL), one per post-title anchor found.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Post titles are <a class="postTitle2 vertical-middle"><span>title</span></a>.
    articles = soup.find_all("a", class_="postTitle2 vertical-middle")
    return [
        {"name": article.span.string, "link": article.attrs["href"]}
        for article in articles
    ]


if __name__ == "__main__":
    # Fetch every list page, collect all article entries, and pretty-print them.
    all_html = []
    for html in download_all_htmls():
        all_html.extend(parse_single_html(html))
    pprint.pprint(all_html)