Crawler approaches for several types of news sites, including dynamically loaded pages, that do not require analysing URLs. The code crawls news content from huanqiu.com (Global Times), chinanews.com (China News Service) and news.cn (Xinhua); it covers automatically filling in search keywords to crawl matching articles, as well as starting from a given page and crawling the pages that follow.
1. Crawling pages that are not dynamically loaded
(1) Crawl by analysing the URL structure and concatenating the target URL, e.g. url = 'http://s.huanqiu.com/' + 's?q=' + s_keyword + '&p=' + str(i). To crawl huanqiu.com news, the script builds, for every keyword and page number, the search URL of the page to be crawled.
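Because the keyword is Chinese text, it can help to make the URL encoding explicit when concatenating. The following is a minimal sketch of just the URL construction, assuming urllib.quote on a UTF-8 keyword; requests would also percent-encode a raw keyword on its own, so this is an optional clarification rather than part of the original script.

#-*- coding:utf-8 -*-
# Sketch only: build the search URL for one keyword over the first three pages.
import urllib

s_keyword = '朝鲜 军事'          # a UTF-8 search keyword (space separates the two terms)
for i in range(1, 4):
    # urllib.quote percent-encodes the keyword so the URL contains only ASCII
    url = 'http://s.huanqiu.com/' + 's?q=' + urllib.quote(s_keyword) + '&p=' + str(i)
    print(url)

The complete huanqiu.com crawler is listed below.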
#-*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import time
import os
from lxml import etree
import codecs

# Request header
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}
num = 0

# Fetch the HTML of a single link
def get_html(url1):
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}
    request = requests.get(url=url1, headers=header)
    response = request.content
    return response

# Extract the main text from the HTML and save it
def getInfobox(new_html):
    # Storage path
    path = "./huanqiu_mil_news/"
    mkdir(path)
    html_page = etree.HTML(new_html)
    try:
        # Locate each result's URL and publication date,
        # e.g. (http://world.huanqiu.com/article/2018-02/11625608.html 2018-02-27)
        links = html_page.xpath("//span[@class='link']")
        for link in links:
            new_link = link.text.split(' ')[0]   # article URL
            new_time = link.text.split(' ')[1]   # publication date
            mkdir(path + new_time)
            # Skip channel-cooperation, opinion and photo articles
            if (new_link.split('/')[-3] == 'Sociology') or (new_link.split('/')[-3] == 'opinion_world') \
                    or (new_link.split('/')[-3] == 'hqpl') or (new_link.split('/')[-3] == 'photo'):
                print('Skipping this article type')
                continue
            html = get_html(new_link)   # fetch the article HTML
            page = etree.HTML(html)     # parse it
            time.sleep(1)
            title = page.xpath("//h1")[0].text   # article title
            print('Title:')
            print(title)
            print('URL:')
            print(new_link)
            print('Published:')
            print(new_time)
            contents = page.xpath("//p")   # article paragraphs
            print('Content:')
            fileName = path + '/' + new_time + '/' + new_link.split('/')[-1].split('.')[0] + ".txt"
            info = codecs.open(fileName, 'w', 'utf-8')
            info.write('Title: ' + title + '\n')
            for i in range(len(contents)):
                print(contents[i].text)
                if contents[i].text is None:
                    continue
                else:
                    info.write(contents[i].text.strip(' '))
            print(fileName + ' saved.')
            info.close()
    except Exception, e:   # e.g. 'utf8' codec can't decode byte
        print("Error: ", e)
    finally:
        print('\n')

# Create the target folder if it does not exist
def mkdir(path):
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)   # makedirs also creates missing parent directories
        print("--- create new folder... ---" + path + "--- OK ---")

def main():
    source = open("search_keywords.txt", 'r')   # read the search keywords
    lines = source.readlines()
    for s_keyword in lines:
        s_keyword = unicode(s_keyword, 'utf-8')
        for i in range(1, 100):   # build a URL for every keyword/page combination
            print('***********************************\tPage ' + str(i) + '\t************************************')
            # Search URL containing the keyword and page number
            url = 'http://s.huanqiu.com/' + 's?q=' + s_keyword + '&p=' + str(i)
            new_html = get_html(url)   # fetch the result page
            getInfobox(new_html)       # extract the needed information
    print('End Read Files!')
    source.close()

if __name__ == '__main__':
    main()
The file search_keywords.txt contains the search terms, one per line (each pairs a country with 军事, "military"):
朝鲜 军事
韩国 军事
美国 军事
中国 军事
日本 军事
俄国 军事
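The main loop above always walks pages 1-99 for every keyword. A small guard, sketched below, could stop paging as soon as a result page contains no span.link elements, which is the selector the script already relies on; treat it as an optional refinement rather than part of the original script (it reuses s_keyword, get_html and getInfobox from above).

# Optional sketch: stop paging once a result page has no more hits.
from lxml import etree

for i in range(1, 100):
    url = 'http://s.huanqiu.com/' + 's?q=' + s_keyword + '&p=' + str(i)
    new_html = get_html(url)
    if not etree.HTML(new_html).xpath("//span[@class='link']"):
        break                      # no further results for this keyword
    getInfobox(new_html)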
(2) Crawling without analysing URLs: selenium simulates clicking the next-page button to fetch the following result page. For example, to crawl chinanews.com, the script simulates typing and submitting a search keyword, then fetches the target pages and extracts the article text.
#-*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import math

# Request header
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}
# Create the browser object
browser = webdriver.Chrome()
# Explicit-wait helper with a 10 s timeout
wait = WebDriverWait(browser, 10)
# Search page to start from
url = 'http://sou.chinanews.com.cn/'
# Download directory for the news files
path = './zxNews/'
if not os.path.exists(path):
    os.makedirs(path)
# Open the search page
browser.get(url)
keywords = raw_input('please input search keywords:')
browser.find_element_by_name('q').send_keys(keywords)   # type the keyword
browser.find_element_by_name('submitBtn').click()       # submit the search
print(u'Jumped to the search result page')

# Fetch the HTML of a single news link
def get_html(url):
    # print('get_html called')
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}
    request = requests.get(url=url, headers=header)
    response = request.content
    with open('./first_html.txt', 'wb') as f:
        f.write(response)
    return response

# Extract the main text from the HTML and save it
def get_content(html, url):
    soup = BeautifulSoup(html, "html.parser")
    if soup.find("div", class_="left_zw") is None:
        pass
    else:
        content = soup.find("div", class_="left_zw")
        print(content.get_text())
        textpath = url.split('/')[4] + url.split('/')[5] + '-' + url.split('/')[6]
        with open('./zxNews/' + textpath.split('.')[0] + '.txt', 'wb') as f:
            for string in content.strings:
                f.write(string.encode('utf-8'))
            print(textpath.split('.')[0] + '.txt saved.')

def main():
    news_num = raw_input('please enter the number of crawl items: ')
    links = browser.find_elements_by_xpath("//ul[@class = 'news_item']/li/a[@href]")
    # Number of result pages needed for the requested number of items
    news_page = math.ceil(float(news_num) / len(links))
    for i in range(1, int(news_page) + 1):
        time.sleep(1)
        # Collect the article URLs on the current result page
        links = browser.find_elements_by_xpath("//li[@class = 'news_title']/a[@href]")
        for link in links:
            html = get_html(link.get_attribute("href"))
            get_content(html, link.get_attribute("href"))
        nextBtn_href = "//a[@href='javascript:ongetkey(" + str(i) + ")']"   # locate the next-page button
        browser.find_element_by_xpath(nextBtn_href).click()                # simulate clicking to the next page

if __name__ == '__main__':
    main()
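The script creates a WebDriverWait object (wait) and imports expected_conditions but never uses them; replacing the fixed time.sleep(1) with an explicit wait for the next-page link is usually more robust. A minimal sketch under that assumption, reusing the wait, By and EC names already defined above:

# Sketch: wait for the next-page link to be clickable instead of sleeping.
nextBtn_href = "//a[@href='javascript:ongetkey(" + str(i) + ")']"
next_btn = wait.until(EC.element_to_be_clickable((By.XPATH, nextBtn_href)))
next_btn.click()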
2. Crawling dynamically loaded pages
Crawling without analysing URLs: selenium simulates loading the page. For example, to crawl Xinhua (news.cn) articles, the script takes the number of pages to crawl and simulates clicking the "load more" button that many times.
#-*- encoding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import requests
import os
from lxml import etree

# Request header
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}
# Create the browser object
browser = webdriver.Chrome()
# Explicit-wait helper with a 10 s timeout
wait = WebDriverWait(browser, 10)
# Page to crawl
url = 'http://www.news.cn/mil/gundong.htm'
# Download directory for the news files
path = './xinhua_news/'
if not os.path.exists(path):
    os.makedirs(path)
# Open the page
browser.get(url)
# Number of pages to crawl, entered on the console
news_page = int(raw_input('please enter crawl pages = '))
# Simulate clicking "load more" the requested number of times
for i in range(0, news_page):
    buttons = browser.find_elements_by_xpath("//li[@id='dataMoreBtn']")   # the "load more" button
    if not buttons:   # stop if the button is no longer present
        break
    buttons[0].click()

# Create the target folder if it does not exist
def mkdir(path):
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)   # makedirs also creates missing parent directories
        print("--- create new folder... ---" + path + "--- OK ---")

# Fetch the HTML of a single news link
def get_html(url):
    # print('get_html called')
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}
    request = requests.get(url=url, headers=header)
    response = request.content
    return response

def main():
    lis = browser.find_elements_by_xpath("//li[@class='clearfix']")                        # news items
    titles = browser.find_elements_by_xpath("//li[@class='clearfix']/h3")                  # titles
    links = browser.find_elements_by_xpath("//li[@class='clearfix']/h3/a[@href]")          # article URLs
    summarys = browser.find_elements_by_xpath("//li[@class='clearfix']/p[@class='summary']")   # summaries
    news_time = browser.find_elements_by_xpath("//span[@class='time']")                    # publication times
    try:
        for i in range(len(lis)):
            filedir = news_time[i].text.split(' ')[0]
            mkdir(path + filedir)
            f = open(path + filedir + '/' + links[i].get_attribute("href").split('/')[-1].split('.')[0] + '.txt', 'w')
            html = get_html(links[i].get_attribute("href"))
            print('URL:')
            print(links[i].get_attribute("href"))
            page = etree.HTML(html)
            html_contents = page.xpath("//p")   # article paragraphs
            content = ""
            print('Title:')
            print(titles[i].text + '\n')
            print('Time:')
            print(news_time[i].text + '\n')
            print('Summary:')
            print(summarys[i].text + '\n')
            print('Body:')
            for html_content in html_contents:
                if html_content.text is None:
                    continue
                else:
                    content = content + html_content.text
                    print(html_content.text)
            f.write('Title: ' + titles[i].text + '\n')
            f.write('Time: ' + news_time[i].text + '\n')
            f.write('Summary: ' + summarys[i].text + '\n')
            f.write('Body: ' + content.strip(' '))
            print(str(i) + ' saved.')
            f.close()
    except Exception, e:   # e.g. 'utf8' codec can't decode byte
        print("Error: ", e)
    finally:
        print('\n')
        browser.close()

if __name__ == '__main__':
    main()
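Clicking "load more" in a tight loop can outrun the page's asynchronous loading, so some of the requested pages may never render before the crawl starts. One way to pace the clicks is to wait until the number of li.clearfix items has grown before clicking again. This is a sketch of that refinement, not part of the original script; it reuses the browser, wait and news_page names defined above.

# Sketch: click "load more" only after the previous batch has been rendered.
for i in range(0, news_page):
    before = len(browser.find_elements_by_xpath("//li[@class='clearfix']"))
    buttons = browser.find_elements_by_xpath("//li[@id='dataMoreBtn']")
    if not buttons:
        break
    buttons[0].click()
    # Wait (up to the 10 s timeout of `wait`) for additional items to appear.
    wait.until(lambda d: len(d.find_elements_by_xpath("//li[@class='clearfix']")) > before)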