Crawler methods for various kinds of news sites, including dynamically loaded pages, that do not require analyzing URL patterns. The code covers crawling news content from Huanqiu (环球网), China News Service (中新网), and Xinhua (新华网); it includes crawling news by automatically filling in search keywords, as well as starting from a given page and crawling the pages that follow.

1. Crawling pages that are not dynamically loaded

(1) Analyze the URL structure and build each target URL by string concatenation, e.g. url = 'http://s.huanqiu.com/' + 's?q=' + s_keyword + '&p=' + str(i). To crawl Huanqiu (环球网) news, the script constructs a URL containing the search keyword and page number for every page it wants to fetch.
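Because the keyword is Chinese, it is safer to percent-encode it before splicing it into the query string. A minimal sketch of the URL construction (Python 2; the URL pattern comes from the full script below, while the example keyword and the explicit urllib.quote step are illustrative additions):

#-*- coding:utf-8 -*-
# Sketch only: build the Huanqiu search URL for one keyword and one page.
import urllib

s_keyword = u'朝鲜 军事'   # example keyword, see search_keywords.txt below
page = 1
quoted = urllib.quote(s_keyword.encode('utf-8'))  # percent-encode the UTF-8 bytes
url = 'http://s.huanqiu.com/' + 's?q=' + quoted + '&p=' + str(page)
print(url)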

#-*- coding:utf-8 -*-
# Python 2 script: crawl Huanqiu search results for a list of keywords.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import time
import os
from lxml import etree
import codecs

# Build the request headers
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}

# Fetch the HTML behind a single link
def get_html(url1):
    request = requests.get(url=url1, headers=header)
    response = request.content
    return response

# Extract the main text from the HTML and save it
def getInfobox(new_html):
    # Set the storage path
    path = "./huanqiu_mil_news/"
    mkdir(path)
    html_page = etree.HTML(new_html)
    try:
        links = html_page.xpath("//span[@class='link']")  # locate each article's URL and publication date
        # e.g. (http://world.huanqiu.com/article/2018-02/11625608.html 2018-02-27)
        for link in links:
            new_link = link.text.split(' ')[0]  # article URL
            new_time = link.text.split(' ')[1]  # publication date
            mkdir(path + new_time)
            # skip channel-cooperation, commentary and photo articles
            if (new_link.split('/')[-3] == 'Sociology') or (new_link.split('/')[-3] == 'opinion_world') or (new_link.split('/')[-3] == 'hqpl') or (new_link.split('/')[-3] == 'photo'):
                print('不获取该类型文本')
                continue
            html = get_html(new_link)  # fetch the article HTML
            page = etree.HTML(html)    # parse the HTML
            time.sleep(1)
            title = page.xpath("//h1")[0].text  # article title
            print('新闻标题:')
            print(title)
            print('新闻URL:')
            print(new_link)
            print('发布时间:')
            print(new_time)
            contents = page.xpath("//p")  # article body paragraphs
            print('新闻内容:')
            fileName = path + new_time + '/' + new_link.split('/')[-1].split('.')[0] + ".txt"
            info = codecs.open(fileName, 'w', 'utf-8')
            info.write('标题:' + title + '\n')
            for i in range(len(contents)):
                print(contents[i].text)
                if contents[i].text is None:
                    continue
                else:
                    info.write(contents[i].text.strip(' '))
            print(fileName + '下载完成!')
            info.close()

    except Exception as e:  # e.g. 'utf8' codec can't decode byte
        print("Error: " + str(e))
    finally:
        print('\n')

# Create the target folder
def mkdir(path):
    folder = os.path.exists(path)
    if not folder:  # create the folder only if it does not exist yet
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- create new folder...  ---" + path + "---  OK  ---")

def main():
    source = open("search_keywords.txt", 'r')  # read the list of keywords
    lines = source.readlines()
    for s_keyword in lines:
        s_keyword = unicode(s_keyword, 'utf-8').strip()  # decode and drop the trailing newline
        for i in range(1, 100):  # build a URL for every keyword/page combination
            print('***********************************\t第' + str(i) + '页\t************************************')
            # build the URL containing the keyword and page number
            url = 'http://s.huanqiu.com/' + 's?q=' + s_keyword + '&p=' + str(i)
            new_html = get_html(url)  # fetch the HTML behind the URL
            getInfobox(new_html)      # extract the fields we need from the page
    print('End Read Files!')
    source.close()

if __name__ == '__main__':
    main()

The contents of search_keywords.txt look like this:

朝鲜 军事
韩国 军事
美国 军事
中国 军事
日本 军事
俄国 军事

(2) A crawling method that does not require analyzing URLs: use selenium to simulate clicking the "next page" button and fetch the following page of results. For example, to crawl China News Service (中新网), the script simulates typing and submitting a search keyword, then opens the result pages and extracts the news content.
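The full script below simply sleeps for a fixed time between page turns. As a hedged alternative, selenium's explicit waits can block until the result list is actually present before it is read; this sketch reuses the element names ('q', 'submitBtn') and the result-title xpath from the script, while the example keyword and the 10-second timeout are arbitrary choices:

#-*- coding:utf-8 -*-
# Sketch only: submit a search on sou.chinanews.com.cn and wait explicitly
# for the result list instead of sleeping a fixed amount of time.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)          # each wait gives up after 10 seconds

browser.get('http://sou.chinanews.com.cn/')
browser.find_element_by_name('q').send_keys(u'军事')   # example keyword
browser.find_element_by_name('submitBtn').click()

# block until at least one result title is attached to the DOM
wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//li[@class = 'news_title']/a[@href]")))
links = browser.find_elements_by_xpath("//li[@class = 'news_title']/a[@href]")
print(len(links))
browser.quit()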

#-*- coding:utf-8 -*-
# Python 2 script: search chinanews.com with selenium and save each article.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import math

# Build the request headers
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}

# Create the browser object
browser = webdriver.Chrome()

# Set the load timeout
wait = WebDriverWait(browser, 10)

# Search page to start from
url = 'http://sou.chinanews.com.cn/'

# Set the download path for the news articles
path = './zxNews/'
if not os.path.exists(path):
    os.makedirs(path)

# Open the search page, type the keywords and submit the form
browser.get(url)
keywords = raw_input('please input search keywords:')
browser.find_element_by_name('q').send_keys(keywords)
browser.find_element_by_name('submitBtn').click()
print(u'跳转到查询详情页')


# Fetch the HTML behind a single link
def get_html(url):
    request = requests.get(url=url, headers=header)
    response = request.content
    with open('./first_html.txt', 'wb') as f:
        f.write(response)
    return response


# Extract the main text from the HTML and save it
def get_content(html, url):
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find("div", class_="left_zw")  # article body container
    if content is None:
        return
    print(content.get_text())
    textpath = url.split('/')[4] + url.split('/')[5] + '-' + url.split('/')[6]
    with open('./zxNews/' + textpath.split('.')[0] + '.txt', 'wb') as f:
        for string in content.strings:
            f.write(string.encode('utf-8'))
        print(textpath.split('.')[0] + '.txt 下载完成!')


def main():
    news_num = raw_input('please enter the number of crawl items: ')
    links = browser.find_elements_by_xpath("//ul[@class = 'news_item']/li/a[@href]")
    # number of result pages needed; use float division so the ceiling is correct
    news_page = math.ceil(float(news_num) / len(links))
    for i in range(1, int(news_page) + 1):
        # collect the article URLs on the current result page
        time.sleep(1)
        links = browser.find_elements_by_xpath("//li[@class = 'news_title']/a[@href]")
        for link in links:
            html = get_html(link.get_attribute("href"))
            get_content(html, link.get_attribute("href"))
        # locate the "next page" button and click it to turn the page
        nextBtn_href = "//a[@href='javascript:ongetkey(" + str(i) + ")']"
        try:
            browser.find_element_by_xpath(nextBtn_href).click()
        except NoSuchElementException:  # no further result pages
            break

if __name__ == '__main__':
    main()

2. Crawling dynamically loaded pages

A crawling method that does not require analyzing URLs: use selenium to drive the page itself. For example, to crawl Xinhua (新华网) news, specify how many pages to fetch and simulate clicking the "load more" button to pull in each additional batch of articles.
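The "load more" button on this page appends new list items instead of navigating to a new URL, so the crawler only needs to click it the requested number of times. A minimal sketch of that idea (the id 'dataMoreBtn', the li[@class='clearfix'] selector and the page URL come from the full script below; waiting for the item count to grow after each click is an illustrative addition):

#-*- coding:utf-8 -*-
# Sketch only: click "load more" and wait until new items have appeared
# before clicking again.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
browser.get('http://www.news.cn/mil/gundong.htm')

pages = 3   # example: load three extra batches of articles
for _ in range(pages):
    buttons = browser.find_elements_by_xpath("//li[@id='dataMoreBtn']")
    if not buttons:   # no "load more" button left on the page
        break
    before = len(browser.find_elements_by_xpath("//li[@class='clearfix']"))
    buttons[0].click()
    # wait until the list has grown beyond its previous length
    wait.until(lambda b: len(b.find_elements_by_xpath("//li[@class='clearfix']")) > before)

print(len(browser.find_elements_by_xpath("//li[@class='clearfix']")))
browser.quit()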

#-*- encoding:utf-8 -*-
# Python 2 script: crawl the Xinhua military news feed, which loads more
# articles dynamically when the "load more" button is clicked.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import requests
import time
import os
from lxml import etree

# Build the request headers
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8186.400'}

# Create the browser object
browser = webdriver.Chrome()

# Set the load timeout
wait = WebDriverWait(browser, 10)

# Page to crawl
url = 'http://www.news.cn/mil/gundong.htm'

# Set the download path for the news articles
path = './xinhua_news/'
if not os.path.exists(path):
    os.makedirs(path)

# Open the page
browser.get(url)
# Read the number of pages to crawl from the console
news_page = int(raw_input('please enter crawl pages =  '))
# Simulate clicking "load more" once per requested page
for i in range(0, news_page):
    buttons = browser.find_elements_by_xpath("//li[@id='dataMoreBtn']")  # locate the "load more" button
    if not buttons:  # stop if the button is no longer on the page
        break
    buttons[0].click()
    time.sleep(1)  # give the newly loaded items a moment to render

# Create the target folder
def mkdir(path):
    folder = os.path.exists(path)
    if not folder:  # create the folder only if it does not exist yet
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- create new folder...  ---" + path + "---  OK  ---")

# Fetch the HTML behind a single link
def get_html(url):
    request = requests.get(url=url, headers=header)
    response = request.content
    return response

def main():
    lis = browser.find_elements_by_xpath("//li[@class='clearfix']")  # list of news items
    titles = browser.find_elements_by_xpath("//li[@class='clearfix']/h3")  # list of titles
    links = browser.find_elements_by_xpath("//li[@class='clearfix']/h3/a[@href]")  # list of article URLs
    summarys = browser.find_elements_by_xpath("//li[@class='clearfix']/p[@class='summary']")  # list of summaries
    news_time = browser.find_elements_by_xpath("//span[@class='time']")  # list of publication times

    try:
        for i in range(len(lis)):
            filedir = news_time[i].text.split(' ')[0]
            mkdir(path + filedir)
            f = open(path + filedir + '/' + links[i].get_attribute("href").split('/')[-1].split('.')[0] + '.txt', 'w')
            html = get_html(links[i].get_attribute("href"))
            print('新闻URL:')
            print(links[i].get_attribute("href"))
            page = etree.HTML(html)
            html_contents = page.xpath("//p")  # article body paragraphs
            content = ""
            print('标题:')
            print(titles[i].text + '\n')
            print('时间:')
            print(news_time[i].text + '\n')
            print('摘要:')
            print(summarys[i].text + '\n')
            print('详情:')
            for html_content in html_contents:
                if html_content.text is None:
                    continue
                else:
                    content = content + html_content.text
                print(html_content.text)
            f.write('标题:' + titles[i].text + '\n')
            f.write('时间:' + news_time[i].text + '\n')
            f.write('摘要:' + summarys[i].text + '\n')
            f.write('详情:' + content.strip(' '))
            print(str(i) + ' 下载完成!')
            f.close()
    except Exception as e:  # e.g. 'utf8' codec can't decode byte
        print("Error: " + str(e))
    finally:
        print('\n')
        browser.close()


if __name__ == '__main__':
    main()