First, a quick look at the result: the crawled comments saved to a txt file.

Grab the comments first; the jieba word-cloud analysis and the geo distribution of comment regions come later.

(Screenshots: the saved txt files of crawled comments)

Straight to the API. That's not against the law, right? Everyone knows these anyway~~~

http://m.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1
http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1

The m.maoyan.com endpoint got my own machine flagged because I hammered it too often while testing, haha. The api.maoyan.com one is an endpoint I captured years ago and it still works. Beautiful! Endpoint 2 does not trigger the captcha!
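To see what endpoint 2 gives back, here is a minimal sketch with plain requests (the movie id 1263235 is just the one used later in this post, and the field names match the sample comment JSON further down):

import requests

# quick sanity check of endpoint 2; it did not hit the captcha in my tests
url = "http://api.maoyan.com/mmdb/comments/movie/1263235.json?_v_=yes&offset=1"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"}
res = requests.get(url, headers=headers, timeout=10)
data = res.json()
print(data.get("total"))            # total number of comments
for c in data.get("cmts", [])[:3]:  # first few comments on this page
    print(c["nickName"], c["cityName"], c["score"], c["content"])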

When scraping endpoint 1, Meituan's slider captcha pops up. As it happens, I had read 擦姐's article on slider captchas before, so I tried bringing that approach over.

That is the article I was following at the time; its test URL is no longer valid, so a captcha I actually needed to pass made a handy learning case.
How to set up and use Selenium is covered there too, in 擦姐's write-up on getting Selenium and Python to work together.

Simply great; she has everything!

If you are missing any of the modules used in this post, install them yourself; a plain pip install xxx of the latest version is fine.
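Concretely, for the imports used in this post that should be something like (these are the usual PyPI names for the imports below; wordcloud is only needed for the word cloud at the end):

pip install selenium requests beautifulsoup4 pypinyin jieba wordcloud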

1. Next up: the slider captcha test.

Slider source code (delete the useless commented-out lines yourself):

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
import traceback
import time
import random, sys
# driver = webdriver.Firefox()
# # maximize the browser window
# driver.maximize_window()
# # open the registration page
# driver.get('https://reg.taobao.com/member/reg/fill_mobile.htm')

# navigator.appVersion
def selenium_get_html(url="http://m.maoyan.com/mmdb/comments/movie/341516.json?_v_=yes&offset=1"):
    # url = "http://m.maoyan.com/mmdb/comments/movie/341516.json?_v_=yes&offset=1"
    option = webdriver.ChromeOptions()
    # headless mode, no browser window
    # option.add_argument('headless')
    # disable images and javascript
    # prefs = {
    #     'profile.default_content_setting_values': {
    #         'images': 2,
    #         'javascript':2
    #     }
    # }
    # option.add_experimental_option('prefs', prefs)
    # suppress some useless Chrome log output
    option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
    driver = webdriver.Chrome(chrome_options=option)
    # driver.set_window_size(200,200)
    # driver.maximize_window()

    driver.get(url)

    locator = (By.ID, 'yodaMoveingBar')
    try:
        a = WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(locator))
        time.sleep(1)
        # the slider handle
        yodaBox = driver.find_element_by_id("yodaBox")
        # print(yodaBox.size)
        # the slider track area
        source = driver.find_element_by_id("yodaBoxWrapper")
        # print(source.size, source.size["width"], type(source.size["width"]))

        ActionChains(driver).drag_and_drop_by_offset(yodaBox, source.size["width"], source.size["height"]).perform()

    except TimeoutException:
        print('Timed out waiting for the slider...')
        sys.exit(1)
    except BaseException as e:
        print('repr(e):\t', repr(e))
        # the next two calls both print where the error happened
        traceback.print_exc()
        print('traceback.format_exc():\n%s' % traceback.format_exc())
    finally:
        time.sleep(12)
        current_url = driver.current_url   # read the page URL before quitting the driver
        driver.quit()
    return current_url   # the URL the browser ended up on

if __name__ =='__main__':
    url = 'http://m.maoyan.com/mmdb/comments/movie/1263235.json?_v_=yes&offset=1'
    # run the slider verification
    a = selenium_get_html(url)
    print(a)

Notes

This waits for the page to finish rendering; only once the slider node shows up can we act on it (an explicit wait: up to 5 seconds, polling every 0.5 seconds).

locator = (By.ID, 'yodaMoveingBar')
a = WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(locator))

Grab the slider handle and drag it by the width/height of the track area, i.e. to that x/y offset.

# the slider handle
yodaBox = driver.find_element_by_id("yodaBox")
# print(yodaBox.size)
# the slider track area
source = driver.find_element_by_id("yodaBoxWrapper")
# print(source.size, source.size["width"], type(source.size["width"]))

ActionChains(driver).drag_and_drop_by_offset(yodaBox, source.size["width"], source.size["height"]).perform()

It did succeed a couple of times at the start, but after that it never worked again, hahaha; the requests kept coming back as abnormal, and I don't yet know how to fix or tune that. In the end, to get the crawl to succeed, I passed the slider by hand and then carried on with the crawling below. The crawling code below does include this verification step; if it doesn't work for you, comment it out and do the slider verification manually.

2. The comment endpoint needs the movie id, so first fetch the id through the search suggest endpoint.

Endpoint: https://maoyan.com/ajax/suggest?kw={keyword}
The returned data needs to be parsed as JSON. The util helper module was in my previous post, but I'll paste it again below. I also wrote a super simple exception wrapper, hahaha. Both are pasted further down; don't laugh at me.

The method below only returns the first element of the result list; it hasn't pulled the id out yet.
# search by keyword; returns only the first movie in the result list
def get_movies(keyword):
    html = util.get_html(f'https://maoyan.com/ajax/suggest?kw={keyword}')
    # print(html)
    mvs = json.loads(html)['movies']['list']
    # print(mvs)
    if len(mvs) == 0:
        raise SkipException(f'No movie found for {keyword}')
    return mvs[0]
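For instance, with the keyword from the commented-out test line near the bottom of the full listing later in this post, the id and name are read off the returned dict like this (just an illustration):

mv = get_movies('怒火·重案')
print(mv['id'], mv['nm'])   # the 'id' and 'nm' fields are what scrawl_mv uses later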

The util module, with a function run-time decorator added:

import os, time, requests, random, telnetlib, json, pypinyin
from bs4 import BeautifulSoup
__dir__ = os.path.dirname(os.path.abspath(__file__))
# print(__dir__)

def get_headers(localhost=True, refer="https://www.baidu.com", host=None):

	ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
	if not localhost:
		uas = [
			"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
			"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
			"Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
			"Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
			"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
			"Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
			"Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
			"Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
			"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
			"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
			"Sosospider+(+http://help.soso.com/webspider.htm)",
			"Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
		]
		ua = random.choice(uas)
	headers = {
		"User-Agent": ua,
		"Referer": refer,
		"Host": host
	}
	return headers

def get_html(url, ret_type="text", timeout=50, encoding="utf-8"):
	headers = get_headers()
	res = requests.get(url, headers=headers, timeout=timeout)
	res.encoding = encoding
	# print(res.status_code)
	# print(res.text)
	if ret_type == "text":
		return res.text
	elif ret_type == "image":
		return res.content
	elif ret_type == "json":
		return res.json()

# decorator that prints a function's run time
def run_time(func):
    # the inner wrapper function can have any name
    def wrapper(*args):
        s_time = time.perf_counter()
        res = func(*args)
        e_time = time.perf_counter()
        print(f"func --- {func.__name__}, runtime --- {e_time-s_time}")
        return res
    return wrapper

# exception class, inherits from Exception

# an exception raised to skip the current item
class SkipException(Exception):
    def __init__(self, msg):
        super().__init__(msg)   # so that print(e) shows the message
        print(f'Non-fatal exception; raised so the main flow is not interrupted... {msg}')
        self.msg = msg
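As a quick illustration of the run_time decorator above (the target function here is made up; in the crawler itself it is applied as @util.run_time since the helper lives in the util module):

@run_time
def slow_add(a, b):
    time.sleep(0.3)
    return a + b

slow_add(1, 2)   # prints something like: func --- slow_add, runtime --- 0.300...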

3. With the movie id in hand, the crawl can start. The endpoint returns JSON; testing showed it only returns data up to offset=1000, so the program simply skips anything past 1000, since there is no data there.
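Put differently, the offset advances by a full page of 15 per request and never needs to go past 1000. A tiny sketch of that math (the total of 2345 is made up for illustration), mirroring what do_scrawl and save_batch do below:

size = 15                        # comments returned per request
total = 2345                     # pretend value read from the JSON 'total' field
total = min(total, 1000)         # the endpoint returns nothing past offset 1000
pages = round(total / size)      # 67 pages in this example
offsets = [p * size + 1 for p in range(pages)]   # 1, 16, 31, ..., 991
print(pages, offsets[:3], offsets[-1])

And the entry point that ties the search and the crawl together: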

# crawl comments for movie XX
def scrawl_mv(keyword):
    try:
        mv = get_movies(keyword)
    except SkipException as e:
        print(e)
        return   # no movie found, nothing to crawl

    movie_id = mv['id']
    movie_name = mv['nm']
    print(type(mv), movie_id, movie_name)
    url = f'http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1'
    print(f'Crawling --- [{movie_name}] ==== page 1 ======= {url}')
    do_scrawl(url, movie_name)

The valid call inside do_scrawl runs the slider verification right on the first request. It may keep failing, so I retry up to 3 times; if it still fails, sys.exit(1) bails out.

try:
    html = util.get_html(url)
    print('First request: loop and check whether slider verification is needed')
    # first request: loop and check whether slider verification is needed
    html = valid(html, url)
except SkipException as obj:
    print(obj)
    sys.exit(1)

The slider may well not work; honestly, just verify it by hand, haha, then comment out that code and run the crawl.

Alright, here is all of the code. A bit over 200 lines; not too much, right?

import sys, os
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../common')))
import util, maoyan_jieba
from exp import SkipException
import requests, json, time, random, slide_selenium, threading
from bs4 import BeautifulSoup

# parse one page of data
def parse_ono_page(html):
    try:
        data = json.loads(html)['cmts'] # the comments are stored under 'cmts' in the JSON
    except Exception:
        raise SkipException('JSON parse error: no cmts field')
    #data = json.loads(html)['hcmts'] # hot comments live under 'hcmts' instead
    for item in data:
        yield { # yields one dict per comment
            'comment':item['content'],
            'date':item['time'].split(' ')[0],
            'rate':item['score'],
            'city':item['cityName'],
            'nickname':item['nickName']
        }

# return the total number of comments
def parse_ono_pages(html):
    # total count
    total = json.loads(html)['total']
    return total

# {"approve":0,"assistAwardInfo":{"avatar":"","celebrityId":0,"celebrityName":"","rank":0,"title":""},"avatarurl":"https://img.meituan.net/maoyanuser/0d20974fe7a2dcb726680f4d94493b8511096.png","cityName":"北京","content":"刘德华演技在线,画面真美,剧情太烂!","id":1143035845,"isMajor":false,"juryLevel":0,"movieId":341516,"nick":"zhangsq0812","nickName":"zhangsq0812","oppose":0,"pro":false,"reply":0,"score":0.5,"spoiler":0,"startTime":"2021-09-10 11:55:57","supportComment":true,"supportLike":true,"sureViewed":1,"tagList":{"fixed":[{"id":2,"name":"购票差评"},{"id":4,"name":"购票"}]},"time":"2021-09-10 11:55","userId":220211944,"userLevel":0,"vipType":0}
#save the parsed comments to a text file
def save_to_txt(url, filepath=os.path.join(__dir__, f'../files/{str(round(time.time() * 1000))}.txt')):
    # note: the default filepath is evaluated once at import time, so pass filepath explicitly per movie
    html = util.get_html(url)
    # print(filepath)
    try:
        cmts = list(parse_ono_page(html))   # materialise the generator so JSON errors surface here
    except Exception:
        raise SkipException('error while parsing the comment JSON')

    with open(filepath, 'a', encoding='utf-8') as f:
        for item in cmts:
            # print(item)
            f.write(item['date'] + ',' + item['nickname'] + ',' + item['city'] + ',' + str(item['rate']) + ',' + item['comment'] + '\n')

# fetched comments may contain duplicates; dedupe them so the final stats are accurate
def delete_repeat(old, new):
    with open(old, 'r', encoding='UTF-8') as oldfile, open(new, 'w', encoding='UTF-8') as newfile:
        content_list = oldfile.readlines()      # every line read from the raw file
        content_already_distinct = []           # comments already seen
        for line in content_list:
            if line not in content_already_distinct:   # keep only the first occurrence
                newfile.write(line)                     # line already ends with '\n'
                content_already_distinct.append(line)

# fetch one page of comments and append it to the txt file
def scrawl(url):
    print(f'Crawling {url}')
    save_to_txt(url)   # save_to_txt fetches the url itself

# the first request may hit the slider captcha and need verification
@util.run_time
def valid(html, url):
    times = 1
    while True:
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.select('title')
        print(titles)
        if len(titles) > 0:
            title = titles[0].text
            if '验证' in title:   # the captcha page title contains '验证' (verification)
                slide_selenium.selenium_get_html(url)
                time.sleep(1)
                html = util.get_html(url)
                times += 1
            else:
                break
        else:
            break
            # return html
        if times > 3:
            raise SkipException('could not pass the slider verification, error')
    return util.get_html(url)

# look up movie info by id
@util.run_time
def get_movie_name(movie_id):
    """
    :param movie_id: the movie id
    """
    html = util.get_html(f'http://api.maoyan.com/mmdb/movie/v5/{movie_id}.json')
    data = json.loads(html)['data']['movie']
    return data

# search by keyword; returns only the first movie in the result list
def get_movies(keyword):
    html = util.get_html(f'https://maoyan.com/ajax/suggest?kw={keyword}')
    # print(html)
    mvs = json.loads(html)['movies']['list']
    # print(mvs)
    if len(mvs) == 0:
        raise SkipException(f'No movie found for {keyword}')
    return mvs[0]

# crawl a comments url
@util.run_time
def do_scrawl(url, movie_name=f'movie{str(round(time.time() * 1000))}'):
    """
    :param url: the first-page comments url to crawl
    :param movie_name: movie name, used as the txt file name
    """
    try:
        html = util.get_html(url)
        print('First request: loop and check whether slider verification is needed')
        # first request: loop and check whether slider verification is needed
        html = valid(html, url)
    except SkipException as obj:
        print(obj)
        sys.exit(1)
    # print(html)
    # total number of comments
    total = parse_ono_pages(html)
    # the endpoint only returns the first 1000 comments
    if total > 1000:
        total = 1000
    # comments returned per request
    size = 15
    # number of pages, rounded
    pages = round(total/size)
    # number of threads (picked at random)
    # thrs = 2
    thrs = random.randint(2,10)
    # pages each thread has to crawl
    works = round(pages / thrs)

    # if threads * pages-per-thread < total pages, one more thread is needed
    if thrs * works < pages:
        thrs += 1

    # JarProjectPath is a helper in my util module that isn't pasted above; it resolves the project root
    root_path = util.JarProjectPath.project_root_path('py')
    filepath = root_path + f'files/{movie_name}.txt'
    print(f'{total} comments, {size} per page, {pages} pages to crawl, randomly picked [{thrs}] threads, each crawling [{works}] pages')
    # crawl with multiple threads
    l = []
    # thread numbers start at 1
    for i in range(1, thrs+1):
        if i == thrs and pages % works != 0:
            # the last thread must not go past the last crawlable page
            t = threading.Thread(target=save_batch, args=(url, i, works*(i-1), works*(i-1) + pages % works, filepath))
        else:
            t = threading.Thread(target=save_batch, args=(url, i, works*(i-1), works*i, filepath))
        l.append(t)
        print("Thread {} started".format(i))
        t.start()

    for p in l:
        p.join()

    print("All threads finished; crawl complete")
    print(f'{total} comments, {size} per page, {pages} pages to crawl, randomly picked [{thrs}] threads, each crawling [{works}] pages')

# crawl comments for movie XX
def scrawl_mv(keyword):
    try:
        mv = get_movies(keyword)
    except SkipException as e:
        print(e)
        return   # no movie found, nothing to crawl

    movie_id = mv['id']
    movie_name = mv['nm']
    print(type(mv), movie_id, movie_name)
    url = f'http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1'
    print(f'Crawling --- [{movie_name}] ==== page 1 ======= {url}')
    do_scrawl(url, movie_name)

# worker for the multi-threaded crawl
def save_batch(url, no, start, end, filepath):
    # the endpoint pages by offset: if the offset only moved by 1, page 2 (comments 2-16) would overlap
    # page 1 (comments 1-15) and create duplicates, so the offset advances by a full page of 15 per loop
    size = 15 # fixed page size
    base = url.split('offset=')[0]   # strip the offset so each page is requested from the same base url
    for i in range(start, end):
        # random delay against anti-crawling measures
        time.sleep(1 + float(random.randint(1,100)) / 20)
        page_url = f'{base}offset={i*size+1}'
        print(f'Thread.{no} >>> saving --- {page_url}')
        try:
            save_to_txt(page_url, filepath)
        except SkipException:
            continue

# threading test
@util.run_time
def thread_test(movie_name):
    # total number of comments
    total = 1530
    # comments returned per request
    size = 15
    # number of pages, rounded
    pages = round(total/size)
    # pages per thread
    works = 50

    # maximum number of threads needed (round up when there is a remainder)
    r = pages // works + (1 if pages % works > 0 else 0)
    print(f'{total} comments, {size} per page, {pages} pages to crawl, [{works}] pages per thread, so [{r}] threads are needed')

    root_path = util.JarProjectPath.project_root_path('py')
    filepath = root_path + f'files/{movie_name}.txt'

    # l = []
    # for i in range(1, r+1):
    #     if i == r:
    #         # the last thread must not go past the last crawlable page
    #         t = ThreadCrawl(str(i), save_batch, works*(i-1), works*(i-1) + pages % works, filepath)
    #     else:
    #         t = ThreadCrawl(str(i), save_batch, works*(i-1), works*i, filepath)
    #     l.append(t)
    #     t.start()

    # for p in l:
    #     p.join()

    print("All threads finished; crawl complete")

class ThreadCrawl(threading.Thread):
    """
    :param thread_name: thread name
    :param func: the function the thread should run
    """
    def __init__(self, thread_name, func, *args):
        # call the parent class initialiser
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        self.func = func    # store the function and its arguments; run() executes it
        self.args = args
        print('Thread initialised', *args)
    def run(self):
        print(f'Thread {self.threadName}: ************ started ************')
        self.func(*self.args)
        
if __name__ =='__main__':
    # print(get_movies('怒火·重案'))
    movie_name = '我的青春有个你'
    scrawl_mv(movie_name)

    filepath = f'files/{movie_name}.txt'
    
    if os.path.exists(filepath):
        print(os.path.abspath(filepath))
        maoyan_jieba.analysis(os.path.abspath(filepath))

Most of it has comments, haha.

After the txt is crawled, I went on to do the word-cloud analysis and the geo distribution of comments, roughly as shown below. The maoyan_jieba code isn't pasted in this post yet; a rough placeholder sketch of it follows after the screenshots.

(Screenshots: the jieba word cloud and the geo distribution of comments)
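Since maoyan_jieba isn't pasted yet, here is only a minimal sketch of what an analysis(filepath) along those lines could look like, built on jieba plus the wordcloud package. The column layout matches what save_to_txt writes, but the font path and output name are made up, and the geo part isn't sketched here:

import jieba
from wordcloud import WordCloud

def analysis(filepath):
    # each txt line is: date,nickname,city,rate,comment (see save_to_txt above)
    comments = []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(',', 4)
            if len(parts) == 5:
                comments.append(parts[4])
    # cut the comments into words and drop single characters
    words = [w for w in jieba.cut(' '.join(comments)) if len(w) > 1]
    wc = WordCloud(font_path='msyh.ttc',    # any font file that can render Chinese
                   width=800, height=600, background_color='white')
    wc.generate(' '.join(words))
    wc.to_file(filepath.replace('.txt', '.png'))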