First, the result: the scraped comments saved to a txt file.
The plan is to grab the comments first, then do jieba word-cloud analysis and a geo distribution of the comment regions later.
Let's go straight to the API endpoints. They're widely known, so nothing shady here~~~
Endpoint 1: http://m.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1
Endpoint 2: http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1
The m.maoyan.com endpoint got my own machine flagged because I hit it too often while testing, haha. The api.maoyan.com one is an endpoint I captured years ago and it still works. Beautiful!!! Endpoint 2 does not trigger the verification!!!
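Before wiring up the crawler, a quick way to see what endpoint 2 gives back is a plain requests call. This is just a minimal probe I'm sketching here (the movie id 1263235 is simply the one used later in this post; the total and cmts fields match the sample response shown further down):

import requests

# quick probe of endpoint 2; 1263235 is the movie id used later in this post
url = 'http://api.maoyan.com/mmdb/comments/movie/1263235.json?_v_=yes&offset=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
res = requests.get(url, headers=headers, timeout=30)
try:
    data = res.json()
    # 'total' and 'cmts' are the fields the crawler below relies on
    print('total comments:', data.get('total'), '| on this page:', len(data.get('cmts', [])))
except ValueError:
    # a non-JSON response usually means the verification page came back
    print('non-JSON response, probably the slider verification page')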
When scraping the m.maoyan.com endpoint you will run into Meituan's slider verification. Luckily I had read 擦姐's article on sliders before, so I tried applying it here.
That is the article I followed at the time; its test URL no longer works, so running into a slider of my own was a good chance to practice on.
擦姐 has also written up how to configure and use Selenium, and how Selenium and Python work together.
Simply great, she has everything!!!
If any of the modules used below are missing, install them yourself; a simple pip install xxx with the latest version is fine.
1. Next, the slider test.
Slider source code (delete the leftover commented-out lines if you don't need them):
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
import traceback
import time
import random, sys

# driver = webdriver.Firefox()
# # maximize the browser window
# driver.maximize_window()
# # open the registration page
# driver.get('https://reg.taobao.com/member/reg/fill_mobile.htm')
# navigator.appVersion

def selenium_get_html(url="http://m.maoyan.com/mmdb/comments/movie/341516.json?_v_=yes&offset=1"):
    # url = "http://m.maoyan.com/mmdb/comments/movie/341516.json?_v_=yes&offset=1"
    option = webdriver.ChromeOptions()
    # headless (no window) mode
    # option.add_argument('headless')
    # disable images / JavaScript
    # prefs = {
    #     'profile.default_content_setting_values': {
    #         'images': 2,
    #         'javascript': 2
    #     }
    # }
    # option.add_experimental_option('prefs', prefs)
    # suppress some useless log output
    option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
    driver = webdriver.Chrome(chrome_options=option)
    # driver.set_window_size(200, 200)
    # driver.maximize_window()
    driver.get(url)
    locator = (By.ID, 'yodaMoveingBar')
    try:
        a = WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(locator))
        time.sleep(1)
        # locate the slider button
        yodaBox = driver.find_element_by_id("yodaBox")
        # print(yodaBox.size)
        # the slider track area
        source = driver.find_element_by_id("yodaBoxWrapper")
        # print(source.size, source.size["width"], type(source.size["width"]))
        # drag the slider across the full width of the track
        ActionChains(driver).drag_and_drop_by_offset(yodaBox, source.size["width"], source.size["height"]).perform()
    except TimeoutException:
        print('Timed out waiting for the slider...')
        sys.exit(1)
    except BaseException:
        # print the exact location of the error
        traceback.print_exc()
        print('traceback.format_exc():\n%s' % traceback.format_exc())
    finally:
        time.sleep(12)
        current_url = driver.current_url  # current_url gives the URL of the page we ended up on
        driver.quit()
    return current_url

if __name__ == '__main__':
    url = 'http://m.maoyan.com/mmdb/comments/movie/1263235.json?_v_=yes&offset=1'
    # run the slider verification
    a = selenium_get_html(url)
    print(a)
Notes
This waits for the page to finish rendering; only once the slider node appears can we interact with it.
locator = (By.ID, 'yodaMoveingBar')
a = WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(locator))
Grab the slider and drag it to the target x/y offset.
# locate the slider button
yodaBox = driver.find_element_by_id("yodaBox")
# print(yodaBox.size)
# the slider track area
source = driver.find_element_by_id("yodaBoxWrapper")
# print(source.size, source.size["width"], type(source.size["width"]))
ActionChains(driver).drag_and_drop_by_offset(yodaBox, source.size["width"], source.size["height"]).perform()
It worked the first few times, but after that it never succeeded again, hahaha; the request kept failing and I don't yet know how to tune it. In the end, to get the crawl to succeed, I verified the slider manually and then continued with the crawling below. The crawling code below does include this verification step; if it doesn't work for you, comment it out and do the slider verification by hand.
2. The comment endpoint needs a movie id, so first get the id through the search (suggest) endpoint.
Endpoint: https://maoyan.com/ajax/suggest?kw={keyword}
The response has to be parsed as JSON. My util helper module appeared in the previous post; it's pasted again below. I also wrote a super simple exception wrapper, haha, also pasted below, please don't laugh.
The method below returns a single item from the list; it does not extract the id yet.
# Search movies by keyword; return only the first hit
def get_movies(keyword):
    html = util.get_html(f'https://maoyan.com/ajax/suggest?kw={keyword}')
    # print(html)
    mvs = json.loads(html)['movies']['list']
    # print(mvs)
    if len(mvs) == 0:
        raise SkipException(f'No results for {keyword}')
    return mvs[0]
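For reference, the id is then pulled out of the returned dict, exactly as scrawl_mv does further down ('id' is the movie id, 'nm' the title):

mv = get_movies('我的青春有个你')
movie_id = mv['id']
movie_name = mv['nm']
print(movie_id, movie_name)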
The util module, with a function-runtime decorator added:
import os, time, requests, random, telnetlib, json, pypinyin
from bs4 import BeautifulSoup

__dir__ = os.path.dirname(os.path.abspath(__file__))
# print(__dir__)

def get_headers(localhost=True, refer="https://www.baidu.com", host=None):
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    if not localhost:
        uas = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
    headers = {
        "User-Agent": ua,
        "Referer": refer,
        "Host": host
    }
    return headers

def get_html(url, ret_type="text", timeout=50, encoding="utf-8"):
    headers = get_headers()
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = encoding
    # print(res.status_code)
    # print(res.text)
    if ret_type == "text":
        return res.text
    elif ret_type == "image":
        return res.content
    elif ret_type == "json":
        return res.json()

# Decorator: measure a function's run time
def run_time(func):
    # the wrapper name here can be anything
    def wrapper(*args):
        s_time = time.perf_counter()
        res = func(*args)
        e_time = time.perf_counter()
        print(f"func --- {func.__name__}, runtime --- {e_time-s_time}")
        return res
    return wrapper

# Exception class, inherits from Exception
# raised when an item should be skipped
class SkipException(Exception):
    def __init__(self, msg):
        print(f'Non-fatal exception, raised as a hint so the main program can keep running... {msg}')
        self.msg = msg
3. With the id in hand, the scraping can start. The endpoint returns JSON; testing showed it only serves data up to offset=1000, so anything beyond 1000 is skipped in the program because there is no data there.
# Crawl comments for a given movie keyword
def scrawl_mv(keyword):
    try:
        mv = get_movies(keyword)
    except SkipException as e:
        print(e)
        return  # no match found, nothing to crawl
    movie_id = mv['id']
    movie_name = mv['nm']
    print(type(mv), movie_id, movie_name)
    url = f'http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1'
    print(f'Crawling --- [{movie_name}] ==== page one ======= {url}')
    do_scrawl(url, movie_name)
The valid call inside do_scrawl runs the slider verification right on the first request. Since it may keep failing, I retry up to 3 times; after that it gives up with sys.exit(1).
try:
    html = util.get_html(url)
    print('First request: loop to check whether verification is required')
    # First request: loop to check whether verification is required
    html = valid(html, url)
except SkipException as obj:
    print(obj)
    sys.exit(1)
The slider may well not work; I suggest verifying manually, haha, then commenting out that code and running again.
Alright, here is the complete code... a bit over 200 lines, not too much.
import sys, os
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../common')))
import util, maoyan_jieba
from exp import SkipException
import requests, json, time, random, slide_selenium, threading
from bs4 import BeautifulSoup
# Parse one page of data
def parse_ono_page(html):
    try:
        data = json.loads(html)['cmts']  # comments come back as JSON, so slice them out of the JSON
    except Exception:
        raise SkipException('JSON parse error, cmts not found')
    # data = json.loads(html)['hcmts']  # hot comments are stored the same way
    for item in data:
        yield {  # this generator yields one dict per comment
            'comment': item['content'],
            'date': item['time'].split(' ')[0],
            'rate': item['score'],
            'city': item['cityName'],
            'nickname': item['nickName']
        }

# Return the total number of comments
def parse_ono_pages(html):
    # total count
    total = json.loads(html)['total']
    return total
# {"approve":0,"assistAwardInfo":{"avatar":"","celebrityId":0,"celebrityName":"","rank":0,"title":""},"avatarurl":"https://img.meituan.net/maoyanuser/0d20974fe7a2dcb726680f4d94493b8511096.png","cityName":"北京","content":"刘德华演技在线,画面真美,剧情太烂!","id":1143035845,"isMajor":false,"juryLevel":0,"movieId":341516,"nick":"zhangsq0812","nickName":"zhangsq0812","oppose":0,"pro":false,"reply":0,"score":0.5,"spoiler":0,"startTime":"2021-09-10 11:55:57","supportComment":true,"supportLike":true,"sureViewed":1,"tagList":{"fixed":[{"id":2,"name":"购票差评"},{"id":4,"name":"购票"}]},"time":"2021-09-10 11:55","userId":220211944,"userLevel":0,"vipType":0}
# Save data to a text file
def save_to_txt(url, filepath=os.path.join(__dir__, f'../files/{str(round(time.time() * 1000))}.txt')):
    html = util.get_html(url)
    # print(filepath)
    try:
        cmts = parse_ono_page(html)
    except Exception:
        raise SkipException('JSON parsing failed')
    for item in cmts:
        # print(item)
        with open(filepath, 'a', encoding='utf-8') as f:
            f.write(item['date'] + ',' + item['nickname'] + ',' + item['city'] + ',' + str(item['rate']) + ',' + item['comment'] + '\n')

# Scraped comments may contain duplicates; deduplicate so the final statistics stay honest
def delete_repeat(old, new):
    oldfile = open(old, 'r', encoding='UTF-8')
    newfile = open(new, 'w', encoding='UTF-8')
    content_list = oldfile.readlines()   # all lines read in
    content_already_distinct = []        # comments seen so far, without duplicates
    for line in content_list:
        if line not in content_already_distinct:  # not a duplicate
            newfile.write(line)
            content_already_distinct.append(line)
    oldfile.close()
    newfile.close()
# Fetch a page and save it to txt
def scrawl(url):
    print(f'Crawling {url}')
    # save_to_txt fetches the page itself, so just hand it the url
    save_to_txt(url)

# The first visit may require manual slider verification
@util.run_time
def valid(html, url):
    times = 1
    while True:
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.select('title')
        print(titles)
        if len(titles) > 0:
            title = titles[0].text
            if '验证' in title:
                # the page title contains "验证" (verification), so run the slider
                slide_selenium.selenium_get_html(url)
                time.sleep(1)
                html = util.get_html(url)
                times += 1
            else:
                break
        else:
            break
        # return html
        if times > 3:
            raise SkipException('Slider verification could not be passed, error')
    return util.get_html(url)
# Look up a movie's details by id
@util.run_time
def get_movie_name(movie_id):
    """
    :param movie_id: the movie id
    """
    html = util.get_html(f'http://api.maoyan.com/mmdb/movie/v5/{movie_id}.json')
    data = json.loads(html)['data']['movie']
    return data

# Search movies by keyword; return only the first hit
def get_movies(keyword):
    html = util.get_html(f'https://maoyan.com/ajax/suggest?kw={keyword}')
    # print(html)
    mvs = json.loads(html)['movies']['list']
    # print(mvs)
    if len(mvs) == 0:
        raise SkipException(f'No results for {keyword}')
    return mvs[0]
# Crawl a url
@util.run_time
def do_scrawl(url, movie_name=f'movie{str(round(time.time() * 1000))}'):
    """
    :param url: the link to crawl
    :param movie_name: movie title, used as the txt file name
    """
    try:
        html = util.get_html(url)
        print('First request: loop to check whether verification is required')
        # First request: loop to check whether verification is required
        html = valid(html, url)
    except SkipException as obj:
        print(obj)
        sys.exit(1)
    # print(html)
    # total number of comments
    total = parse_ono_pages(html)
    # the endpoint only serves the first 1000 comments
    if total > 1000:
        total = 1000
    # comments per page returned by the endpoint
    size = 15
    # number of pages, rounded
    pages = round(total / size)
    # workload (pages) per thread
    # thrs = 2
    thrs = random.randint(2, 10)
    works = round(pages / thrs)
    # if threads * pages-per-thread < total pages, one more thread is needed
    if thrs * works < pages:
        thrs += 1
    root_path = util.JarProjectPath.project_root_path('py')
    filepath = root_path + f'files/{movie_name}.txt'
    print(f'{total} comments in total, {size} per page, {pages} pages to crawl, [{thrs}] threads picked at random, each thread crawls [{works}] pages')
    # multi-threaded crawl
    l = []
    # threads are numbered starting at 1
    for i in range(1, thrs + 1):
        if i == thrs and pages % works != 0:
            # the last thread must not go past the crawlable pages
            t = threading.Thread(target=save_batch, args=(i, works * (i - 1), works * (i - 1) + pages % works, filepath))
        else:
            t = threading.Thread(target=save_batch, args=(i, works * (i - 1), works * i, filepath))
        l.append(t)
        print("Thread {} started".format(i))
        t.start()
    for p in l:
        p.join()
    print("All threads finished, crawl complete")
    print(f'{total} comments in total, {size} per page, {pages} pages to crawl, [{thrs}] threads picked at random, each thread crawls [{works}] pages')
# Crawl comments for a given movie keyword
def scrawl_mv(keyword):
    try:
        mv = get_movies(keyword)
    except SkipException as e:
        print(e)
        return  # no match found, nothing to crawl
    movie_id = mv['id']
    movie_name = mv['nm']
    print(type(mv), movie_id, movie_name)
    url = f'http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1'
    print(f'Crawling --- [{movie_name}] ==== page one ======= {url}')
    do_scrawl(url, movie_name)
# Worker for the multi-threaded crawl
def save_batch(no, start, end, filepath):
    # The endpoint returns items 1-15 for page one and 2-16 for page two, which duplicates data,
    # so the offset is advanced by 15 on each loop iteration.
    size = 15  # fixed page size
    for i in range(start, end):
        # be gentle with the anti-crawler measures
        time.sleep(1 + float(random.randint(1, 100)) / 20)
        # note: the movie id 1263235 is hard-coded here
        url = f'http://api.maoyan.com/mmdb/comments/movie/1263235.json?_v_=yes&offset={i*size+1}'
        print(f'Thread.{no} >>> saving --- {url}')
        try:
            save_to_txt(url, filepath)
        except SkipException as obj:
            continue
# Test helper
@util.run_time
def thread_test(movie_name):
    # total number of comments
    total = 1530
    # comments per page returned by the endpoint
    size = 15
    # number of pages, rounded
    pages = round(total / size)
    # pages per thread
    works = 50
    # maximum number of threads (one extra if the pages don't divide evenly)
    r = int(pages / works) + (1 if pages % works > 0 else 0)
    print(f'{total} comments in total, {size} per page, {pages} pages to crawl, each thread set to crawl [{works}] pages, so [{r}] threads are needed')
    root_path = util.JarProjectPath.project_root_path('py')
    filepath = root_path + f'files/{movie_name}.txt'
    # l = []
    # for i in range(1, r+1):
    #     if i == r:
    #         # the last thread must not go past the crawlable pages
    #         t = ThreadCrawl(str(i), save_batch, works*(i-1), works*(i-1) + pages % works, filepath)
    #     else:
    #         t = ThreadCrawl(str(i), save_batch, works*(i-1), works*i, filepath)
    #     l.append(t)
    #     t.start()
    # for p in l:
    #     p.join()
    print("All threads finished, crawl complete")
class ThreadCrawl(threading.Thread):
    """
    :param thread_name: thread name
    :param func: the function the thread should run
    """
    def __init__(self, thread_name, func, *args):
        # call the parent initializer
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        # store the function and its arguments; run() calls them
        self.func = func
        self.args = args
        print('Thread initialised', *args)

    def run(self):
        print(f'Thread {self.threadName}: ************ started ************')
        self.func(*self.args)
if __name__ == '__main__':
    # print(get_movies('怒火·重案'))
    movie_name = '我的青春有个你'
    scrawl_mv(movie_name)
    filepath = f'files/{movie_name}.txt'
    if os.path.exists(filepath):
        print(os.path.abspath(filepath))
        maoyan_jieba.analysis(os.path.abspath(filepath))
Most of it is explained by the comments, haha.
After saving the txt, I went on to the word-cloud analysis and the geo distribution of comments, roughly like that. The maoyan_jieba code mentioned above hasn't been posted yet.
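Until that post is up, here is a minimal sketch of what an analysis(filepath) function in maoyan_jieba could look like, assuming the jieba and wordcloud packages are installed; the column parsing, font path and output file name are my own assumptions, not the original code.

import os
import jieba
from wordcloud import WordCloud

def analysis(filepath):
    # hypothetical sketch: cut the saved comments with jieba and render a word cloud
    # each txt line is: date,nickname,city,rate,comment (see save_to_txt above)
    comments = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split(',', 4)
            if len(parts) == 5:
                comments.append(parts[4])
    # cut everything into words and keep those longer than one character
    words = [w for w in jieba.cut(''.join(comments)) if len(w) > 1]
    # font_path must point to a font with Chinese glyphs; this Windows path is an assumption
    wc = WordCloud(font_path='C:/Windows/Fonts/simhei.ttf',
                   width=800, height=600, background_color='white')
    wc.generate(' '.join(words))
    out = os.path.splitext(filepath)[0] + '_wordcloud.png'
    wc.to_file(out)
    print(f'word cloud saved to {out}')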