import requests
from urllib import error
import re
import json
import time

def getHTML(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any request failure).
    """
    # Browser-like User-Agent; it must actually be sent with the request,
    # otherwise the default python-requests agent is used.
    header = {
        'User-Agent': 'Mozilla/5.0(Macintosh;Intel Mac OS X 10_13_3) AppleWebKit/537.36(KHTML,like Gecko) Chrome/'
                      '65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=header, timeout=timeout)
    except requests.RequestException as e:
        # requests raises its own exception hierarchy (not urllib's URLError).
        print(e)  # report why the request failed
        return None
    if response.status_code == 200:  # request succeeded
        return response.text
    return None


# Compiled once at import time; matches one <dd>...</dd> movie entry and
# captures: rank, poster URL, title, star line, release line, and the
# integer / fraction halves of the score.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.'
    r'*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string (e.g. a
            failed download) yields nothing instead of raising.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; all values are strings.
    """
    if not html:
        return
    for index, image, title, star, release, integer, fraction in _MOVIE_PATTERN.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drop the first 3 characters of the stripped star text
            # (the leading label, e.g. "主演：").
            'actor': star.strip()[3:],
            # Drop the first 5 characters of the stripped release text
            # (the leading label, e.g. "上映时间：").
            'time': release.strip()[5:],
            # Score is rendered as two elements: integer part and fraction.
            'score': integer + fraction,
        }


def write_to_file(txt, filename='movie.txt'):
    """Append *txt* to *filename* as a single JSON line.

    Args:
        txt: JSON-serialisable object (the scraper passes one movie dict).
        filename: Destination file, opened in append mode as UTF-8.
            Defaults to ``'movie.txt'`` in the current directory.
    """
    with open(filename, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese text readable instead of
        # emitting \uXXXX escapes.
        f.write(json.dumps(txt, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    # The board is paginated 10 entries per page; walk offsets 0, 10, ..., 90.
    for i in range(10):
        j = i * 10  # page offset
        url = "https://maoyan.com/board/4?offset=" + str(j)  # page address
        html = getHTML(url)  # download the page markup
        # Pause one second between requests: the site has anti-scraping
        # measures and stops responding when hit too quickly.
        time.sleep(1)
        if html is None:
            # Download failed; skip this page rather than crash the whole run.
            print('skip offset ' + str(j) + ': no page content')
            continue
        for item in parse_one_page(html):  # one dict per movie
            print(item)  # show it on stdout
            write_to_file(item)  # append it to the output file