python 豆瓣电影 python豆瓣电影爬虫

转载

互联网小思悟 2023-12-04 18:49:53

文章标签 python 豆瓣电影 python 豆瓣爬虫编程 文章分类 Python 后端开发

前言

作为本人的第一篇博客，不知道怎么写才好。同时作为一个编程的菜鸟，第一次分享自己的练习之作，希望能够通过写博客这种方式慢慢的提高自己的表述能力。
本人最近学习python，学习了基本的语法，就想着做点东西练练手，反正大家说起python第一反应都是爬虫，那我就做一个爬虫吧。
本人经常上豆瓣看各类电影的评分，从中选出自己想看的电影，但是豆瓣提供的几种筛选方式都不能满足我的需求，所以打算做一个爬虫爬取豆瓣的电影，顺便再根据这些数据做一个电影筛选程序，并提供一个GUI界面。

第一个类Spider

首先编写了一个类Spider，它的主要功能是将豆瓣年份标签页（形如http://www.douban.com/tag/2015/movie?start=0的url）指向的网页抓取下来，然后分析html并提取其中的电影名字和url，保存到文本文件中。代码如下:

#coding=utf-8
"""
Created on 2015年12月31日
python的版本是3.4
@author: ypk
本爬虫主要是通过豆瓣的年份标签获取都有哪些电影及url并存储，然后再获取这些电影的具体网页并分析，最后提供GUI界面进行查找相关电影。本爬虫用到了BeautifulSoup这个包，必须先引入这个包。
这个类主要是先调用startSpiderHtml，从豆瓣http://www.douban.com/tag/2015/movie?start=0类似网页下载下来，
接下来调用startSpiderMovie，分析之前下载html文件，将其中的电影名字及url提取出来并保存到文本中。
"""
from bs4 import BeautifulSoup  
import urllib.request;
import urllib.error;
import string;
import socket
import time;
from email._header_value_parser import Header
import os
class Spider:
    """Download Douban year-tag listing pages and extract movie names/URLs.

    Typical use: call ``startSpiderHtml`` to save the listing pages as HTML
    files, then ``startSpiderMovie`` to parse the saved files and append each
    movie's URL and name to a text file.
    """

    def getHtml(self, url, retries=3):
        """Fetch ``url`` and return the response body as bytes.

        Args:
            url: Address to download.
            retries: Extra attempts made after a timeout. The original code
                retried by unbounded recursion; the retry count is now capped.

        Returns:
            The raw body, or ``False`` on an HTTP error or when every attempt
            timed out.
        """
        for _ in range(retries + 1):
            try:
                with urllib.request.urlopen(url, timeout=10) as page:
                    return page.read()
            except urllib.error.HTTPError:
                return False
            except socket.timeout:
                continue
            except urllib.error.URLError as err:
                # Connection-phase timeouts arrive wrapped in URLError.
                if isinstance(err.reason, socket.timeout):
                    continue
                raise
        return False

    def saveHtml(self, html, filename):
        """Write the bytes ``html`` to ``filename``, replacing any old file."""
        with open(filename, "wb") as out:
            out.write(html)

    def saveMovieUrl(self, html, outfile="./1.txt"):
        """Append the URL and name of every movie found in ``html``.

        Each ``<dl>`` element is expected to hold the movie link in
        ``<dd><a href=...>name</a></dd>``. For every such entry two lines are
        appended to ``outfile``: the URL, then the movie name. Entries
        without that structure are skipped.

        Args:
            html: Markup of one listing page.
            outfile: Text file to append to.

        Returns:
            Always ``True``.
        """
        soup = BeautifulSoup(html, "html.parser")
        with open(outfile, 'a', encoding="UTF-8") as out:
            for entry in soup.find_all('dl'):
                link = entry.dd.a if entry.dd is not None else None
                if link is None or not link.has_attr('href'):
                    continue
                # .string is None when the anchor has nested tags.
                title = link.string or link.get_text()
                print(link['href'])  # movie URL
                out.write(link['href'])
                out.write('\n')
                print(title)  # movie name
                out.write(title)
                out.write('\n')
        return True

    def startSpiderHtml(self, year, start, end_year, end_start):
        """Download listing pages, walking backwards through the years.

        Pages come from
        ``http://www.douban.com/tag/<year>/movie?start=<start>``. Crawling
        begins at ``year``/``start`` and stops once ``end_year`` has been
        reached and ``start`` is no longer below ``end_start``. Each page is
        saved as ``./<year>_<start>.html``.

        Raises:
            RuntimeError: If a listing page cannot be downloaded.
        """
        # NOTE(review): ProxyHandler keys are URL schemes ('http', 'https');
        # 'sock5' matches neither, so this proxy is presumably never used for
        # the http URLs below. Kept unchanged; confirm the intended setup.
        proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)
        # The start bound only applies to the final year; otherwise an empty
        # final year would push `year` below `end_year` and never terminate.
        while year > end_year or (year == end_year and start < end_start):
            url = "http://www.douban.com/tag/" + str(year) + "/movie?start=" + str(start)
            html = self.getHtml(url)
            if html is False:
                raise RuntimeError("failed to download " + url)
            soup = BeautifulSoup(html, "html.parser")
            if soup.dl is None:
                # No <dl> entries left: this year is exhausted.
                print(str(year) + "已经结束了~")
                year -= 1
                start = 0
                time.sleep(150)
                continue
            self.saveHtml(html, "./" + str(year) + "_" + str(start) + ".html")
            print(str(year) + "_" + str(start) + ".html已经下载")
            start += 15
            if start % 1200 == 0:
                # Pause periodically to avoid hammering the site.
                time.sleep(120)

    def startSpiderMovie(self, year, start, *end):
        """Parse saved listing pages and store the movie names and URLs.

        Reads ``./豆瓣电影列表/<year>_<start>.html`` files, advancing ``start``
        by 15 while files exist and moving to the previous year when one is
        missing.

        Args:
            year: First (latest) year to process.
            start: First page offset within that year.
            *end: Optional last year (exclusive); defaults to 1980.
        """
        end_year = end[0] if end else 1980
        while year > end_year:
            filename = "./豆瓣电影列表/" + str(year) + "_" + str(start) + ".html"
            if os.path.exists(filename):
                with open(filename, 'r', encoding="UTF-8") as page:
                    self.saveMovieUrl(page.read())
                start += 15
            else:
                print("~")
                year -= 1
                start = 0
#Spider().startSpiderMovie(2015, 0)
#Spider().saveHtml(Spider().getHtml("http://movie.douban.com/subject/25835474/?from=showing"),"./test.html")
#Spider().startSpiderHtml(1981,90,1980,1)

这个时候我们就得到了这个文本：

python 豆瓣电影 python豆瓣电影爬虫_python

这个文本文件中存储的是电影名字及url，这个url类似http://movie.douban.com/subject/26366465/?from=tag_all，其指向的网页中存放着电影的具体信息。

第二个类DetailSpider

接下来编写第二个类DetailSpider，这个类的功能第一是从之前的文本中获得电影名字及url，并将url指向的网页抓取下来；第二是分析抓取下来的这些网页，从中获得具体的电影信息，存储到文本中。代码如下：

#coding=utf-8
from Spider import Spider
from bs4 import BeautifulSoup  
import urllib.request
import re
from _codecs import encode
import time
import threading
class DetailSpider:
    """Download movie detail pages and extract their fields.

    Created on 2015-12-31, written for Python 3.4.
    @author: ypk

    Call ``getMovieHtml`` first: it reads the name/URL list produced by
    ``Spider`` and downloads each detail page. Then call ``getMovieDetail``
    on each downloaded page to parse the movie information and append it to
    a text file.
    """

    def getMovieHtml(self, filename, name, j, end=None):
        """Download the detail pages listed in ``filename``.

        Every fourth line of ``filename`` (line 0, 4, 8, ...) is treated as a
        URL line; the other lines are only echoed. Downloading starts at the
        URL equal to ``name`` and stops after the URL equal to ``end``.
        Pages are saved as ``./豆瓣电影详细信息/<j>.html`` with ``j``
        incremented per saved page.

        Args:
            filename: Text file holding the movie list.
            name: URL at which downloading starts.
            j: Number used for the first saved file.
            end: URL after which processing stops; ``None`` means run to the
                end of the file.
        """
        started = False
        with open(filename, 'r', encoding="UTF-8") as listing:
            for index, line in enumerate(listing):
                if index % 4 != 0:
                    print(line.strip())
                    continue
                match = re.search(r'h.*', line)
                if match is None:
                    # Not a URL line; nothing to download.
                    continue
                url = match.group()
                if url == name:
                    started = True
                if started:
                    html = Spider().getHtml(url)
                    if html:
                        Spider().saveHtml(html, "./豆瓣电影详细信息/" + str(j) + ".html")
                        print(url + " :" + str(j) + ".html已存储~")
                        j += 1
                        if j % 90 == 0:
                            # Pause after every 90 saved pages so requests
                            # are not too frequent.
                            time.sleep(120)
                if url == end:
                    return

    def getMovieDetail(self, filename, outfile="./movieDetail.txt"):
        """Parse one saved detail page and append its fields to ``outfile``.

        The record is a dict holding the movie name, the year, every
        ``key:value`` line of the ``info`` section and the rating figures of
        the ``interest_sectl`` section. Its ``str()`` form is appended to
        ``outfile`` as one line. Pages lacking the content ``div`` or the
        title ``span`` are skipped silently.

        Args:
            filename: Path of the saved HTML page.
            outfile: Text file the record is appended to.
        """
        with open(filename, 'r', encoding="UTF-8") as page:
            html = page.read()
        content = BeautifulSoup(html, "html.parser").find("div", id='content')
        if content is None:
            return
        title = content.find('span', property="v:itemreviewed")
        if title is None:
            return
        print(title.string)
        detail = {'name': title.string}  # movie name
        year = content.find('span', class_="year")
        if year is not None:
            year_text = year.string.strip('(').strip(')')
            print(year_text)
            detail['year'] = year_text  # approximate release year

        info = content.find(id='info')
        if info is not None:
            text = info.getText().replace(": ", ":").replace(":\n", ":")
            for line in text.split("\n"):
                # Quotes were stripped by the original implementation; kept
                # so new records match ones already written.
                line = line.replace("'", "")
                key, sep, value = line.partition(":")
                if line == '' or not sep:
                    continue
                detail[key] = value

        score = content.find(id="interest_sectl")
        if score is not None:
            tokens = score.getText().replace(" ", "").replace("\n", " ").replace("  ", " ").split()
            count = 0
            pending_key = None
            for token in tokens:
                if count == 2:
                    # The third token is stored as the number of raters.
                    detail["评价人数"] = token
                    count += 2
                    continue
                if re.match("好于", token):
                    # The "better than ..." comparisons are not stored.
                    break
                if count % 2 == 0:
                    pending_key = token
                else:
                    detail[pending_key] = token
                count += 1

        print(detail)
        with open(outfile, "a", encoding="UTF-8") as out:
            out.write(str(detail))
            out.write("\n")
#使用下面的循环，分析你抓取的下来的html页面
"""
start=0
while start<17093:
    print("start="+str(start))
    filename="./豆瓣电影详细信息/"+str(start)+".html"
    DetailSpider().getMovieDetail(filename)
    start+=1
"""
#使用下面的线程，从豆瓣上抓取html页面
#threading.Thread(target=DetailSpider().getMovieHtml,args=("./DouBanMovie.txt","http://movie.douban.com/subject/3526311/?from=tag_all",14573,"http://movie.douban.com/subject/5050645/?from=tag_all")).start()
#threading.Thread(target=DetailSpider().getMovieHtml,args=("./DouBanMovie.txt","http://movie.douban.com/subject/2282473/?from=tag_all",8392,"http://movie.douban.com/subject/1303280/?from=tag_all")).start()
#threading.Thread(target=DetailSpider().getMovieHtml,args=("./DouBanMovie.txt","http://movie.douban.com/subject/2346837/?from=tag_all",13088,"http://movie.douban.com/subject/5050645/?from=tag_all")).start()
#DetailSpider().getMovieHtml("./DouBanMovie.txt","http://movie.douban.com/subject/2252775/?from=tag_all",3254)

这样我们就获得了我们想要的电影信息，如图所示：

python 豆瓣电影 python豆瓣电影爬虫_python 豆瓣电影_02

可以说，到这里为止，豆瓣电影爬虫的工具基本都完成了，我们已经将我们想要的豆瓣电影数据获取下来了。接下来，就要编写一个类，能够对这个文本中的数据进行相应的查找。

第三个类Seek

第三个类Seek，能够通过('导演','编剧','演员','类型','时间','名字','得分','人数','国家')这些关键字查找相应的影片。代码如下：

#coding=utf-8
'''
Created on 2015年12月31日
python的版本是3.4
这个类通过调用movieSearch函数，获取具体电影信息，比如获取张艺谋的电影。
@author: ypk
'''
import re
class Seek:
    """Filter movie records stored one ``str(dict)`` per line in a text file.

    Every ``movieBy*`` method takes one record and one criterion and returns
    ``True`` or ``False``; a missing field or malformed value counts as "no
    match". The parameter names ``dict`` and ``type`` shadow builtins but
    are kept because callers pass them by keyword.
    """

    @staticmethod
    def _matchesPattern(record, key, pattern):
        """Return whether regex ``pattern`` is found in ``record[key]``."""
        try:
            return bool(re.search(pattern, record[key]))
        except (KeyError, TypeError, re.error):
            return False

    @staticmethod
    def _fieldParts(record, key):
        """Return the ``/``-separated parts of ``record[key]`` or ``[]``."""
        try:
            return [part.strip() for part in record[key].split('/')]
        except (KeyError, TypeError, AttributeError):
            return []

    def movieByDirector(self, dict, director):
        """Return whether the record's director field matches ``director``."""
        return Seek._matchesPattern(dict, "导演", director)

    def movieByAuthor(self, dict, author):
        """Return whether the record's screenwriter field matches ``author``."""
        return Seek._matchesPattern(dict, "编剧", author)

    def movieByActor(self, dict, actor):
        """Return whether the record's cast field matches ``actor``."""
        return Seek._matchesPattern(dict, "主演", actor)

    def movieByType(self, dict, type):
        """Return whether the record's genre field matches ``type``."""
        return Seek._matchesPattern(dict, "类型", type)

    def movieByYear(self, dict, year):
        """Return whether any ``/``-separated year equals ``str(year)``."""
        wanted = str(year)
        return any(part == wanted for part in Seek._fieldParts(dict, 'year'))

    def movieByName(self, dict, name):
        """Return whether the record's name field matches ``name``."""
        return Seek._matchesPattern(dict, "name", name)

    def movieByScore(self, dict, score):
        """Return whether the record's rating is at least ``score``."""
        try:
            minimum = float(score)
        except (TypeError, ValueError):
            return False
        for part in Seek._fieldParts(dict, '豆瓣评分'):
            try:
                if float(part) >= minimum:
                    return True
            except ValueError:
                continue
        return False

    def movieByNum(self, dict, num):
        """Return whether the record's number of raters is at least ``num``.

        Only the leading digits of the stored value are compared, so a value
        such as ``'269660人评价'`` is read as 269660.
        """
        try:
            minimum = int(num)
        except (TypeError, ValueError):
            return False
        for part in Seek._fieldParts(dict, '评价人数'):
            digits = re.match(r'\d*', part).group()
            if digits and int(digits) >= minimum:
                return True
        return False

    def movieByCountry(self, dict, country):
        """Return whether the record's country/region matches ``country``."""
        return Seek._matchesPattern(dict, "制片国家/地区", country)

    def movieSearch(self, filename, director, author, actor, type, year, name, score, num, country):
        """Return the lines of ``filename`` that satisfy every criterion.

        Criteria are given in this order: director, screenwriter, actor,
        genre, year, name, minimum score, minimum number of raters, country.
        A criterion equal to ``'*'`` is ignored. When every criterion is
        ``'*'`` the file is not read and an empty list is returned.

        Returns:
            The matching raw lines, in file order, including their trailing
            newline.
        """
        import ast  # local import: parse records without executing code

        criteria = (
            (director, self.movieByDirector),
            (author, self.movieByAuthor),
            (actor, self.movieByActor),
            (type, self.movieByType),
            (year, self.movieByYear),
            (name, self.movieByName),
            (score, self.movieByScore),
            (num, self.movieByNum),
            (country, self.movieByCountry),
        )
        active = [(value, check) for value, check in criteria if value != '*']
        if not active:
            return []

        matches = []
        with open(filename, 'r', encoding="UTF-8") as records:
            for line in records:
                if not line.strip():
                    continue
                record = ast.literal_eval(line.strip())
                if all(check(record, value) for value, check in active):
                    matches.append(line)
        return matches
# Demonstrates how to use the class above: print every stored movie whose
# director matches "张艺谋". Guarded so that importing this module (as the
# GUI script does) does not run the search.
if __name__ == "__main__":
    for s in Seek.movieSearch(self=Seek(),filename="./movieDetail.txt", director="张艺谋", author="*", actor="*", type="*", year="*", name="*", score="*", num="*", country="*"):
        print(s)
#dict=eval（"{'上映日期': '2011-12-15(中国大陆) / 2011-12-23(美国)', '又名': 'The Flowers Of War', 'IMDb链接': 'tt1410063', '主演': '克里斯蒂安·贝尔 / 倪妮 / 张歆怡 / 黄天元 / 韩熙庭 / 张逗逗 / 佟大为 / 曹可凡 / 渡部笃郎 / 黄海波 / 窦骁 / 聂远 / 高虎 / 秦昊 / 李玥敏 / 白雪 / 袁杨纯子 / 孙佳 / 朱良奇 / 小林成男 / 山中崇 / 保罗·施耐德', '4星': '44.7%', '3星': '20.6%', 'year': '2011', '类型': '剧情 / 历史 / 战争', '评价人数': '269660人评价', '豆瓣评分': '8.0', 'name': '金陵十三钗', '片长': '145分钟', '制片国家/地区': '中国大陆 / 香港', '官方小站': '金陵十三钗 ', '语言': '汉语普通话 / 英语 / 日语 / 南京话 / 上海话', '导演': '张艺谋', '5星': '30.4%', '2星': '3.0%', '编剧': '刘恒 / 严歌苓', '1星': '1.4%'}")
#print(dict)
#print(dict['导演']=='张艺谋')

#print(Seek.movieByDirector(Seek(),dict, "张艺谋"))

最后一个模块

最后，我使用Tkinter这个GUI库编写了一个简单的界面，效果如下图：

python 豆瓣电影 python豆瓣电影爬虫_爬虫_03

具体的代码如下：

#coding=utf-8
'''
Created on 2015年12月31日
python的版本是3.4
设置GUI界面，提供图形化界面
@author: ypk
'''
import tkinter as tk
import tkinter.ttk as ttk
from Seek import Seek
import time
# Entry widgets, filled in while the form is built: one per label in `s`,
# in the same order.
t = []
# Search-field labels; also used as the result table's column identifiers.
s = (
    '导演', '编剧', '演员', '类型', '时间',
    '名字', '得分', '人数', '国家',
)
def search():
    """Run a search with the values typed into the form and show the hits.

    Clears the result table, reads each entry widget in ``t`` (an empty
    entry becomes the wildcard ``"*"``), queries ``./movieDetail.txt``
    through ``Seek.movieSearch`` and inserts one table row per match.
    """
    import ast  # local import: parse stored records without executing code

    for row in tree.get_children():
        tree.delete(row)
    print("shanchu")

    # Map every field label to the typed text, or "*" when left blank.
    criteria = {}
    for label, entry in zip(s, t):
        text = entry.get().strip()
        criteria[label] = text if text != "" else "*"

    matches = Seek.movieSearch(
        self=Seek(),
        filename="./movieDetail.txt",
        director=criteria["导演"],
        author=criteria["编剧"],
        actor=criteria["演员"],
        type=criteria["类型"],
        year=criteria["时间"],
        name=criteria["名字"],
        score=criteria["得分"],
        num=criteria["人数"],
        country=criteria["国家"],
    )
    for line in matches:
        print(line)
        if line is not None:
            record = ast.literal_eval(line.strip())
            tree.insert("", 'end', values=(
                record.get('导演'),
                record.get('编剧'),
                record.get('主演'),
                record.get('类型'),
                record.get('year'),
                record.get('name'),
                record.get('豆瓣评分'),
                record.get('评价人数'),
                record.get('制片国家/地区'),
            ))

    print("hi there,everyone")
# Main window.
top = tk.Tk()
top.title("豆瓣电影筛选器")

# Left pane: one "label + entry" row per search field, then the button
# that triggers search().
form = tk.Frame(top, height=100, width=100)
for caption in s:
    row = tk.Frame(form)
    tk.Label(row, text=caption, fg='green').pack(side='left')
    entry = tk.Entry(row, width=20, highlightcolor='red')
    t.append(entry)
    entry.pack(side='right', ipady=3)
    row.pack()
tk.Button(form, text="查找", command=search).pack(side='top')
form.pack(side='left')

# Right pane: result table with one column per search field.
results_pane = tk.Frame(top, height=240, width=300)
tree = ttk.Treeview(results_pane, show="headings", columns=s, height=15)
for caption in s:
    tree.column(caption, width=80, anchor='center')
    tree.heading(caption, text=caption)
tree.pack()
results_pane.pack(side='right')

# Hand control to Tk's event loop.
top.mainloop()

到此，这个爬虫的文件都编写完成了。那么总结一下这个爬虫的缺点：
1.由于豆瓣对于每个标签的限制，使得在这个标签下查找的页数不能够超过35页，比如说这个http://www.douban.com/tag/2014/movie?start=0链接，你选择35页之后，就没有数据了。所以我们获取的数据不全面。
解决办法：我没有去实现，但是我的想法是除了通过时间这个标签，还可以通过豆瓣的其他标签，例如喜剧、爱情等等。通过不同的标签获取数据，相同的删除，不同的保存，这样能够最大限度地获取数据。
2.代码写的很乱，函数名起的很差，作为编程菜鸟需要的还是不断努力与提高。
3.效率问题，DetailSpider这个类中爬取网页部分比较慢，而且防止豆瓣检测爬虫采用的方法是爬取一部分数据就间隔一下。其实应该采取代理措施，最起码也应该换一下Header部分。