python 抓取电影天堂电影信息放入数据库

精选原创

IT阿泽 2018-02-27 11:31:55 博主文章分类：python ©著作权

文章标签 python mysql 电影 文章分类 Python 后端开发

©著作权归作者所有：来自51CTO博客作者IT阿泽的原创作品，请联系作者获取转载授权，否则将追究法律责任

# coding:utf-8
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import urllib2
import re
import json
import chardet
import pymysql
# url = "http://dytt8.net/"
# page = requests.get(url).content
# page_html = BeautifulSoup(page,'lxml')

# name = page_html.select("td.inddline > a:nth-of-type(2)")
# for n in name:
#     if 'dyzz' in n.encode('gbk'):
#         print n.encode('gbk')
#         file = open("move.txt","a+")
#         file.write(n.encode('utf-8')+'\n')
#         file.close()


# Ordered (result key, page label) pairs for the fields whose value runs from
# the label up to the next "<br/>" in the serialized detail page.
_MOVE_FIELD_LABELS = (
    ('yiming', "译　　名"),
    ('leibie', "类　　别"),
    ('yuyan', "语　　言"),
    ('zimu', "字　　幕"),
    ('date', "上映日期"),
    ('douban', "豆瓣评分"),
    ('pianchang', "片　　长"),
    ('daoyan', "导　　演"),
    ('zhuyan', "主　　演"),
)


def _extract_between(html, label, terminator):
    """Return the text between the first `label` and the next `terminator`, or ''."""
    found = re.search(re.escape(label) + "(.*?)" + re.escape(terminator), html)
    return found.group(1) if found else ''


def getmoveinfo(url, timeout=30):
    """Download one movie detail page and return its fields as a dict.

    Args:
        url: Address of the detail page to fetch.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the crawl indefinitely.

    Returns:
        A dict with the keys ``title``, ``yiming``, ``leibie``, ``yuyan``,
        ``zimu``, ``date``, ``douban``, ``pianchang``, ``daoyan``, ``zhuyan``,
        ``jianjie`` and ``addres``.  Any field that cannot be located on the
        page is returned as an empty string.  ``title`` and ``addres`` are
        UTF-8 encoded; the remaining values are substrings of
        ``str(page_html)``.

    Raises:
        Whatever ``requests.get`` raises for network failures or timeouts.
    """
    page = requests.get(url, timeout=timeout).content
    page_html = BeautifulSoup(page, 'lxml')

    # Serialize the parsed document once; every field regex runs against this
    # same string instead of re-serializing the tree per lookup.
    html = str(page_html)

    res = {}

    # The title is the first child of the first <font color="#07519a">.
    # Fall back to '' (like every other field) when the page lacks it.
    fonts = page_html.find_all("font", attrs={"color": "#07519a"})
    if fonts and fonts[0].contents:
        res['title'] = fonts[0].contents[0].encode("utf-8")
    else:
        res['title'] = ''

    for key, label in _MOVE_FIELD_LABELS:
        res[key] = _extract_between(html, label, "<br/>")

    # The synopsis runs up to the download-section heading and may contain
    # <br/> tags of its own, which are stripped from the stored text.
    # NOTE(review): "." does not match newlines here, so a synopsis that is
    # split across several lines in the serialized markup yields ''.
    res['jianjie'] = _extract_between(html, "简　　介", "【下载地址】").replace("<br/>", "")

    # The download link is the first anchor with an href inside the first
    # highlighted (bgcolor #fdfddf) table cell.  Searching for the anchor,
    # rather than assuming it is the cell's first child, tolerates leading
    # whitespace/text nodes and anchors without an href.
    cell = page_html.find("td", attrs={"bgcolor": "#fdfddf"})
    link = cell.find("a", href=True) if cell is not None else None
    res['addres'] = link.get("href").encode('utf-8') if link is not None else ''

    return res


# Crawl the dytt8.net front page and store every linked movie in MySQL.
url = "http://dytt8.net/"
# A timeout keeps an unresponsive server from hanging the script forever.
page = requests.get(url, timeout=30).content
page_html = BeautifulSoup(page,'lxml')

# The second anchor inside each "inddline" table cell.
name = page_html.select("td.inddline > a:nth-of-type(2)")

# Column order of move_info; the keys of the dict returned by getmoveinfo()
# use the same names, so one tuple drives both the SQL text and the values.
columns = ('title', 'yiming', 'leibie', 'yuyan', 'zimu', 'date', 'douban',
           'pianchang', 'daoyan', 'zhuyan', 'jianjie', 'addres')
# Placeholders instead of string formatting: the values are scraped text and
# may contain quotes, which would break (or inject into) a hand-built query.
insert_sql = "INSERT INTO move_info({0})VALUES({1});".format(
    ",".join(columns), ",".join(["%s"] * len(columns)))

conn = pymysql.connect(host='localhost',port=3306,user='root',password='root',db='moves',charset='utf8')
try:
    cursor = conn.cursor()
    try:
        for n in name:
            # Only links whose target contains "dyzz" are followed.  Testing
            # the href itself (not the encoded markup of the whole tag) also
            # skips anchors without an href, which could not be turned into
            # a URL below.
            href = n.get("href")
            if not href or 'dyzz' not in href:
                continue
            info = getmoveinfo("http://dytt8.net" + href)
            cursor.execute(insert_sql, tuple(info[column] for column in columns))
            # Commit per movie so rows already stored survive a later failure.
            conn.commit()
    finally:
        cursor.close()
finally:
    # Release the connection even when a fetch or insert raises.
    conn.close()
print('ok')