I've been learning web scraping recently, using the BeautifulSoup4 library. The plan was to scrape the names and links of the movies on Youku and save them to a text file. A fairly simple task, and my first scraper. The code is below for reference:
# coding:utf-8

import os
import re
import time

import requests
from bs4 import BeautifulSoup

'''Scrape movies from the Youku site: http://www.youku.com/ '''

url = "http://list.youku.com/category/show/c_96_s_1_d_1_u_1.html"
h = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0"}
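# (Note) Without a browser-like User-Agent, some sites serve a stripped-down
# page or reject the request outright, so the header above imitates Firefox 58.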
# Save the results to a text file in the "movie" folder
def write_movie():
    currentPath = os.path.dirname(os.path.realpath(__file__))
    #print(currentPath)
    moviePath = os.path.join(currentPath, "movie", "youku_movie_address.text")
    #print(moviePath)
    # Make sure the folder exists first; open() will not create it
    os.makedirs(os.path.dirname(moviePath), exist_ok=True)
    fp = open(moviePath, encoding="utf-8", mode="a")

    for x in list_a:
        text = x.get_text()
        # Anchors with no visible text are the thumbnail links; they carry
        # the movie title in their "title" attribute
        if text == "":
            try:
                fp.write(x["title"] + ": " + x["href"] + "\n")
            except IOError as msg:
                print(msg)

    fp.write("-------------------------------over-----------------------------" + "\n")
    fp.close()

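# (Note) write_movie() reads the module-level list_a that is reassigned below
# for every page; passing the links in as a parameter would make it self-contained.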
# First page
res = requests.get(url, headers=h)
print(res.url)
soup = BeautifulSoup(res.content, 'html.parser')
list_a = soup.find_all(href=re.compile(r"==\.html"), target="_blank")
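# (Note) The filter above keeps only <a target="_blank"> tags whose href
# contains "==" immediately before ".html"; on these list pages that pattern
# picks out the movie detail links. The dot is escaped so the regex matches
# a literal "." rather than any character.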
write_movie()

for num in range(2, 1000):

    # Grab the href attribute of the "next page" link
    fanye_a = soup.find(charset="-4-1-999")
    if fanye_a is None:
        # No "next page" link means we hit the last page
        break
    fanye_href = fanye_a["href"]
    print(fanye_href)
    # Request the page (the href is protocol-relative, so prepend "http:")
    ee = requests.get("http:" + fanye_href, headers=h)
    time.sleep(3)
    print(ee.url)

    soup = BeautifulSoup(ee.content, 'html.parser')
    list_a = soup.find_all(href=re.compile(r"==\.html"), target="_blank")

    # Call the write function
    write_movie()
    time.sleep(6)
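One thing I might clean up later: write_movie depends on the global list_a and leaves the file handle open if a write raises. Below is a minimal sketch of a safer variant (my own idea, untested against the live site; write_movie_v2 and its parameters are names I made up): it takes the links and the target path as arguments, uses a with block so the file is always closed, and resolves the protocol-relative "next page" href with urllib.parse.urljoin instead of string concatenation.

import os
from urllib.parse import urljoin

def write_movie_v2(links, movie_path):
    # Create the target folder if needed (harmless when it already exists)
    os.makedirs(os.path.dirname(movie_path) or ".", exist_ok=True)
    # "with" closes the file even if a write raises an exception
    with open(movie_path, mode="a", encoding="utf-8") as fp:
        for a in links:
            # Same filter as above: thumbnail anchors have empty text and
            # carry the movie title in their "title" attribute
            if a.get_text() == "" and a.has_attr("title") and a.has_attr("href"):
                fp.write(a["title"] + ": " + a["href"] + "\n")
        fp.write("-" * 65 + "\n")

# urljoin resolves protocol-relative hrefs such as "//list.youku.com/...":
#     next_url = urljoin(res.url, fanye_href)   # instead of "http:" + fanye_href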
The text content of the txt file after the run: