Approach 1: Crawl all Taobao food listings in one run
1. spider.py is as follows
__author__ = 'Administrator'
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.Chrome()
"""
To use PhantomJS instead of Chrome:
1. Install PhantomJS first.
2. Pass custom options; here images are not loaded and the disk cache is enabled:
   browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
3. Set the window size:
   browser.set_window_size(1400, 900)
"""

wait = WebDriverWait(browser, 10)  # explicit wait: poll up to 10 seconds for the target element

def search():
    # print('Searching...')  # helpful when debugging with PhantomJS
    try:
        browser.get('https://www.taobao.com')
        input1 = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))  # locate the search box
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))  # locate the search button
        # the search keyword comes from KEYWORD in the config file
        input1.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))  # total page count; selector obtained via right-click "Copy selector"
        # parse the first page of results
        get_products()
        return total.text

    except TimeoutException:
        return search()

# turn pages via the page-number input box
def next_page(page_number):
    # print('Turning to page', page_number)  # helpful when debugging with PhantomJS
    try:
        input1 = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
        )
        input1.clear()
        input1.send_keys(page_number)
        submit.click()
        # the selected page number gets highlighted, so use that to confirm the jump succeeded
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        # parse this page of results
        get_products()

    except TimeoutException:
        next_page(page_number)

# parse product information
def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],  # drop the trailing "人付款"
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        # save the record to MongoDB
        save_to_mongo(product)

# save a single result to MongoDB
def save_to_mongo(result):
    try:
        if db[MON_TABLE].insert(result):
            print('Saved to MongoDB', result)
    except Exception:
        print('Failed to save to MongoDB', result)

def main():
    try:
        # total is text such as "共 100 页,"
        total = search()
        total = int(re.compile(r'(\d+)').search(total).group(1))
        # crawl the remaining pages
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('Something went wrong')

    finally:
        browser.close()

if __name__ == '__main__':
    main()
2. config.py
__author__ = 'Administrator'
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MON_TABLE = 'product'

# PhantomJS options: skip images, enable the disk cache
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = '美食'
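Once the spider has finished, the stored records can be spot-checked directly with pymongo. Below is a minimal sketch, assuming MongoDB is reachable at the MONGO_URL from the config above and a reasonably recent pymongo (count_documents needs 3.7+); the field names match the product dict built in get_products().

import pymongo
from config import MONGO_URL, MONGO_DB, MON_TABLE

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MON_TABLE]

# how many products were saved, plus a few sample records
print('total products:', collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc['title'], doc['price'], doc['location'])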
Approach 2: The method above runs correctly in testing, but it crawls everything in a single run, which yields a large volume of data and offers no flexible control over what is fetched. The code below works as follows:
1. Put the search keyword directly into the URL
2. Crawl the product listings page by page
3. Use Chrome's headless mode
import pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
from config import *
from urllib.parse import quote

# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)

wait = WebDriverWait(browser, 10)
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def index_page(page):
    """
    Crawl a search-results (index) page
    :param page: page number
    """
    print('Crawling page', page)
    try:
        url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
        browser.get(url)
        if page > 1:
            # locate the page-number input box
            input = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
            # locate the "confirm" button next to the page-number box
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
            input.clear()
            input.send_keys(page)
            submit.click()

        """
        Verify that we landed on the requested page:
        it is enough to check that the currently highlighted page number equals the requested one.
        The wait condition text_to_be_present_in_element succeeds once the given text appears in the given node,
        so we pass it the CSS selector of the highlighted page node together with the target page number.
        """
        wait.until(
            EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))

        # wait for the product list to load; '.m-itemlist .items .item' matches each product entry,
        # and once it is present, call get_products()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
        get_products()
    except TimeoutException:
        index_page(page)

# parse the product list
def get_products():
    """
    Extract product data
    """
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),  # number of deals
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)

def save_to_mongo(result):
    """
    Save a result to MongoDB
    :param result: the product dict
    """
    try:
        if db[MONGO_COLLECTION].insert(result):
            print('Saved to MongoDB')
    except Exception:
        print('Failed to save to MongoDB')

def main():
    """
    Crawl every page from 1 to MAX_PAGE
    """
    for i in range(1, MAX_PAGE + 1):
        index_page(i)
    browser.close()


if __name__ == '__main__':
    main()
The corresponding config file is as follows:
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'
KEYWORD = 'ipad'
MAX_PAGE = 100
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
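One compatibility note: webdriver.PhantomJS and the chrome_options= keyword used above were deprecated and later removed in Selenium 4. A minimal sketch of the equivalent headless setup under Selenium 4 (assuming Selenium 4.6+ and a recent Chrome, so chromedriver is resolved automatically by Selenium Manager) would look like this:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')  # new headless mode for Chrome 109+; use '--headless' on older Chrome
browser = webdriver.Chrome(options=options)  # Selenium 4 takes options=, not chrome_options=

browser.get('https://s.taobao.com/search?q=ipad')
print(browser.title)
browser.quit()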