1 lxml 是比 BeautifulSoup 速度更快的解析库,使用的是 XPath 语法,来个例子:
from lxml import etree
import requests
import csv

# Scrape the Douban book Top 250 listing (start=0, 25, ..., 225) and write one
# CSV row per book.
urls = ['https://book.douban.com/top250?start={}'.format(str(i)) for i in range(0,250,25)]

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# The context manager closes the file even when a request or an XPath lookup
# raises part-way through the crawl.
with open('d://doubanbook.csv','wt',newline='',encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(('name', 'url', 'author', 'publisher', 'date', 'price', 'rate', 'comment'))

    for url in urls:
        html = requests.get(url,headers=headers)
        selector = etree.HTML(html.text)
        for info in selector.xpath('//tr[@class="item"]'):
            name = info.xpath('td/div/a/@title')[0]
            # Named book_url so it does not overwrite the page-level loop variable.
            book_url = info.xpath('td/div/a/@href')[0]
            # The info line is '/'-separated: the author is the first field and
            # publisher / date / price are the last three, so they are indexed
            # from the end (there may be extra fields in between).
            fields = info.xpath('td/p/text()')[0].split('/')
            author = fields[0]
            publisher = fields[-3]
            date = fields[-2]
            price = fields[-1]
            rate = info.xpath('td/div/span[2]/text()')[0]
            # Not every row has a one-line comment; fall back to a placeholder.
            comments = info.xpath('td/p/span/text()')
            comment = comments[0] if comments else "空"
            writer.writerow((name,book_url,author,publisher,date,price,rate,comment))

注意:写成CSV后,要先用记事本打开,另存为UTF-8格式,之后才能正常打开;

2) 针对EXCEL的写入(使用xlwt)
import xlwt
import requests
from lxml import etree
import time

# Rows collected by get_info(); each entry is
# [title, author, style, complete, introduce, word].
all_info_list = []


def get_info(url):
    """Fetch one listing page and append one row per book to all_info_list.

    Every <li> under the "all-img-list cf" list is read with relative XPath
    queries; the first match of each query is used, so a missing field raises
    IndexError. The function sleeps for 5 seconds before returning.
    """

    def first(node, path):
        # First match of a relative XPath query on this node.
        return node.xpath(path)[0]

    response = requests.get(url)
    tree = etree.HTML(response.text)
    for entry in tree.xpath('//ul[@class="all-img-list cf"]/li'):
        title = first(entry, 'div[2]/h4/a/text()')
        author = first(entry, 'div[2]/p[1]/a[1]/text()')
        # The two category links are joined into a single "style" column.
        style = '·'.join([
            first(entry, 'div[2]/p[1]/a[2]/text()'),
            first(entry, 'div[2]/p[1]/a[3]/text()'),
        ])
        complete = first(entry, 'div[2]/p[1]/span/text()')
        introduce = first(entry, 'div[2]/p[2]/text()').strip()
        # strip('万字') removes the characters 万 and 字 from both ends.
        word = first(entry, 'div[2]/p[3]/span/span/text()').strip('万字')
        all_info_list.append([title, author, style, complete, introduce, word])
    time.sleep(5)

if __name__ == '__main__':
    # Crawl the listing pages (range(1,2) means page 1 only), then dump the
    # collected rows into an .xls workbook.
    urls = ['http://a.qidian.com/?page={}'.format(str(i)) for i in range(1,2)]
    for url in urls:
        get_info(url)

    header = ['title','author','style','complete','introduce','word']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')

    # Row 0 holds the column captions.
    for col, caption in enumerate(header):
        sheet.write(0, col, caption)

    # Data rows start at row 1. enumerate() replaces the hand-maintained i/j
    # counters and avoids shadowing the builtin `list`.
    for row_idx, row in enumerate(all_info_list, start=1):
        for col, value in enumerate(row):
            sheet.write(row_idx, col, value)

    book.save('xiaoshuo.xls')

3 selenium 和phantomjs 配合使用,比如登录网页
from selenium import webdriver
# Log in to douban.com through a headless PhantomJS browser and print the
# page source that results from submitting the form.
driver = webdriver.PhantomJS()
try:
    driver.get('https://www.douban.com/')
    driver.implicitly_wait(10)

    # Look each input up once and reuse the element for clear() + send_keys().
    email_box = driver.find_element_by_id('form_email')
    email_box.clear()
    email_box.send_keys('用户名')

    password_box = driver.find_element_by_id('form_password')
    password_box.clear()
    password_box.send_keys('密码')

    driver.find_element_by_class_name('bn-submit').click()
    print(driver.page_source)
finally:
    # Always shut the browser process down, even if a lookup above fails.
    driver.quit()
对于AJAX动态加载的页面,也可以用selenium轻松抓取,不用做逆向工程,
比如爬QQ空间的说说:
from selenium import webdriver
import time
import csv
#import pymongo

#client = pymongo.MongoClient('localhost', 27017)
#mydb = client['mydb']
#qq_shuo = mydb['qq_shuo']

# One shared headless browser, reused for every QQ number.
driver = webdriver.PhantomJS()
driver.maximize_window()


def get_info(qq):
    """Open the Qzone feed page of `qq`, log in if prompted, and print each post.

    The presence of the 'login_div' element decides whether the login form
    has to be filled in; the presence of 'QM_OwnerInfo_Icon' decides whether
    the feed is read at all. When it is, the text of every '.content' element
    in the 'app_canvas_frame' frame is printed.
    """
    # Function-scope import: selenium is already a dependency of this script,
    # and only this function needs the exception type.
    from selenium.common.exceptions import NoSuchElementException

    def is_present(element_id):
        # Only "element not found" means absent; any other error (driver
        # crash, Ctrl-C, ...) now propagates instead of being swallowed by a
        # bare `except:`.
        try:
            driver.find_element_by_id(element_id)
        except NoSuchElementException:
            return False
        return True

    driver.get('http://user.qzone.qq.com/{}/311'.format(qq))
    driver.implicitly_wait(10)

    if is_present('login_div'):
        # The login form lives inside its own frame.
        driver.switch_to.frame('login_frame')
        driver.find_element_by_id('switcher_plogin').click()
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys('XXXX')
        driver.find_element_by_id('p').clear()
        driver.find_element_by_id('p').send_keys('XXXX')
        driver.find_element_by_id('login_button').click()
        time.sleep(5)

    # Shorter wait for the second probe.
    driver.implicitly_wait(3)
    if is_present('QM_OwnerInfo_Icon'):
        driver.switch_to.frame('app_canvas_frame')
        contents = driver.find_elements_by_css_selector('.content')
        times = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')
        for content, tim in zip(contents, times):
            # Record shape used by the (disabled) MongoDB insert below.
            data = {
                'time': tim.text,
                'content': content.text
            }
            print(content.text)
            # qq_shuo.insert_one(data)

if __name__ == '__main__':
    # Read the QQ numbers from the exported contact list (the part of each
    # e-mail address before '@'), then crawl them one by one.
    try:
        # `with` closes the CSV file even if a row lacks the expected column.
        with open('C:/Users/lyr/Downloads/QQmail.csv') as fp:
            qq_lists = [row['电子邮件'].split('@')[0] for row in csv.DictReader(fp)]
        for item in qq_lists:
            get_info(item)
    finally:
        # Shut the shared PhantomJS process down once the crawl ends or fails.
        driver.quit()


4 selenium 配合 lxml 抓取淘宝商品信息:
from selenium import webdriver
from lxml import etree
import time
#import pymongo

#client = pymongo.MongoClient('localhost', 27017)
#mydb = client['mydb']
#taobao = mydb['taobao']

# One shared headless browser, used by get_info() and NextPage().
driver = webdriver.PhantomJS()
driver.maximize_window()


def get_info(url,page):
    """Load `url`, print goods name and price for each result, then move on.

    `page` is incremented on entry; while the incremented value is at most 50
    the crawl continues through NextPage(), which calls back into this
    function.
    """
    page += 1
    driver.get(url)
    driver.implicitly_wait(10)

    tree = etree.HTML(driver.page_source)
    for card in tree.xpath('//div[@class="item J_MouserOnverReq "]'):
        # string(.) flattens the link's nested markup into plain text.
        link = card.xpath('div/div/a')[0]
        goods = link.xpath('string(.)').strip()
        price = card.xpath('div/div/div/strong/text()')[0]
        sell = card.xpath('div/div/div[@class="deal-cnt"]/text()')[0]
        shop = card.xpath('div[2]/div[3]/div[1]/a/span[2]/text()')[0]
        address = card.xpath('div[2]/div[3]/div[2]/text()')[0]
        print(goods)
        print(price)

        # Record shape used by the (disabled) MongoDB insert below.
        commodity = {
            'good':goods,
            'price':price,
            'sell':sell,
            'shop':shop,
            'address':address
        }
        # taobao.insert_one(commodity)

    if page <= 50:
        NextPage(url,page)

def NextPage(url,page):
    """Open `url`, click its "next page" link and scrape the page that follows.

    After the click the browser's current URL is reloaded and passed, together
    with `page`, to get_info().
    """
    driver.get(url)
    driver.implicitly_wait(10)
    # Simulate a click on the "next page" link. (The original used `//` here,
    # which is not a Python comment and is a syntax error.)
    driver.find_element_by_xpath('//a[@trace="srp_bottom_pagedown"]').click()
    time.sleep(4)
    driver.get(driver.current_url)
    driver.implicitly_wait(10)
    get_info(driver.current_url,page)

if __name__ == '__main__':
    # Search taobao.com for the keyword, then crawl the result pages starting
    # from whatever URL the search lands on.
    page = 1
    url = 'https://www.taobao.com/'
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        # Look the search box up once and reuse it for clear() + send_keys().
        search_box = driver.find_element_by_id('q')
        search_box.clear()
        search_box.send_keys('男士短袖')
        driver.find_element_by_class_name('btn-search').click()
        get_info(driver.current_url,page)
    finally:
        # Shut the shared PhantomJS process down once the crawl ends or fails.
        driver.quit()

4 scrapy快速使用
在某个目录下,可以执行 scrapy startproject 项目名 来创建项目

然后要抓取的项,写在items.py 中
from scrapy.item import Item,Field

class XiaozhuItem(Item):
    """Item holding the six values the xiaozhu spider extracts from a page."""

    # One Field per value the spider assigns in parse().
    title = Field()
    address = Field()
    price = Field()
    lease_type = Field()
    suggestion = Field()
    bed = Field()

然后在spiders目录下新建立文件:
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from xiaozhu.items import XiaozhuItem

class xiaozhu(CrawlSpider):
    """Spider that turns the single start page into one XiaozhuItem."""

    name = 'xiaozhu'
    start_urls = ['http://bj.xiaozhu.com/fangzi/6937392816.html']

    # NOTE(review): Scrapy's docs advise against overriding parse() on a
    # CrawlSpider that defines rules; this class has no rules — confirm that a
    # plain Spider is not the better base.
    def parse(self, response):
        """Extract the listing fields from `response` and yield them as an item.

        Each XPath query takes its first match, so a missing element raises
        IndexError and nothing is yielded.
        """
        item = XiaozhuItem()
        selector = Selector(response)

        def first_text(query):
            # First extracted string for an XPath query on this page.
            return selector.xpath(query).extract()[0]

        title = first_text('//h4/em/text()')
        address = first_text('//p/span[@class="pr5"]/text()').strip()
        price = first_text('//*[@id="pricePart"]/div[1]/span/text()')
        # The three <li> entries under #introduce are read in document order.
        lease_type = first_text('//*[@id="introduce"]/li[1]/h6/text()')
        suggestion = first_text('//*[@id="introduce"]/li[2]/h6/text()')
        bed = first_text('//*[@id="introduce"]/li[3]/h6/text()')

        extracted = (
            ('title', title),
            ('address', address),
            ('price', price),
            ('lease_type', lease_type),
            ('suggestion', suggestion),
            ('bed', bed),
        )
        for key, value in extracted:
            item[key] = value

        yield item

对于抓取后的字段保存和处理,使用pipeline:
class XiaozhuPipeline(object):
    """Item pipeline that appends every scraped listing to a text file."""

    # Destination file. Kept as a class attribute so a subclass or an instance
    # can redirect the output without touching process_item().
    output_path = 'd:/xiaozhu.txt'

    # Item keys written for each listing, one per line, in this order.
    field_order = ('title', 'address', 'price', 'lease_type', 'suggestion', 'bed')

    def process_item(self, item, spider):
        """Append the item's fields to `output_path` and return the item.

        Args:
            item: mapping that provides every key in `field_order` with a
                string value.
            spider: the spider that produced the item (unused).

        Returns:
            The same `item`, unchanged.

        Raises:
            KeyError: if a field is missing; nothing is written in that case.
        """
        # Build all lines first so a missing key cannot leave a half-written record.
        lines = [item[key] + '\n' for key in self.field_order]
        # `with` closes the handle after each item; the original leaked one
        # open file per processed item.
        with open(self.output_path, 'a') as fp:
            fp.writelines(lines)
        return item

最后进行设置:
# settings.py: register XiaozhuPipeline so scraped items are passed to it.
# NOTE(review): the integer (300) is the pipeline's order value; by Scrapy
# convention lower values run first and values stay in 0-1000 — confirm in docs.
ITEM_PIPELINES = {'xiaozhu.pipelines.XiaozhuPipeline':300}
可以搞个MAIN程序,就可以不在命令行下运行了,在spiders目录下,设置
main.py
from scrapy import cmdline
# Run the "xiaozhu" spider from Python: the command string is split into an
# argument list and handed to scrapy's cmdline.execute().
cmdline.execute("scrapy crawl xiaozhu".split())
5 如果要scrapy 设置请求头和导出CSV,可以
设置settings.py:
# Honour the target site's robots.txt.
ROBOTSTXT_OBEY = True
# Browser-like User-Agent string sent with requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
# Delay between downloads (presumably in seconds — confirm in the Scrapy docs).
DOWNLOAD_DELAY=4
# Export the scraped items as CSV to this file.
# NOTE(review): newer Scrapy releases replace FEED_URI / FEED_FORMAT with the
# FEEDS setting — confirm against the installed version.
FEED_URI = 'file:d:/photo/zhuanti.csv'
FEED_FORMAT = 'csv'


6 scrapy抓取后,保存到MySQL数据库中

import pymysql
class JianshuitPipeline(object):
    """Item pipeline that inserts every scraped item into the `jianshu1` table."""

    def __init__(self):
        """Open the MySQL connection and the cursor used for all inserts."""
        # NOTE(review): credentials are hard-coded; consider reading them from
        # the project settings instead.
        # The connection is kept on self so it can be committed and closed;
        # the original only kept the cursor and never closed anything.
        self.conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='mydb', port=3306, charset='utf8')
        # `post` is the attribute name the original code used for the cursor.
        self.post = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item as a row and commit; return the item unchanged.

        Args:
            item: mapping with the keys user, time, title, view, comment,
                like and gain.
            spider: the spider that produced the item (unused).
        """
        # The connection already selects `mydb` (db='mydb' above), so the
        # per-item "use mydb" statement is not needed.
        # The column is spelled `lik` while the item key is 'like' —
        # presumably because LIKE is a reserved SQL word.
        sql = "insert into jianshu1 (user,time,title,view,comment,lik,gain) values(%s,%s,%s,%s,%s,%s,%s)"
        params = (item['user'],item['time'],item['title'],item['view'],item['comment'],item['like'],item['gain'])
        # Parameterised execute: the driver escapes the values.
        self.post.execute(sql, params)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.post.close()
        self.conn.close()