-
pyspider分类的最新文章
-
最新文章
-
目录
-
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-11-21 07:17:36
# Project: xdf
"""pyspider project that crawls course pages reachable from koolearn.com.

The crawl starts at the koolearn home page, follows the links found in the
``.snavbx`` navigation block, and dispatches every absolute link on those
pages to one of three detail-page parsers based on the URL shape.
"""

from pyspider.libs.base_handler import *
import re

# URL shapes recognised by ``Handler.list_page``. The patterns are anchored
# at the start only (``re.match`` semantics), so trailing query strings or
# fragments are still accepted. Dots are escaped so they match literally.
_NEW_DETAIL_URL_RE = re.compile(
    r'http://www\.koolearn\.com/product/c_(\d+_\d+)\.html')
_OLD_DETAIL_URL_RE = re.compile(
    r'http://www\.koolearn\.com/product/(\d+_\d+)\.html')
_GAODUN_DETAIL_URL_RE = re.compile(
    r'http://wxlm\.gaodun\.com/Public/jsShow_last/tag/(\d+)')

# Everything except digits, commas and dots is stripped from price text.
_PRICE_NOISE_RE = re.compile(r'[^0-9,.]')


def getId(url):
    """Return the last path segment of *url*, cut at its first dot.

    Example: ``'http://www.koolearn.com/product/c_12_345.html'`` yields
    ``'c_12_345'``. A string without any ``/`` is treated as a single
    segment, so ``'abc.html'`` yields ``'abc'``.
    """
    last_segment = url.rsplit('/', 1)[-1]
    return last_segment.split('.', 1)[0]


def _clean_price(text):
    """Strip every character that is not a digit, comma or dot from *text*."""
    return _PRICE_NOISE_RE.sub("", text)


class Handler(BaseHandler):
    """Crawler entry points and page parsers for the ``xdf`` project."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the koolearn home page as the crawl seed."""
        self.crawl('http://www.koolearn.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every link inside the ``.snavbx`` block to a list page."""
        for each in response.doc('.snavbx a').items():
            href = each.attr.href
            # Anchors without an href have nothing to crawl.
            if href:
                self.crawl(href, callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def list_page(self, response):
        """Route each absolute link on the page to the matching parser.

        Links that match none of the known URL shapes are ignored.
        """
        for each in response.doc('a[href^=http]').items():
            url = each.attr.href
            if _NEW_DETAIL_URL_RE.match(url):
                self.crawl(url, callback=self.detail_page_new)
            elif _OLD_DETAIL_URL_RE.match(url):
                self.crawl(url, callback=self.detail_page_old)
            elif _GAODUN_DETAIL_URL_RE.match(url):
                self.crawl(url, callback=self.detail_page_gaodun)

    # New-style detail page
    def detail_page_new(self, response):
        """Extract id, name, category and price from a ``c_<n>_<n>`` page."""
        price = _clean_price(response.doc('.p-price span').text())
        return {
            "edu_id": getId(response.url),
            "name": response.doc('.p-content-head-title h1').text(),
            "categoryName": response.doc('.f1 a').text(),
            "price": price
        }

    # Old-style detail page
    def detail_page_old(self, response):
        """Extract id, name, category and price from a ``<n>_<n>`` page."""
        price = _clean_price(response.doc('.pri_num').text())
        return {
            "edu_id": getId(response.url),
            "name": response.doc('.add_box a:last-child').text(),
            "categoryName": response.doc('h1').text(),
            "price": price
        }

    # Gaodun course page
    def detail_page_gaodun(self, response):
        """Extract id, name, category and price from a gaodun.com page."""
        price = _clean_price(
            response.doc('div.m-c-tit span.d-price').text())
        return {
            "edu_id": getId(response.url),
            "name": response.doc('h2.d-tit').text(),
            "categoryName": response.doc(
                '.m-c-crumbs span:nth-child(2)').text(),
            "price": price
        }
1
收藏
Ctrl+Enter 发布
发布
取消