#!/usr/bin/python
# -*- coding: utf-8 -*-
# Simple scraping crawler
# 1. Scrapes Yahoo! Answers; with a few changes to the parseData function it can scrape any site
# 2. Requires sqlite3 or pysqlite
# 3. Can run on DreamHost.com shared hosting
# 4. The User-Agent can be changed to impersonate a search-engine spider
# 5. A pause interval can be set to control the scraping speed
# 6. Scraping Yahoo gets your IP banned for several hours, so this crawler is of limited practical use
# Author: Lukin
# Date  : 2008-09-25
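# Quick start (illustrative summary):
#   - run the script with Python 2 (it relies on httplib, urlparse and "print" statements)
#   - on first run it creates ./database.db in the current working directory
#   - it starts crawling from http://answers.yahoo.com/ and works through the
#     URL queue kept in the [collect] table
#   - raise the "sleep" value below to throttle the request rate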
# Import the modules needed for scraping
import re, sys, time
import httplib, os.path as osp
from urlparse import urlparse
# Use an SQLite database; written this way for compatibility with DreamHost.com hosting
try:
    import sqlite3 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite
# Scraping speed control, in seconds
sleep = 0
# Database path
dbname = './database.db'
# Request headers to send
headers = {
    "Accept"    : "*/*",
    "Referer"   : "http://answers.yahoo.com/",
    "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
}
# Connect to the server
dl = httplib.HTTPConnection('answers.yahoo.com')
# Connect to the database
conn = sqlite.connect(osp.abspath(dbname))

# Create the database schema
def createDatabase():
    global conn
    # CREATE ... IF NOT EXISTS makes this safe to call on every start-up
    c = conn.cursor()
    # Table that stores the list of URLs to scrape
    c.execute('''CREATE TABLE IF NOT EXISTS [collect]([cid] INTEGER PRIMARY KEY,[curl] TEXT,[state] INTEGER DEFAULT '0',UNIQUE([curl]));''')
    c.execute('''CREATE INDEX IF NOT EXISTS [collect_idx_state] ON [collect]([state]);''')
    # Table for categories
    c.execute('''CREATE TABLE IF NOT EXISTS [sorts]([sortid] INTEGER PRIMARY KEY,[sortname] TEXT,[sortpath] TEXT,[sortfoot] INTEGER DEFAULT '0',[sortnum] INTEGER DEFAULT '0',UNIQUE([sortpath]));''')
    c.execute('''CREATE INDEX IF NOT EXISTS [sorts_idx_sortname] ON [sorts]([sortname]);''')
    c.execute('''CREATE INDEX IF NOT EXISTS [sorts_idx_sortfoot] ON [sorts]([sortfoot]);''')
    # Table for articles
    c.execute('''CREATE TABLE IF NOT EXISTS [article]([aid] INTEGER PRIMARY KEY,[sortid] INTEGER DEFAULT '0',[hits] INTEGER DEFAULT '0',[title] TEXT,[path] TEXT,[question] TEXT,[banswer] TEXT,[oanswer] TEXT,UNIQUE([path]));''')
    c.execute('''CREATE INDEX IF NOT EXISTS [article_idx_sortid] ON [article]([sortid]);''')
    # Commit the transaction
    conn.commit()
    c.close()
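# Summary of the schema as it is used below (derived from the code, for reference):
#   collect : queue of URLs; [state] 0 = not yet fetched, 1 = article stored,
#             2 = fetched but nothing stored, 3 = fetch failed (retried later,
#             see getOneUrl, which picks state IN (0,3))
#   sorts   : categories built from the breadcrumb trail; [sortfoot] points to
#             the parent category's row id
#   article : one row per question, keyed by the slug stored in [path]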
# Fetch one URL over HTTP and hand the page to parseData
def collect(url="http://answers.yahoo.com/"):
    global dl, headers
    R = 0
    print "GET:", url
    urls = urlparse(url); path = urls[2]
    if urls[4] != '' : path += '?' + urls[4]
    # Up to three attempts, pausing 3 seconds between retries
    dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse()
    if rs.status == 200:
        R = parseData(rs.read(), url)
    else:
        rs.read()  # discard the body so the persistent connection can be reused
        print "3 seconds, try again ..."; time.sleep(3)
        dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse()
        if rs.status == 200:
            R = parseData(rs.read(), url)
        else:
            rs.read()
            print "3 seconds, try again ..."; time.sleep(3)
            dl.request(method="GET", url=path, headers=headers); rs = dl.getresponse()
            if rs.status == 200:
                R = parseData(rs.read(), url)
            else:
                rs.read()
                print "Continue to collect ..."; R = 3
    # Update the record's state and return the result
    updateOneUrl(url, R)
    return R
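# Illustrative call (hypothetical URL, shown only to document the flow):
#   collect('http://answers.yahoo.com/question/index?qid=XXXX')
# fetches the page, lets parseData() queue any new links and store the article,
# writes the resulting state back to [collect] and returns 1 (article saved),
# 2 (nothing saved) or 3 (all three requests failed).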
# Process the scraped page data
def parseData(html, url):
    global dl, conn
    R = 2
    c = conn.cursor()
    # Normalise the HTML and rewrite its links to absolute URLs
    format = formatURL(clearBlank(html), url)
    # Extract all links
    urls = re.findall(r'''(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href='([^']+)'[^>]*?>)''', format, re.I)
    if urls != None :
        i = 0
        # Loop over all links
        for regs in urls :
            # Get a single URL
            sUrl = en2chr(regs[1].strip())
            # If the URL matches the rules, insert it into the queue
            if re.search('http(.*?)/(dir|question)/index(.*?)', sUrl, re.I) != None :
                if re.search('http(.*?)/dir/index(.*?)', sUrl, re.I) != None:
                    if sUrl.find('link=list') == -1 and sUrl.find('link=over') == -1:
                        sUrl += '&link=over'
                    else:
                        sUrl = sUrl.replace('link=list', 'link=over')
                if sUrl[-11:] == 'link=mailto' : continue
                try:
                    c.execute('INSERT INTO [collect]([curl]) VALUES(?);', (sUrl,))
                    i = i + 1
                except sqlite.IntegrityError :
                    pass
        if i > 0 : print "Message: %d new URL(s) queued." % (i,)
    # Extract the article data
    if re.search('http(.*)/question/index(.*)', url, re.I) != None :
        sortfoot = 0
        # Automatically create the categories and their parent/child relations
        # from the breadcrumb trail.
        # NOTE: the literal HTML start/end markers passed to sect() in this section
        # were lost when the listing was published; the UPPER_CASE strings below are
        # placeholders and must be replaced with the real markup of the pages.
        guide = sect(format, 'BREADCRUMB_START_MARKER', 'BREADCRUMB_END_MARKER', '((.*?)Home(.*?))')
        aGuide = re.findall('<a[^>]*href="[^"]*"[^>]*>(.*?)</a>', guide, re.I)
        if aGuide != None :
            sortname = ""
            for sortname in aGuide :
                sortname = sortname.strip()
                sortpath = en2path(sortname)
                # Check whether the category already exists
                c.execute('SELECT [sortid],[sortname] FROM [sorts] WHERE [sortpath]=? LIMIT 0,1;', (sortpath,))
                row = c.fetchone()
                # Category does not exist yet, add it
                if row == None :
                    c.execute('INSERT INTO [sorts]([sortname],[sortpath],[sortfoot]) VALUES(?,?,?);', (sortname, sortpath, sortfoot))
                    sortfoot = c.lastrowid
                else:
                    sortfoot = row[0]
        # Title (placeholder markers, see NOTE above)
        title = sect(format, 'TITLE_START_MARKER', 'TITLE_END_MARKER')
        # Best answer (regex markers; the original pattern contained "Best Answer(.*?)")
        BestAnswer = sect(format, '(BEST_ANSWER_START_MARKER Best Answer(.*?))', '(BEST_ANSWER_END_MARKER)')
        # If there is no best answer, the page is not collected
        if BestAnswer != None :
            # Article path
            path = en2path(sortname + '-' + title.strip())
            # Question text and its optional "additional details" block
            adddata = sect(format, 'ADDED_DETAILS_START_MARKER', 'ADDED_DETAILS_END_MARKER')
            content = sect(format, '(QUESTION_START_MARKER(.*?))', '(QUESTION_END_MARKER)')
            if adddata != None : content += 'SEPARATOR_MARKUP' + adddata
            # Other answers
            OtherAnswer = ''
            for regs in re.findall('OTHER_ANSWER_BLOCK_REGEX', format, re.I):
                if regs.find('EXCLUDE_MARKER_1') == -1 and regs.find('EXCLUDE_MARKER_2') == -1:
                    # a1 (the answerer) was also extracted in the original; how it was
                    # embedded in the output markup was lost, so only a2 is used here
                    a1 = sect(regs, 'ANSWERER_START_MARKER', 'ANSWERER_END_MARKER')
                    a2 = sect(regs, 'ANSWER_TEXT_START_MARKER', 'ANSWER_TEXT_END_MARKER')
                    OtherAnswer += 'ITEM_START_MARKUP' + a2 + 'ITEM_END_MARKUP'
            OtherAnswer += 'LIST_END_MARKUP'
            # If both pieces were extracted, the scrape succeeded
            if title != None and content != None :
                # Write the data to the database
                try:
                    c.execute('INSERT INTO [article]([sortid],[title],[path],[question],[banswer],[oanswer]) VALUES(?,?,?,?,?,?);', (sortfoot, title, path, content, BestAnswer, OtherAnswer))
                    print "Message:%s.html" % (path,)
                    R = 1
                except sqlite.IntegrityError :
                    pass
    # Commit the writes and return
    conn.commit(); c.close()
    return R
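# The header notes that adapting parseData() is enough to target another site;
# in practice you would also point the HTTPConnection at the new host and swap
# the sect() markers, e.g. (hypothetical markers for some other page layout):
#   title   = sect(format, '<h1 class="entry-title">', '</h1>')
#   content = sect(format, '<div class="entry-body">', '</div>')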
# Fetch one pending URL from the queue
def getOneUrl():
    global conn
    c = conn.cursor()
    c.execute('SELECT [curl] FROM [collect] WHERE [state] IN(0,3) LIMIT 0,1;')
    row = c.fetchone(); c.close()
    if row == None : return ""
    return row[0].encode('utf-8')
# Update the state of one queued URL
def updateOneUrl(url, state):
    global conn
    c = conn.cursor()
    c.execute('UPDATE [collect] SET [state]=? WHERE [curl]=?;', (state, url))
    conn.commit(); c.close()
# Strip redundant whitespace from the HTML
def clearBlank(html):
    if len(html) == 0 : return ''
    html = re.sub('\r|\n|\t', '', html)
    # Collapse repeated spaces (and non-breaking-space entities) into single spaces
    while html.find('  ') != -1 or html.find('&nbsp;') != -1:
        html = html.replace('  ', ' ').replace('&nbsp;', ' ')
    return html
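# Example (illustrative): clearBlank('a \n  b\t c') returns 'a b c' -- newlines and
# tabs are dropped first, then runs of spaces collapse to a single space.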
# Rewrite every link in the HTML to an absolute URL
def formatURL(html, url):
    urls = re.findall('''(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href='([^']+)'[^>]*?>)''', html, re.I)
    if urls == None : return html
    for regs in urls :
        html = html.replace(regs[0], matchURL(regs[0], url))
    return html
# Rewrite a single tag's URL to an absolute URL
def matchURL(tag, url):
    urls = re.findall('''(.*)(src|href)=(.+?)( |/>|>).*|(.*)url\(([^\)]+)\)''', tag, re.I)
    if len(urls) == 0 :
        return tag
    else:
        if urls[0][5] == '':
            urlQuote = urls[0][2]
        else:
            urlQuote = urls[0][5]
        if len(urlQuote) > 0 :
            cUrl = re.sub('''['"]''', '', urlQuote)
        else:
            return tag
    urls = urlparse(url); scheme = urls[0]
    if scheme != '' : scheme += '://'
    host = urls[1]; host = scheme + host
    if len(host) == 0 : return tag
    path = osp.dirname(urls[2])
    if path == '/' : path = ''
    if cUrl.find("#") != -1 : cUrl = cUrl[:cUrl.find("#")]
    # Decide the URL type
    if re.search('''^(http|https|ftp):(//|\\\\)(([\w/\\\+\-~`@:%])+\.)+([\w/\\\.\=\?\+\-~`@':!%#]|(&amp;)|&)+''', cUrl, re.I) != None :
        # URLs that already start with a scheme are left as they are
        return tag
    elif cUrl[:1] == '/':
        # Root-relative path
        cUrl = host + cUrl
    elif cUrl[:3] == '../':
        # Relative path: strip the '../' segments and climb the directory tree
        while cUrl[:3] == '../':
            cUrl = cUrl[3:]
            if len(path) > 0 :
                path = osp.dirname(path)
    elif cUrl[:2] == './':
        cUrl = host + path + cUrl[1:]
    elif cUrl.lower()[:7] == 'mailto:' or cUrl.lower()[:11] == 'javascript:':
        return tag
    else:
        cUrl = host + path + '/' + cUrl
    R = tag.replace(urlQuote, '"' + cUrl + '"')
    return R
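# Example (illustrative): with the page URL 'http://answers.yahoo.com/dir/index',
#   matchURL('<a href="/question/index?qid=1">', 'http://answers.yahoo.com/dir/index')
# returns '<a href="http://answers.yahoo.com/question/index?qid=1">' -- the
# root-relative href is rebuilt against the page's host.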
# Extract the part of an HTML string between two markers
def sect(html, start, end, cls=''):
    if len(html) == 0 : return
    # Regex extraction: markers wrapped in parentheses are treated as regular expressions
    if start[:1] == chr(40) and start[-1:] == chr(41) and end[:1] == chr(40) and end[-1:] == chr(41) :
        reHTML = re.search(start + '(.*?)' + end, html, re.I)
        if reHTML == None : return
        reHTML = reHTML.group()
        intStart = re.search(start, reHTML, re.I).end()
        intEnd = re.search(end, reHTML, re.I).start()
        R = reHTML[intStart:intEnd]
    # Plain string extraction
    else:
        # Position of the start string
        intStart = html.lower().find(start.lower())
        # If the start string is not found, return nothing
        if intStart == -1 : return
        # Position of the end string
        intEnd = html[intStart+len(start):].lower().find(end.lower())
        # If the end string is not found, also return nothing
        if intEnd == -1 : return
        # Both markers found, take the slice between them
        R = html[intStart+len(start):intStart+intEnd+len(start)]
    # Clean the extracted content
    if cls != '':
        R = clear(R, cls)
    # Return the extracted string
    return R
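# Examples (illustrative):
#   sect('<b>Hi</b>', '<b>', '</b>')            returns 'Hi'    (plain-string branch)
#   sect('<h1>Title</h1>', '(<h1>)', '(</h1>)') returns 'Title' (regex branch, because
#                                               both markers are wrapped in parentheses)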
# Strip unwanted fragments, one rule per line; parenthesised rules are regexes
def clear(html, regexs):
    if regexs == '' : return html
    for regex in regexs.split(chr(10)):
        regex = regex.strip()
        if regex != '':
            if regex[:1] == chr(40) and regex[-1:] == chr(41):
                # re.sub takes flags via a compiled pattern, not as a fourth positional argument
                html = re.sub(re.compile(regex, re.I|re.S), '', html)
            else:
                html = html.replace(regex, '')
    return html
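# Example (illustrative): clear('foo<br>bar', '<br>') returns 'foobar'; a rule wrapped
# in parentheses, such as '(<a[^>]*>)', is applied as a regular expression instead.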
# Convert a string into a URL-safe slug
def en2path(enStr):
    return re.sub(re.compile('[\W]+', re.I|re.U), '-', en2chr(enStr)).strip('-')
# Replace HTML entities with normal characters
def en2chr(enStr):
    return enStr.replace('&amp;', '&')
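# Example (illustrative): en2path('Cars &amp; Transportation') returns
# 'Cars-Transportation' -- the entity is decoded by en2chr, then every run of
# non-word characters becomes a single '-'.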
# ------------------------------------- Main program -------------------------------------------
# First create the database
createDatabase()
# Start scraping
loops = 0
while True:
    if loops > 0 :
        url = getOneUrl()
        if url == "":
            loops = 0
        else:
            loops = collect(url)
    else:
        loops = collect()
    # Pause to throttle the scraping speed
    time.sleep(sleep)
    if loops == 0 : break
# Close the HTTP connection
dl.close()
# Exit the program
sys.exit()