#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python 爬取所有51VOA网站的Learn a words文本及mp3音频
# (Crawl the text and mp3 audio of every "Learn A Word" lesson on 51VOA; Python 2 script.)
import os
import sys
import time
import urllib as req
from threading import Thread
import urllib2
import urllib
from threading import Thread
import xml
import re
class MyWorkThread(Thread, urllib.FancyURLopener):
    """Worker thread that downloads one byte range of a URL into a part file.

    The part file is opened in append mode, so bytes that are already on
    disk are kept and the transfer resumes right after them.
    """

    def __init__(self, threadname, url, filename, ranges=0):
        """Store the job description; nothing is transferred until start().

        threadname -- thread name, also used in the progress messages.
        url        -- address of the remote file.
        filename   -- local part file this worker appends to.
        ranges     -- (first_byte, last_byte) pair, both ends inclusive.
                      The default of 0 is kept for compatibility only;
                      run() indexes this value and needs a real pair.
        """
        Thread.__init__(self, name=threadname)
        urllib.FancyURLopener.__init__(self)
        self.name = threadname
        self.url = url
        self.filename = filename
        self.ranges = ranges
        # Bytes of this part present on disk; read by the progress display.
        self.downloaded = 0

    def run(self):
        """Fetch the still-missing bytes of the range (Thread entry point)."""
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            # No part file yet: nothing has been downloaded so far.
            self.downloaded = 0
        last_byte = self.ranges[1]
        # Resume after whatever is already stored in the part file.
        self.startpoint = self.ranges[0] + self.downloaded
        # The range end is inclusive, so the part is complete only once the
        # start point has moved *past* the last byte ('>=' would skip the
        # final byte, and a one-byte range would never be fetched at all).
        if self.startpoint > last_byte:
            print('Part %s has been downloaded over.' % self.filename)
            return
        self.oneTimeSize = 8 * 1024  # read 8 KiB per call
        print('task %s will download from %d to %d'
              % (self.name, self.startpoint, last_byte))
        self.addheader('Range', 'bytes=%d-%d' % (self.startpoint, last_byte))
        self.urlhandle = self.open(self.url)
        try:
            # One append-mode handle for the whole transfer instead of
            # reopening the file for every chunk.
            with open(self.filename, 'ab+') as filehandle:
                data = self.urlhandle.read(self.oneTimeSize)
                while data:
                    filehandle.write(data)
                    self.downloaded += len(data)
                    data = self.urlhandle.read(self.oneTimeSize)
        finally:
            # Release the connection even when reading or writing fails.
            self.urlhandle.close()
def GetUrlFileSize(url):
    """Return the size in bytes that the server announces for url.

    The value comes from the Content-Length response header. 0 is returned
    when the header is absent; a non-numeric header value raises ValueError,
    as before.
    """
    urlHandler = urllib.urlopen(url)
    try:
        # getheader() matches the header name exactly but case-insensitively,
        # unlike a substring search for 'Length' over the raw header lines.
        length = urlHandler.info().getheader('Content-Length')
    finally:
        # Only the headers are needed; do not keep the connection open.
        urlHandler.close()
    if length is None:
        return 0
    return int(length.strip())
def SpliteBlocks(totalsize, blocknumber):
    """Split totalsize bytes into blocknumber consecutive byte ranges.

    Returns a list of (first_byte, last_byte) tuples with inclusive ends.
    All ranges but the last have totalsize // blocknumber bytes; the last
    one also takes the remainder and always ends at totalsize - 1.

    Raises ValueError when blocknumber is smaller than 1.
    """
    if blocknumber < 1:
        raise ValueError('blocknumber must be at least 1')
    # Explicit floor division keeps the offsets integral regardless of the
    # division semantics in effect (plain '/' yields floats under true
    # division).
    blocksize = totalsize // blocknumber
    ranges = [(i * blocksize, (i + 1) * blocksize - 1)
              for i in range(blocknumber - 1)]
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges
def isLive(tasks):
    """Return True while at least one thread in tasks is still running.

    tasks -- iterable of threading.Thread objects (may be empty).
    """
    # is_alive() is the supported spelling; the camelCase isAlive() alias is
    # deprecated and no longer exists in recent Python versions.
    return any(task.is_alive() for task in tasks)
def downLoadFile(url, output, blocks=6):
    """Download url to output using several range-request threads.

    url    -- address of the remote file.
    output -- destination path; it is passed through formatFileName()
              before use.
    blocks -- number of byte ranges, one worker thread each.

    Every worker appends to its own part file. Once all workers have
    stopped, the parts are concatenated into the destination file and
    removed. When the server reports no usable size the download is
    skipped, because the file cannot be split into ranges.
    """
    sys.stdout.write('Begin to download from %s\n' % url)
    sys.stdout.flush()
    size = GetUrlFileSize(url)
    if size <= 0:
        # With size 0 no worker writes a part file and the merge step
        # below would fail on the missing parts.
        sys.stdout.write('Unknown file size, download skipped.\n')
        sys.stdout.flush()
        return
    output = formatFileName(output)
    ranges = SpliteBlocks(size, blocks)
    threadname = ['thread_%d' % i for i in range(blocks)]
    # Part files are named after the destination so that leftovers from a
    # different, interrupted download are never resumed into this one.
    filename = ['%s.part%d' % (output, i) for i in range(blocks)]
    tasks = []
    for i in range(blocks):
        task = MyWorkThread(threadname[i], url, filename[i], ranges[i])
        # Daemon threads do not keep the interpreter alive on exit.
        task.daemon = True
        task.start()
        tasks.append(task)
    # Give the workers a moment to connect before polling their progress.
    time.sleep(2)
    while isLive(tasks):
        downloaded = sum([task.downloaded for task in tasks])
        process = downloaded / float(size) * 100
        show = u'\rFilesize: %d Downloaded:%d Completed: %.2f%%' % (size, downloaded, process)
        sys.stdout.write(show)
        # The call parentheses were missing, so the line was never flushed.
        sys.stdout.flush()
        time.sleep(1)
    # Concatenate the parts in range order, deleting each one once copied.
    with open(output, 'wb+') as filehandle:
        for part in filename:
            with open(part, 'rb') as f:
                filehandle.write(f.read())
            os.remove(part)
    sys.stdout.write("Completed!\n")
    sys.stdout.flush()
def formatFileName(filename):
    """Replace characters that are illegal in file names with spaces.

    Only the last path component is cleaned; the directory part returned by
    os.path.split() is kept unchanged. The characters replaced are
    \\ / : * ? " < > and |.

    Returns the cleaned path, or the string 'None' when filename is not a
    str.
    """
    if not isinstance(filename, str):
        return 'None'
    header, tail = os.path.split(filename)
    # str.replace() is a no-op when the character is absent (and on an empty
    # tail), so no find() or emptiness check is needed first. The local is
    # no longer called 'tuple', which shadowed the builtin.
    for char in ('\\', '/', ':', '*', '?', '"', '<', '>', '|'):
        tail = tail.replace(char, ' ')
    return os.path.join(header, tail)
def remove_tags(raw_html):
    """Return raw_html with every '<...>' tag-like span deleted."""
    # Non-greedy: each '<' is paired with the nearest following '>' on the
    # same line ('.' does not match a newline here).
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
def saveword(url, name):
    """Scrape one word page: append its text to the log and fetch its mp3.

    url  -- address of the word page.
    name -- title of the entry; written as the heading line in
            LearningWord.txt and, with unsafe characters replaced, used as
            the mp3 file name.

    The page is scanned line by line for the mp3 player markup and for the
    line opening the content <div>. The mp3 is downloaded only when the
    player markup and a usable link were found.

    NOTE(review): the '<p>' check is treated as nested under the
    content-div check (take the paragraph directly following the div) --
    confirm against the intended page layout.
    """
    res = req.urlopen(url)
    try:
        data = res.readlines()
    finally:
        res.close()
    startag = r'id="mp3"'
    endtag = r'</div>'
    data2 = ''  # line carrying the mp3 player markup
    data3 = ''  # line that opens the content <div>
    data4 = ''  # the <p> line following it, with tags removed
    # The first 80 and the last 10 lines are skipped -- presumably page
    # header/footer markup; the trailing margin also keeps data[k + 1] in
    # bounds. When a marker occurs more than once, the last match wins.
    for k in range(80, len(data) - 10):
        line = data[k]
        if startag in line:
            data2 = line
        if '<div id="content">' in line:
            data3 = line
            if '<p>' in data[k + 1]:
                data4 = remove_tags(data[k + 1])
    # Start at the usual introductory phrase. If it is missing, keep the
    # whole line rather than letting find()'s -1 act as a slice index,
    # which silently dropped the text.
    start = data3.find('今天我们要学')
    if start == -1:
        start = 0
    end = data3.find(endtag)
    if end != -1:
        sent = data3[start:end]
    else:
        # The <div> is not closed on this line: the text continues in the
        # following paragraph.
        sent = data3[start:].strip('\n').strip('\r') + data4.strip('\n')
    sent = remove_tags(sent)
    with open('LearningWord.txt', 'a+') as f:
        f.write(name + '\n' + sent.strip('\r') + '\n')
    if startag in data2:
        url_start = data2.find('http')
        url_end = data2.find(' title="')
        # Download only when both delimiters exist and enclose something;
        # a -1 from find() must not be used as a slice bound.
        if url_start != -1 and url_end > url_start:
            # '- 1' drops the closing quote in front of ' title="'.
            mp3url = data2[url_start:url_end - 1]
            downLoadFile(mp3url,
                         str(formatFileName(name.replace(':', ' '))) + '.mp3',
                         blocks=4)
def savepage(url):
    """Process every word linked from one index page.

    url -- address of the index page.

    The link list between '<ul><li>' and '</li></ul>' is cut out of the
    page, then each link in it is turned into a (page URL, word) pair and
    handed to saveword(). Parsing stops at the first link that lacks one of
    the expected delimiters.
    """
    res = req.urlopen(url)
    try:
        data = str(res.read())
    finally:
        res.close()
    startag = '<ul><li>'
    endtag = '</li></ul>'
    list_start = data.find(startag)
    list_end = data.find(endtag)
    if list_start == -1 or list_end == -1:
        # No link list on this page: nothing to do.
        return
    # Offset 12 is kept from the original: it skips the 8-character list
    # opener plus four more characters (presumably fixed markup of the
    # site -- TODO confirm).
    data2 = data[list_start + 12:list_end]
    linestart = 'href'
    meddle = '" target'
    lineend = '</a>'
    i = data2.find(linestart)
    while i != -1:
        # Look for the delimiters after the current 'href', not from the
        # start of the remaining text.
        k = data2.find(meddle, i)
        if k == -1:
            break
        j = data2.find(lineend, k)
        if j == -1:
            break
        # i + 6 skips 'href="'; k + 16 skips the 16 characters from the
        # closing quote of the href value up to the link text.
        word_url = 'http://www.51voa.com/' + data2[i + 6:k]
        word = data2[k + 16:j]
        print('%s %s %s %s %s' % (i, k, j, word, word_url))
        # Drop the link that has just been handled before searching on.
        data2 = data2[j + 3:]
        saveword(word_url, word)
        i = data2.find(linestart)
# Download all words.
# Guarded so that importing this module does not start a crawl.
if __name__ == '__main__':
    # Start from an empty log file; saveword() appends one entry per word.
    with open('LearningWord.txt', 'w'):
        pass
    # Index pages 53 and 54 (both bounds of the original loop included).
    for i in range(53, 55):
        savepage('http://www.51voa.com/Learn_A_Word_' + str(i) + '.html')
# Download one specific word instead:
#url = "http://www.51voa.com/Voa_English_Learning/Learn_A_Word_21951.html"
#name ='9:pop up'
#saveword(url,name)