For now this is just something I copied down blindly.
# The reload(sys)/setdefaultencoding call in __main__ works around
# Python 2's default ASCII codec (arguably a Python bug).
import urllib
from BeautifulSoup import BeautifulSoup
import sys
# out_file is opened in __main__ and used as a module-level global below.
def getWebContent(url, word):
    html = urllib.urlopen(url).read()
    # Baidu's dict pages are gb2312-encoded; re-encode as utf-8 for BeautifulSoup.
    html = unicode(html, "gb2312", "ignore").encode("utf-8", "ignore")
    soup = BeautifulSoup(html)
    # filter 1: keep only the definition block
    data = str(soup.find("div", {"class": "explain"}))
    # Note: Python 2's default str coding is ASCII, but str() of a tag yields
    # UTF-8 bytes here, because that is the encoding BeautifulSoup was fed.
    # filter 2: re-parse the snippet and pull out only the text nodes
    soup = BeautifulSoup(data)
    # BeautifulSoup generators:
    # http://www.crummy.com/software/BeautifulSoup/documentation.zh.html#Generators
    outtext = ''.join([element for element in soup.recursiveChildGenerator()
                       if isinstance(element, unicode)])
    # Rendering: start a new line before each numbered sense (1-9).
    for item in range(1, 10):
        outtext = outtext.replace(str(item), "\n%s" % item)
    outtext = outtext.replace(" ", "\n")
    outtext = word + ":\n" + outtext + "\n"
    out_file.write(outtext)
    # outtext is unicode here; encode it for the gbk (Windows) console.
    print outtext.encode("gbk", "ignore")
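
# A minimal illustration (toy markup, not the real page) of the generator
# trick above: Tags are skipped, and since NavigableString subclasses
# unicode, only the visible text survives.
#
#     soup = BeautifulSoup("<div>word <b>1.</b> sense</div>")
#     text = ''.join([e for e in soup.recursiveChildGenerator()
#                     if isinstance(e, unicode)])
#     # text == u'word 1. sense'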
def word_FromFile():
    word_file = open("F:/Whu/EnghlishWords.txt", "r")
    for word in word_file.readlines():
        print isinstance(word, unicode)   # debug: file input is a byte str, so this is False
        print word.decode("utf-8")
        # Careful: the words were saved as UTF-8 in Notepad, which prepends a
        # 3-byte BOM to the file, so the first word carries those extra bytes:
        #     if word[:3] == codecs.BOM_UTF8:
        #         word = word[3:]
        # (a runnable version, strip_bom, is sketched after this function)
        word = word.strip()   # drop the trailing newline before building the URL
        url = "http://dict.baidu.com/s?wd=%s" % word
        getWebContent(url, word)
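
# A minimal runnable sketch of the BOM stripping described in the comments
# above; strip_bom is a hypothetical helper name, not part of the original.
import codecs
def strip_bom(data):
    # Notepad's "UTF-8" saves prepend the 3-byte BOM (EF BB BF);
    # codecs.BOM_UTF8 is exactly those bytes.
    if data[:3] == codecs.BOM_UTF8:
        data = data[3:]
    return data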
if __name__ == '__main__':
    # Restore sys.setdefaultencoding (site.py deletes it) so that implicit
    # str/unicode conversions above use utf-8 instead of ASCII.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    out_file = open("F:/Whu/EnghlishWords_translate.txt", 'w')
    word_FromFile()
    out_file.flush()
    out_file.close()
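
Why the reload(sys)/setdefaultencoding hack matters: in Python 2, mixing a
byte str with unicode (as word + ":\n" + outtext does above) triggers an
implicit decode with the default codec, which is ASCII out of the box. A
hypothetical interpreter session:

    >>> '中文' + u':'
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 0: ordinal not in range(128)
    >>> reload(sys); sys.setdefaultencoding('utf-8')
    >>> '中文' + u':'    # the implicit decode now uses utf-8
    u'\u4e2d\u6587:'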