关于多线程操作。
对于IO操作,如访问网站,写入磁盘这种需要时间等待响应的操作,多个cpu也几乎不能提高效率。
对于CPU密集型操作,如这个格式转换,可以通过多个cpu同时去进行。
但是对于python(CPython)来讲,由于存在GIL(全局解释器锁),同一时刻只有一个线程能够执行python字节码。所以python的多线程实际上只适合用于IO操作;CPU密集型操作想要利用多个cpu,需要使用多进程。
1 #coding:utf8
2 import csv
3 from xml.etree.ElementTree import Element,ElementTree
4 import requests
5 from StringIO import StringIO
6 from test_retractxml import pretty
7
def download(url):
    """Fetch *url* and return its body wrapped in a file-like object.

    The request is I/O bound: the thread sends the request, then sleeps
    while waiting for the data and gives up the CPU in the meantime.

    Returns a ``StringIO`` over the response body, or ``None`` when the
    server answers with an error status or the request itself fails
    (timeout, connection error, ...).
    """
    try:
        response = requests.get(url, timeout=3)
    except requests.RequestException:
        # Report a network failure the same way as a bad status instead of
        # letting the exception kill the calling thread.
        return None
    if not response.ok:
        return None
    return StringIO(response.content)
13
def csvToxml(scsv, fxml):
    """Convert CSV data read from *scsv* into XML written to *fxml*.

    The first CSV row supplies the tag names (with spaces removed). Every
    following row becomes a ``Row`` element under a ``Data`` root, holding
    one child element per column. This step is CPU bound.
    """
    rows = csv.reader(scsv)
    # Header cells may contain spaces, which are not allowed in tag names.
    tags = [name.replace(' ', '') for name in rows.next()]

    data = Element('Data')
    for record in rows:
        row_elem = Element('Row')
        for tag, value in zip(tags, record):
            cell = Element(tag)
            cell.text = value
            row_elem.append(cell)
        data.append(row_elem)

    # pretty() comes from test_retractxml; presumably it indents the tree
    # in place -- TODO confirm against that module.
    pretty(data)
    ElementTree(data).write(fxml)
32
33
34 def handle(sid):
35 print 'Download ...(%d)' % sid
36 url = 'http://table.finance.yahoo.com/table.csv?s=%s.sz'
37 url %= str(sid).rjust(6,'0')
38 rf = download(url)
39 if rf is None:return
40
41 print 'Convert to XML...(%d)' % sid
42 fname = str(sid).rjust(6,'0')+'.xml'
43 with open(fname,'wb') as wf:
44 csvToxml(rf, wf)
45
46 from threading import Thread
47
48 '''
49 t = Thread(target=handle,args=(1,))
50 t.start()
51
52 print 'main thread'
53 '''
class MyThread(Thread):
    """Thread that runs ``handle`` for a single stock id."""

    def __init__(self, sid):
        super(MyThread, self).__init__()
        # Stock id passed to handle() once the thread is started.
        self.sid = sid

    def run(self):
        """Thread body: process the stock id given at construction."""
        handle(self.sid)
61
62 threads = []
63 for i in xrange(1,11):
64 t = MyThread(i)
65 threads.append(t)
66 t.start()
67
68 for t in threads:
69 t.join()
70
71 print 'main thread'
72 #t.join()#阻塞函数,保证主线程在所有子线程结束后再退出
73
74
75 '''
76 #这是串行的方法
77 for sid in xrange(1,11):
78 print 'Download ...(%d)' % sid
79 url = 'http://table.finance.yahoo.com/table.csv?s=%s.sz'
80 url %= str(sid).rjust(6,'0')
81 rf = download(url)
82 if rf is None:continue
83
84 print 'Convert to XML...(%d)' % sid
85 fname = str(sid).rjust(6,'0')+'.xml'
86 with open(fname,'wb') as wf:
87 csvToxml(rf, wf)
88 '''
线程间通信可以用全局变量,但是不够安全;更好的做法是用Queue.Queue来传递通信内容,因为Queue是线程安全的队列。
1 #coding:utf8
2 import requests
3 import csv
4 from xml.etree.ElementTree import Element,ElementTree
5 from test_retractxml import pretty
6 from threading import Thread
7 from StringIO import StringIO
8
9 from Queue import Queue
10
11
12 class DownloadThread(Thread):
13
14 def __init__(self,sid,queue):
15 Thread.__init__(self)
16 self.sid = sid
17 self.url = 'http://table.finance.yahoo.com/table.csv?s=%s.sz'
18 self.url %=str(sid).rjust(6,'0')
19 self.queue = queue
20
21 def download(self,url):
22 response = requests.get(url,timeout=3)
23 if response.ok:
24 return StringIO(response.content)
25
26 def run(self):
27 print'download',self.sid
28 data = self.download(self.url)
29 self.queue.put((self.sid,data))
30
31
32 class ConverThread(Thread):
33 def __init__(self,queue):
34 Thread.__init__(self)
35 self.queue = queue
36
37 def csvToxml(self,rf,wf):
38 reader = csv.reader(rf)
39 headers = reader.next()
40 headers = map(lambda h:h.replace(' ',''),headers)
41
42 root = Element('Data')
43 for row in reader:
44 eRow = Element('Row')
45 root.append(eRow)
46 for tag,text in zip(headers,row):
47 e = Element(tag)
48 e.text = text
49 eRow.append(e)
50
51 pretty(root)
52 et = ElementTree(root)
53 et.write(wf)
54
55 def run(self):
56 while True:
57 sid,data = self.queue.get()
58 print 'Convert', sid
59 if sid == -1:
60 break
61 if data:
62 fname = str(sid).rjust(6,'0')+'.xml'
63 with open(fname,'wb') as wf:
64 self.csvToxml(data, wf)
65
66
67
# Queue shared by the download threads and the converter thread.
q = Queue()
cThread = ConverThread(q)
dThreads = []
for i in xrange(1, 11):
    dThreads.append(DownloadThread(i, q))

# Several threads do the downloading ...
for t in dThreads:
    t.start()

# ... while a single thread does the converting.
cThread.start()

# After every download thread has finished, queue the (-1, None) stop
# marker for the converter.
for t in dThreads:
    t.join()
q.put((-1, None))
由于全局锁GIL的存在,无法用多个线程来并行执行CPU密集型操作,所以此例子中:1,用多个线程来进行IO操作(下载);2,将所有下载的内容传给1个线程进行转换。它们之间的数据交换是通过Queue这个线程安全的队列完成的。
而线程之间的事件通知,需要使用threading库里的Event。事件的等待是Event.wait(),事件的触发是Event.set()。需要注意的是,set之后再调用wait就不会阻塞,需要先调用Event.clear()重置事件,wait才会重新阻塞。要把等待、触发的逻辑弄清楚。
这节还引入了守护线程(setDaemon)的概念:当其值为True时,其他(非守护)线程都结束后,守护线程自身也会随之结束。
#coding:utf8class DownloadThread(Thread):
****
class ConverThread(Thread):
def __init__(self,queue,cEvent,tEvent):
Thread.__init__(self)
self.queue = queue
self.cEvent = cEvent
self.tEvent = tEvent
def csvToxml(self,rf,wf):
****
def run(self):
count = 0
while True:
sid,data = self.queue.get()
print 'Convert', sid
if sid == -1:
self.cEvent.set()
self.tEvent.wait()
break
if data:
fname = str(sid).rjust(6,'0')+'.xml'
with open(fname,'wb') as wf:
self.csvToxml(data, wf)
count += 1
if count == 5: #注意这里的逻辑
self.cEvent.set()#激活cEvent,表示转换完成
self.tEvent.wait()#等待tEvent事件完成
self.tEvent.clear()#重新激活tEevent
count = 0
import tarfile
import os
class TarThread(Thread):
    """Daemon thread that packs converted XML files into numbered archives.

    Each time *cEvent* is set, the thread archives the ``.xml`` files found
    in the current directory, clears *cEvent* and then sets *tEvent* to
    report that packing is finished.
    """

    def __init__(self, cEvent, tEvent):
        Thread.__init__(self)
        # Number of tarXML() calls so far; used to name the archives.
        self.count = 0
        self.cEvent = cEvent
        self.tEvent = tEvent
        # Daemon thread: it exits once the other threads have exited.
        self.setDaemon(True)

    def tarXML(self):
        """Move every ``.xml`` file in the current directory into ``<n>.tgz``.

        ``n`` is the running call counter. Files are deleted once they have
        been added. If no file was packed, the empty archive is removed
        again (the counter is still advanced).
        """
        self.count += 1
        tfname = '%d.tgz' % self.count
        packed = 0
        # The context manager closes the archive even when add() or
        # remove() raises, so the file handle is never leaked.
        with tarfile.open(tfname, 'w:gz') as tf:
            for fname in os.listdir('.'):
                if fname.endswith('.xml'):
                    tf.add(fname)
                    os.remove(fname)
                    packed += 1
        if not packed:
            os.remove(tfname)

    def run(self):
        """Serve packing requests forever (the thread is a daemon)."""
        while True:
            self.cEvent.wait()
            self.tarXML()
            # Clear first so that the next wait() blocks again, then
            # report that packing is complete.
            self.cEvent.clear()
            self.tEvent.set()
if __name__ == '__main__':
q = Queue()
dThreads =[DownloadThread(i,q) for i in xrange(1,11)]
cEvent = Event()
tEvent = Event()
cThread = ConverThread(q,cEvent,tEvent)
tThread = TarThread(cEvent,tEvent)
tThread.start()#注意这里要start线程
for t in dThreads:
t.start()
cThread.start()
for t in dThreads:
t.join()
q.put((-1,None))
print 'main thread'
本地线程这一章开始之后都是用了python3,我暂时还是想用python2来实现,所以先放一下,以后再回来补充。
线程池:pass
多进程:pass
:pass