def __init__(self, modelPath, _size=100, _window=5, _minCount=1, _workers=multiprocessing.cpu_count()):
    """Remember where the model is stored and the Word2Vec training settings.

    modelPath -- path kept on the instance as ``self.modelPath``.
    _size     -- forwarded to Word2Vec as ``size``.
    _window   -- forwarded to Word2Vec as ``window``.
    _minCount -- forwarded to Word2Vec as ``min_count``.
    _workers  -- forwarded to Word2Vec as ``workers``; the default is
                 ``multiprocessing.cpu_count()``, evaluated once when the
                 method is defined.
    """
    self.modelPath = modelPath
    self._size = _size
    self._window = _window
    self._minCount = _minCount
    self._workers = _workers




def initTrainWord2VecModel(self, corpusFilePath, safe_model=False):
    """Initialise and train a new w2v model, then save it to ``self.modelPath``.

    corpusFilePath can be an already opened file object, the path of a single
    corpus file, or the path of a directory of corpus files (passing sentences
    directly is still a TODO).

    About safe_model (only consulted for directory input):
        truthy -- train on the first file, then refresh the model file by
                  file; keeps memory usage safe but is slower.
        falsy  -- load all corpus lines into one sentences list and train
                  them in one pass.

    Returns the trained model, or None when the corpus could not be loaded or
    nothing was trained.
    """
    # NOTE(review): several statements were truncated in the published listing
    # and have been rebuilt from the surviving fragments -- confirm against the
    # original source.
    extraSegOpt().reLoadEncoding()

    fileType = localFileOptUnit.checkFileState(corpusFilePath)
    if fileType == u'error':
        print('load file error!')
        return None

    # Same hyper-parameters for every Word2Vec construction below.
    w2vParams = dict(size=self._size, window=self._window,
                     min_count=self._minCount, workers=self._workers)

    model = None
    if fileType == u'opened':
        print('training model from singleFile!')
        model = Word2Vec(LineSentence(corpusFilePath), **w2vParams)
    elif fileType == u'file':
        # The constructor consumes the corpus, so the handle can be closed
        # as soon as it returns.
        with open(corpusFilePath, 'r') as corpusFile:
            print('training model from singleFile!')
            model = Word2Vec(LineSentence(corpusFile), **w2vParams)
    elif fileType == u'directory':
        corpusFiles = localFileOptUnit.listAllFileInDirectory(corpusFilePath)
        print('training model from listFiles of directory!')
        if safe_model:
            model = Word2Vec(LineSentence(corpusFiles[0]), **w2vParams)
            for corpusSingleFile in corpusFiles[1:]:
                self.updateW2VModelUnit(model, corpusSingleFile)
        else:
            sentences = self.loadSetencesFromFiles(corpusFiles)
            model = Word2Vec(sentences, **w2vParams)
    elif fileType == u'other':
        # TODO add sentences list directly
        pass

    if model is None:
        # Nothing was trained (e.g. the 'other' branch); there is no model
        # to save.
        return None

    model.save(self.modelPath)
    model.init_sims()
    print('producing word2vec model ... ok!')
    return model

首先是一些准备工作:判断输入文件路径的访问结果类型,并根据不同的类型采用不同的文件处理方式,这部分逻辑大家应该都能看懂。以 corpusFilePath 为一个已经打开的 file 对象为例,创建 word2vec model 的代码为:

model = Word2Vec(LineSentence(corpusFilePath), size=self._size, window=self._window, min_count=self._minCount, workers=self._workers)





def updateW2VModelUnit(self, model, corpusSingleFilePath):
    """Continue training ``model`` on one more corpus file.

    corpusSingleFilePath can only be a single file: either an already opened
    file object or the path of one file. A directory is rejected with a
    message. Returns the same ``model`` object in every case.
    """
    # NOTE(review): the print/open statements and the return value were
    # truncated in the published listing and have been rebuilt -- confirm.
    # NOTE(review): newer gensim releases require extra arguments
    # (total_examples / epochs) for train() -- confirm against the installed
    # version.
    fileType = localFileOptUnit.checkFileState(corpusSingleFilePath)
    if fileType == u'directory':
        print('can not deal a directory!')
        return model

    if fileType == u'opened':
        trainedWordCount = model.train(LineSentence(corpusSingleFilePath))
        print('update model, update words num is: ' + str(trainedWordCount))
    elif fileType == u'file':
        # Close the handle once this file has been trained on.
        with open(corpusSingleFilePath, 'r') as corpusSingleFile:
            trainedWordCount = model.train(LineSentence(corpusSingleFile))
        print('update model, update words num is: ' + str(trainedWordCount))
    else:
        # TODO add sentences list directly (same as last function)
        pass
    return model



当你确定 model 已经训练完成、不会再更新的时候,可以对 model 进行锁定。据说这样会预载相似度矩阵,能够提高后面的查询速度;但是你的 model 从此以后就是只读(read only)的了,无法再继续训练。

def finishTrainModel(self, modelFilePath=None):
    """Load a trained model and lock it for querying.

    warning: after this, the model is read-only (can't be updated).

    modelFilePath -- model file to load; defaults to ``self.modelPath``.

    Returns the locked model so the caller can actually use it.
    """
    # NOTE(review): the assignments were truncated in the published listing
    # and have been rebuilt from the surviving fragments -- confirm.
    if modelFilePath is None:
        modelFilePath = self.modelPath
    model = self.loadModelfromFile(modelFilePath)
    model.init_sims(replace=True)
    return model



def getWordVec(self, model, wordStr):
    """Get the word's vector from the w2v model.

    wordStr may be a UTF-8 encoded byte string or already-decoded text.
    Returns whatever ``model[word]`` yields for that word.
    """
    # NOTE(review): the return expression was lost in the published listing;
    # a plain model lookup is assumed -- confirm.
    # Decode only byte strings, so already-decoded text is passed through
    # untouched.
    if isinstance(wordStr, bytes):
        wordStr = wordStr.decode('utf-8')
    return model[wordStr]

def queryMostSimilarWordVec(self, model, wordStr, topN=20):
    """Most-similar-words basic query function.

    wordStr may be a UTF-8 encoded byte string or already-decoded text.
    Returns the result of ``model.most_similar`` limited to ``topN`` entries
    (per the original note: a 2-dim list, [0] is the word, [1] is the score).
    """
    # NOTE(review): the head of the call was truncated in the published
    # listing and has been rebuilt -- confirm.
    # Decode only byte strings, so already-decoded text is passed through
    # untouched.
    if isinstance(wordStr, bytes):
        wordStr = wordStr.decode('utf-8')
    similarPairList = model.most_similar(wordStr, topn=topN)
    return similarPairList

def culSimBtwWordVecs(self, model, wordStr1, wordStr2):
    """Two-words similarity basic query function.

    Each word may be a UTF-8 encoded byte string or already-decoded text.
    Returns the value of ``model.similarity`` for the two words.
    """
    # NOTE(review): the listing repeated this definition four times and
    # truncated the head of the call; it is kept once and rebuilt -- confirm.
    # Decode only byte strings, so already-decoded text is passed through
    # untouched.
    if isinstance(wordStr1, bytes):
        wordStr1 = wordStr1.decode('utf-8')
    if isinstance(wordStr2, bytes):
        wordStr2 = wordStr2.decode('utf-8')
    similarValue = model.similarity(wordStr1, wordStr2)
    return similarValue




def queryMSimilarVecswithPosNeg(self, model, posWordStrList, negWordStrList, topN=20):
    """Pos-neg most-similar-words basic query function.

    posWordStrList / negWordStrList hold the positive and negative words;
    each word may be a UTF-8 encoded byte string or already-decoded text.
    Returns the result of ``model.most_similar`` limited to ``topN`` entries
    (per the original note: a 2-dim list, [0] is the word, [1] is the score).
    """
    # NOTE(review): the loop bodies were truncated in the published listing
    # and have been rebuilt -- confirm.
    def _toText(wordStr):
        # Decode only byte strings; text that is already decoded (e.g. words
        # prepared by a caller) is left untouched.
        return wordStr.decode('utf-8') if isinstance(wordStr, bytes) else wordStr

    posWordList = [_toText(wordStr) for wordStr in posWordStrList]
    negWordList = [_toText(wordStr) for wordStr in negWordStrList]
    pnSimilarPairList = model.most_similar(positive=posWordList, negative=negWordList, topn=topN)
    return pnSimilarPairList


def copeMSimilarVecsbtwWordLists(self, model, wordStrList1, wordStrList2, topN_rev=20, topN=20):
    """Rank word-vector results for two word lists, from source to target.

    Uses word vectors to express the relationship between the src-wordList
    (wordStrList1) and the tag-wordList (wordStrList2):
        first, the tag-wordList is used as the negative list to obtain the
        rev-wordList; then the src-wordList and the rev-wordList are used as
        the new positive/negative pair.
    topN_rev is the topN of the rev-wordList query and topN is the topN of
    the final relationship query.

    Returns the result of the second ``queryMSimilarVecswithPosNeg`` call.
    """
    # NOTE(review): the comprehensions and assignments were truncated in the
    # published listing and have been rebuilt -- confirm.
    def _toText(wordStr):
        # Decode only byte strings: words coming back from the model may
        # already be decoded text, and must not be decoded a second time.
        return wordStr.decode('utf-8') if isinstance(wordStr, bytes) else wordStr

    srcWordList = [_toText(wordStr) for wordStr in wordStrList1]
    tagWordList = [_toText(wordStr) for wordStr in wordStrList2]

    revSimilarPairList = self.queryMSimilarVecswithPosNeg(model, [], tagWordList, topN_rev)
    revWordList = [_toText(pair[0]) for pair in revSimilarPairList]
    stSimilarPairList = self.queryMSimilarVecswithPosNeg(model, srcWordList, revWordList, topN)
    return stSimilarPairList







