1. 准备工作:分词和清洗

    1. import nltk  
    2. from nltk.corpus import stopwords  
    3. from nltk.corpus import brown  
    4. import numpy as np  
    6. #分词  
    7. text = "Sentiment analysis is a challenging subject in machine learning. \  
    8. People express their emotions in language that is often obscured by sarcasm, \  
    9. ambiguity, and plays on words, all of which could be very misleading for \  
    10. both humans and computers.".lower()  
    11. text_list = nltk.word_tokenize(text)  
    12. #去掉标点符号  
    13. english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']  
    14. text_list = [word for word in text_list if word not in english_punctuations]  
    15. #去掉停用词  
    16. stops = set(stopwords.words("english"))  
    17. text_list = [word for word in text_list if word not in stops]

    2.  使用词性标注器:处理一个词序列,为每个词附加一个词性标记

    1. nltk.pos_tag(text_list)  
    2. Out[81]:   
    3. [('sentiment', 'NN'),  
    4. ('analysis', 'NN'),  
    5. ('challenging', 'VBG'),  
    6. ('subject', 'JJ'),  
    7. ('machine', 'NN'),  
    8. ('learning', 'VBG'),  
    9. ('people', 'NNS'),  
    10. ('express', 'JJ'),  
    11. ('emotions', 'NNS'),  
    12. ('language', 'NN'),  
    13. ('often', 'RB'),  
    14. ('obscured', 'VBD'),  
    15. ('sarcasm', 'JJ'),  
    16. ('ambiguity', 'NN'),  
    17. ('plays', 'NNS'),  
    18. ('words', 'NNS'),  
    19. ('could', 'MD'),  
    20. ('misleading', 'VB'),  
    21. ('humans', 'NNS'),  
    22. ('computers', 'NNS')]

    3. 读取已标注的语料库:NLTK中包括的若干语料库已经标注了词性

    1. brown_taged= nltk.corpus.brown.tagged_words()

    4.  自动标注

    1. brown_tagged_sents = brown.tagged_sents(categories='news')  
    2. brown_sents = brown.sents(categories='news')  
    3. #默认标注  
    4. tags = [tag for (word,tag) in brown.tagged_words(categories='news')]  
    5. print(nltk.FreqDist(tags).max())  
    7. NN
    1. raw = 'I do not like green eggs and ham, I do not like them Sam I am!'  
    2. tokens = nltk.word_tokenize(raw)  
    3. default_tagger = nltk.DefaultTagger('NN')  
    4. print(default_tagger.tag(tokens))  
    5. print(default_tagger.evaluate(brown_tagged_sents))  
    7. [('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('green', 'NN'), ('eggs', 'NN'), ('and', 'NN'), ('ham', 'NN'), (',', 'NN'), ('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ('them', 'NN'), ('Sam', 'NN'), ('I', 'NN'), ('am', 'NN'), ('!', 'NN')]  
    8. 0.13089484257215028
    1. #正则表达式标注器  
    2. patterns= [(r'.*ing$','VBG'),(r'.*ed$','VBD'),(r'.*es$','VBZ'),(r'.*ould$','MD'),\  
    3. (r'.*\'s$','NN$'),(r'.*s$','NNS'),(r'^-?[0-9]+(.[0-9]+)?$','CD'),(r'.*','NN')]  
    4. regexp_tagger = nltk.RegexpTagger(patterns)  
    5. regexp_tagger.tag(brown_sents[3])  
    6. print(regexp_tagger.evaluate(brown_tagged_sents))  
    8. 0.20326391789486245

    1. #查询标注器:找出100个最频繁的词,存储它们最有可能的标记。然后可以使用这个信息作为  
    2. #"查询标注器"(NLTK UnigramTagger)的模型  
    3. fd = nltk.FreqDist(brown.words(categories='news'))  
    4. cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))  
    5. most_freq_words = list(fd.keys())[:100]  
    6. likely_tags = dict((word,cfd[word].max()) for word in most_freq_words)  
    7. # baseline_tagger = nltk.UnigramTagger(model=likely_tags)  
    8. #许多词都被分配了None标签,因为它们不在100个最频繁的词中,可以使用backoff参数设置这些词的默认词性  
    9. baseline_tagger = nltk.UnigramTagger(model=likely_tags,backoff=nltk.DefaultTagger('NN'))  
    10. print(baseline_tagger.evaluate(brown_tagged_sents))  
    11. 0.46063806511923944

    5.  N-gram 标注


      1. In[87]: unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)     #训练一个一元标注器  
      2. print(unigram_tagger.tag(brown_sents[2007]))  
      3. unigram_tagger.evaluate((brown_tagged_sents))  
      4. [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ('apartments', 'NNS'), ('are', 'BER'), ('of', 'IN'), ('the', 'AT'), ('terrace', 'NN'), ('type', 'NN'), (',', ','), ('being', 'BEG'), ('on', 'IN'), ('the', 'AT'), ('ground', 'NN'), ('floor', 'NN'), ('so', 'QL'), ('that', 'CS'), ('entrance', 'NN'), ('is', 'BEZ'), ('direct', 'JJ'), ('.', '.')]  
      5. Out[87]: 0.9349006503968017

      1. #分离训练集和测试集  
      2. size = int(len(brown_tagged_sents)*0.9)  
      3. train_sents = brown_tagged_sents[:size]  
      4. test_sents = brown_tagged_sents[size:]  
      5. unigram_tagger = nltk.UnigramTagger(train_sents)  
      6. unigram_tagger.evaluate(test_sents)  
      7. Out[89]: 0.8121200039868434



      1. bigram_tagger = nltk.BigramTagger(train_sents)  
      2. bigram_tagger.tag(brown_sents[2007])  
      3. bigram_tagger.evaluate(test_sents)  
      4. Out[90]: 0.10206319146815508




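          组合标注器:bigram标注器遇到训练中未见过的上下文时无法给出标记,所以单独使用时准确率很低;可以用回退(backoff)把多个标注器组合起来:

          1. 尝试使用bigram标注器标注标识符

          2. 如果bigram无法找到标记,尝试unigram标注器

          3. 如果unigram标注器也无法找到标记,使用默认标注器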

      1. t0 = nltk.DefaultTagger('NN')  
      2. t1 = nltk.UnigramTagger(train_sents,backoff=t0)  
      3. t2 = nltk.BigramTagger(train_sents,backoff=t1)  
      4. t2.evaluate(test_sents)  
      5. Out[92]: 0.8452108043456593
      1. t3 = nltk.BigramTagger(train_sents,cutoff=2,backoff=t1)  
      2. t3.evaluate(test_sents)  
      3. Out[95]: 0.8424200139539519



      1. In[101]: #保存标注器  
      2. from pickle import dump  
      3. output = open('t2.pkl','wb')  
      4. dump(t2,output,-1)  
      5. output.close()  
      6. #加载标注器  
      7. from pickle import load  
      8. input = open('t2.pkl','rb')  
      9. tagger = load(input)  
      10. input.close()  
      11. #使用标注器  
      12. text = "Sentiment analysis is a challenging subject in machine learning."  
      13. tokens = text.split()  
      14. tagger.tag(tokens)  
      15. Out[101]:   
      16. [('Sentiment', 'NN'),  
      17. ('analysis', 'NN'),  
      18. ('is', 'BEZ'),  
      19. ('a', 'AT'),  
      20. ('challenging', 'JJ'),  
      21. ('subject', 'NN'),  
      22. ('in', 'IN'),  
      23. ('machine', 'NN'),  
      24. ('learning.', 'NN')]