1. Regular Expressions

Regular expressions play an important role in text processing.
1. re.match()
Matches from the beginning of the string. On success it returns a match object whose .group() gives the matched text; on failure it returns None.

import re
print(re.match(r'a','abc123').group()) #a
print(re.match(r'A','abc123',re.I).group())#a, re.I makes the match case-insensitive
print(re.match(r'Ab','abc123',re.I).group())#ab
print(re.match('12','ab123'))#None
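
Because a failed match returns None, chaining .group() directly onto re.match() raises an AttributeError when nothing matches. A minimal sketch of the safer pattern (hypothetical example values):

import re
m=re.match(r'12','ab123')   #no match at the start of the string, so m is None
if m:
    print(m.group())
else:
    print('no match')       #this branch runs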

2. re.search()
Scans the string from left to right and returns the first match; returns None if nothing matches.

print(re.search(r'a','abc123').group())#a
print(re.search(r'B','abc123',re.I).group())#b
print(re.search(r'Bc','abc123',re.I).group())#bc
print(re.search('12','ab123').group())#12
print(re.search('12','ab123').span())#(2, 4), span() returns the index range of the matched substring

3. re.findall()
Scans the string from left to right and returns a list of all matched substrings.

print(re.findall(r'\d+','12abjh46hjk698bg7ghj8'))#['12', '46', '698', '7', '8']
print(re.findall(r'[a-zA-Z]+','my beautiful girl .'))#['my', 'beautiful', 'girl']
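
Note that when the pattern contains capturing groups, re.findall() returns the captured groups rather than the whole match. A small illustration with made-up strings:

print(re.findall(r'(\d+)kg','12kg 30kg 5g'))#['12', '30'], one group -> a list of that group
print(re.findall(r'(\d+)(kg|g)','12kg 30kg 5g'))#[('12', 'kg'), ('30', 'kg'), ('5', 'g')], several groups -> a list of tuples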

4. re.finditer()
Much like re.findall(), except that it returns an iterator of match objects instead of a list.

mymatch=re.finditer(r'\d+','12abjh46hjk698bg7ghj8')
for match in mymatch:
    print(match.group())
# Output:
12
46
698
7
8
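
Since each element yielded by finditer() is a match object, the position of every hit is also available, for example:

for match in re.finditer(r'\d+','12abjh46hjk698bg7ghj8'):
    print(match.group(),match.span())#e.g. 12 (0, 2), then 46 (6, 8), and so on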

5. re.split()
Splits a string into a list according to a regular expression.

print(re.split(r'\s+','this is a     dog'))
# Output:
['this', 'is', 'a', 'dog']
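
Wrapping the pattern in a capturing group keeps the delimiters in the result, and the maxsplit argument limits the number of splits. A quick sketch:

print(re.split(r'(\s+)','this is a     dog'))#['this', ' ', 'is', ' ', 'a', '     ', 'dog']
print(re.split(r'\s+','this is a dog',maxsplit=2))#['this', 'is', 'a dog']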

6. re.sub()
Replaces every part of the string matched by the regular expression.

print(re.sub('a','b','abcabc'))
print(re.sub(r'\s+','.','this is a dog'))
# Output:
bbcbbc
this.is.a.dog
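
re.sub() also accepts a count argument to limit the number of replacements, and re.subn() additionally reports how many replacements were made:

print(re.sub('a','b','abcabc',count=1))#bbcabc, only the first 'a' is replaced
print(re.subn(r'\s+','.','this is a dog'))#('this.is.a.dog', 3)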

7. About group()

line='Cats are smarter than dogs'
matchObj=re.match(r'(.*) are (.*?) .*',line,re.M|re.I)
if matchObj:
    print(matchObj.group())
    print(matchObj.group(1))
    print(matchObj.group(2))
    print(matchObj.groups())
else:
    print('no match')
# Output:
Cats are smarter than dogs
Cats
smarter
('Cats', 'smarter')


#groups can also be named
s='1102231990xxxxxxxx'
res=re.search(r'(?P<province>\d{3})(?P<city>\d{3})(?P<born_year>\d{4})',s)
print(res.groupdict())#groupdict() returns the named groups as a dictionary
# Output:
{'province': '110', 'city': '223', 'born_year': '1990'}
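
Named groups can also be read one by one with group('name'), and positional access still works:

print(res.group('province'))#110
print(res.group('city'))#223
print(res.group(3))#1990, the third group by position is born_year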

2. Chinese and English Tokenization and POS Tagging

1. English tokenization and POS tagging

import nltk
from nltk.corpus import stopwords
def english_label(text):
    #first convert all letters to lowercase
    text=text.lower()
    #tokenize
    text_list=nltk.word_tokenize(text)
    #remove punctuation
    english_punctuations=[',','.',':',';','?','(',')','[',']','&','!','*','@','#','$','%']
    text_list=[word for word in text_list if word not in english_punctuations]
    #remove stopwords
    stops=set(stopwords.words('english'))
    text_list=[word for word in text_list if word not in stops]
    #POS tagging
    poslist=nltk.pos_tag(text_list)
    return poslist

#quick test
mytext='This is a dog. That is a cat. I love them very much.'
print(english_label(mytext))    

# Output:
[('dog', 'NN'), ('cat', 'NN'), ('love', 'VBP'), ('much', 'RB')]
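
If NLTK has never been used on this machine, the models behind word_tokenize, stopwords and pos_tag have to be downloaded once first (a one-time setup; resource names may differ slightly between NLTK versions):

import nltk
nltk.download('punkt')                      #tokenizer models used by word_tokenize
nltk.download('stopwords')                  #the English stopword list
nltk.download('averaged_perceptron_tagger') #the tagger behind nltk.pos_tag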

2. Chinese tokenization and POS tagging

import jieba
import jieba.posseg as pseg
def chinese_label(text,stops_path):
    #text is the Chinese text to process
    #stops_path is the path to the stopword list
    #first segment the Chinese text
    text_list=jieba.lcut(text)
    #then remove stopwords
    #load the stopword list and strip each line
    with open(stops_path,encoding='utf-8') as fp:
        stopwords=fp.readlines()
    stopwords=[line.strip() for line in stopwords]
    text_list=[word for word in text_list if word not in stopwords]
    #join the remaining words back into a single string
    text_str=''.join(text_list)
    #then do POS tagging
    text_pos=pseg.cut(text_str)
    return text_pos

#quick test
mytext='我爱北京天安门,天安门上太阳升'
path=r'D:\python\1python\stopwords\stopwords-zh-master\stopwords-zh.txt'#path to your stopword list
result=chinese_label(mytext,path)
for a in result:
    print(a)

# Output:
爱/v
北京/ns
天安门/ns
天安门/ns
太阳升/nr
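
pseg.cut() returns a generator of pair objects; each pair exposes .word and .flag, so the result can easily be collected into a list of (word, tag) tuples:

result=chinese_label(mytext,path)
labeled=[(pair.word,pair.flag) for pair in result]
print(labeled)#e.g. [('爱', 'v'), ('北京', 'ns'), ('天安门', 'ns'), ...]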

3. Named Entity Recognition (NER)

1. English NER

#English NER also uses nltk
import nltk
from nltk.corpus import stopwords
#define a function for named entity recognition

#for NER, do not lowercase the text; removing stopwords is probably unnecessary as well
def english_ner(text):
    #first tokenize
    text_list=nltk.word_tokenize(text)
    #remove punctuation
    english_punctuations=[',','.',':',';','?','(',')','[',']','&','!','*','@','#','$','%']
    text_list=[word for word in text_list if word not in english_punctuations]
    #remove stopwords (left commented out here)
    #stops=set(stopwords.words('english'))
    #text_list=[word for word in text_list if word not in stops]
    #then POS tagging
    text_pos=nltk.pos_tag(text_list)
    #then run named entity recognition
    text_entities=nltk.chunk.ne_chunk(text_pos)
    return text_entities

#quick test
texts='This is a dog. That is a cat. I love them very much. Beijing is a beautiful city. London is a beautiful girl. I am very excited about the next generation of Apple products. I bought these Apple products today. His name is Jack'

#split into sentences
mytext=nltk.sent_tokenize(texts)
print(mytext)
#then tokenize, POS-tag and run NER on each sentence
for text in mytext:
    print(english_ner(text))    


# Output:
['This is a dog.', 'That is a cat.', 'I love them very much.', 'Beijing is a beautiful city.', 'London is a beautiful girl.', 'I am very excited about the next generation of Apple products.', 'I bought these Apple products today.', 'His name is Jack']
(S This/DT is/VBZ a/DT dog/NN)
(S That/DT is/VBZ a/DT cat/NN)
(S I/PRP love/VBP them/PRP very/RB much/RB)
(S (GPE Beijing/NNP) is/VBZ a/DT beautiful/JJ city/NN)
(S (GPE London/NNP) is/VBZ a/DT beautiful/JJ girl/NN)
(S
  I/PRP
  am/VBP
  very/RB
  excited/JJ
  about/IN
  the/DT
  next/JJ
  generation/NN
  of/IN
  (GPE Apple/NNP)
  products/NNS)
(S I/PRP bought/VBD these/DT Apple/NNP products/NNS today/NN)
(S His/PRP$ name/NN is/VBZ (PERSON Jack/NNP))
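
To pull just the entities out of the tree returned by ne_chunk(), walk its subtrees and keep the labelled chunks (ne_chunk additionally needs the 'maxent_ne_chunker' and 'words' NLTK resources downloaded once). A minimal sketch, with extract_entities as a hypothetical helper name:

def extract_entities(tree):
    #keep only subtrees carrying an entity label such as GPE or PERSON
    entities=[]
    for subtree in tree.subtrees():
        if subtree.label()!='S':
            entities.append((subtree.label(),' '.join(word for word,tag in subtree.leaves())))
    return entities

for text in mytext:
    print(extract_entities(english_ner(text)))#e.g. [('GPE', 'Beijing')], [('PERSON', 'Jack')], ...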

2. Chinese NER

#Chinese named entity recognition
#Baidu's open-source LAC project can be used for Chinese NER
from LAC import LAC
import re
import jieba
import jieba.posseg as pseg
#define a function for Chinese NER; this one takes a list of sentences, whereas the functions above take a single sentence
def chinese_ner(texts):
    #load the LAC model
    lac=LAC(mode='lac')#mode='seg' would do segmentation only
    lac_result=lac.run(texts)
    return lac_result

#quick test
corpus='''我爱北京天安门。天安门上太阳升。朱一龙是个著名的青年男演员!百度是一家大公司。
华北制药集团有限责任公司工资高吗'''
#split the corpus into a list of sentences using a regular expression
delimiter=r'[。?;!]'
texts=re.split(delimiter,corpus)
texts=[sent for sent in texts if sent]
print(texts)
#then call the function above to run NER
print(chinese_ner(texts))    

# Output:
['我爱北京天安门', '天安门上太阳升', '朱一龙是个著名的青年男演员', '百度是一家大公司', '\n华北制药集团有限责任公司工资高吗']
[[['我', '爱', '北京', '天安门'], ['r', 'v', 'LOC', 'LOC']], [['天安门', '上', '太阳', '升'], ['LOC', 'f', 'n', 'v']], [['朱一龙', '是', '个', '著名', '的', '青年', '男演员'], ['PER', 'v', 'q', 'a', 'u', 'n', 'n']], [['百度', '是', '一家', '大公司'], ['ORG', 'v', 'm', 'n']], [['\n华北制药集团有限责任公司', '工资', '高', '吗'], ['ORG', 'n', 'a', 'xc']]]
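
Each element of the LAC result is a [words, tags] pair, so the words can be zipped with their tags and filtered down to just the entity labels (PER/LOC/ORG/TIME); entity_tags here is an illustrative variable:

entity_tags={'PER','LOC','ORG','TIME'}
for words,tags in chinese_ner(texts):
    entities=[(w,t) for w,t in zip(words,tags) if t in entity_tags]
    print(entities)#e.g. [('北京', 'LOC'), ('天安门', 'LOC')], ..., [('朱一龙', 'PER')], ...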

4. Syntactic Parsing

1. Chinese syntactic parsing

#syntactic parsing uses the Stanford CoreNLP package
#Chinese first
from stanfordcorenlp import StanfordCoreNLP
nlp=StanfordCoreNLP(r'D:\wy\斯坦福句法分析\stanford-corenlp-full-2016-10-31',lang='zh')#lang='zh' for Chinese, lang='en' for English
sentence='清华大学位于北京。'
#word_tokenize and pos_tag do not work properly for Chinese here
#tokenization
#print(nlp.word_tokenize(sentence))
#POS tagging
#print(nlp.pos_tag(sentence))
#parsing
print(nlp.parse(sentence))#parse() gives the constituency (phrase-structure) parse
print(nlp.dependency_parse(sentence))#dependency_parse() gives the dependency parse


# Output:
(ROOT
  (IP
    (NP (NR 清华) (NN 大学))
    (VP (VV 位于)
      (NP (NR 北京)))
    (PU 。)))
[('ROOT', 0, 3), ('compound:nn', 2, 1), ('nsubj', 3, 2), ('dobj', 3, 4), ('punct', 3, 5)]

2. English syntactic parsing

#now English; the steps are exactly the same as above, only the parameters change
from stanfordcorenlp import StanfordCoreNLP
nlp=StanfordCoreNLP(r'D:\wy\斯坦福句法分析\stanford-corenlp-full-2016-10-31',lang='en')
sentence='The dog is eating a meat on the desk.'
#tokenization
print(nlp.word_tokenize(sentence))
#POS tagging
print(nlp.pos_tag(sentence))
#parsing
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))


# Output:
['The', 'dog', 'is', 'eating', 'a', 'meat', 'on', 'the', 'desk', '.']
[('The', 'DT'), ('dog', 'NN'), ('is', 'VBZ'), ('eating', 'VBG'), ('a', 'DT'), ('meat', 'NN'), ('on', 'IN'), ('the', 'DT'), ('desk', 'NN'), ('.', '.')]
(ROOT
  (S
    (NP (DT The) (NN dog))
    (VP (VBZ is)
      (VP (VBG eating)
        (NP
          (NP (DT a) (NN meat))
          (PP (IN on)
            (NP (DT the) (NN desk))))))
    (. .)))
[('ROOT', 0, 4), ('det', 2, 1), ('nsubj', 4, 2), ('aux', 4, 3), ('det', 6, 5), ('dobj', 4, 6), ('case', 9, 7), ('det', 9, 8), ('nmod', 4, 9), ('punct', 4, 10)]
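
The dependency triples use 1-based token indices, with 0 standing for ROOT, so they can be made human-readable by pairing them with the token list; the wrapper also starts a CoreNLP server in the background, so it is good practice to close it when done. A small sketch under those assumptions:

tokens=nlp.word_tokenize(sentence)
for relation,head,dependent in nlp.dependency_parse(sentence):
    head_word='ROOT' if head==0 else tokens[head-1]
    print(relation,head_word,'->',tokens[dependent-1])#e.g. nsubj eating -> dog

nlp.close()#shut down the background CoreNLP server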