Copyright notice: Author: 金良山庄. To get in touch, please leave a comment on the blog or send a private message. Homepage: http://www.jinliangxu.com/


Author: 金良

Log-domain operation functions

import numpy as np

class Logspace:
    def __init__(self):
        # LOGZERO (NaN) represents log(0) in the extended log domain
        self.LOGZERO = np.nan
    def eexp(self, x):
        # extended exp: exp(LOGZERO) = 0
        if np.isnan(x):
            return 0
        else:
            return np.exp(x)
    def eln(self, x):
        # extended log: ln(0) = LOGZERO; negative input is an error
        if x == 0:
            return self.LOGZERO
        elif x > 0:
            return np.log(x)
        else:
            print('Wrong!!!\n\t negative input error')
            return np.nan
    def elnsum(self, elnx, elny):
        # log-domain addition: returns eln(x + y) given eln(x) and eln(y)
        if np.isnan(elnx):
            return elny
        elif np.isnan(elny):
            return elnx
        elif elnx > elny:
            return elnx + self.eln(1 + np.exp(elny - elnx))
        else:
            return elny + self.eln(1 + np.exp(elnx - elny))
    def elnproduct(self, elnx, elny):
        # log-domain multiplication: returns eln(x * y) given eln(x) and eln(y)
        if np.isnan(elnx) or np.isnan(elny):
            return self.LOGZERO
        else:
            return elnx + elny
    def elnmatprod(self, elnx, elny):
        # log-domain matrix product
        # np.size(np.shape(np.array([[0.]]))) is 2, so xsize/ysize distinguish
        # 1-D vectors (size 1) from 2-D matrices (size 2)
        xsize = np.size(np.shape(elnx))
        ysize = np.size(np.shape(elny))

        if xsize == 1 and ysize == 1:        # vector . vector
            r = self.LOGZERO
            for i in range(np.shape(elnx)[0]):
                r = self.elnsum(r, self.elnproduct(elnx[i], elny[i]))
            return r
        elif xsize == 1 and not ysize == 1:  # vector . matrix
            n = np.shape(elny)[1]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx, elny[:, i])
            return r
        elif not xsize == 1 and ysize == 1:  # matrix . vector
            n = np.shape(elnx)[0]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx[i, :], elny)
            return r
        else:                                # matrix . matrix
            m, n = np.shape(elnx)
            p = np.shape(elny)[1]
            r = np.zeros((m, p))
            for i in range(m):
                for j in range(p):
                    r[i][j] = self.elnmatprod(elnx[i, :], elny[:, j])
            return r
    def eexpmat(self, elny):
        # elementwise extended exp of a vector or matrix
        expy = np.copy(elny)
        if np.size(np.shape(elny)) == 1:
            for i in range(np.shape(elny)[0]):
                expy[i] = self.eexp(expy[i])
        else:
            for i in range(np.shape(elny)[0]):
                for j in range(np.shape(elny)[1]):
                    expy[i][j] = self.eexp(expy[i][j])
        return expy
    def elnmat(self, x):
        # elementwise extended log of a vector or matrix
        elnx = np.copy(x)
        if np.size(np.shape(x)) == 1:
            for i in range(np.shape(x)[0]):
                elnx[i] = self.eln(x[i])
        else:
            for i in range(np.shape(x)[0]):
                for j in range(np.shape(x)[1]):
                    elnx[i, j] = self.eln(x[i, j])
        return elnx
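
The class follows the extended-logarithm convention in which NaN stands for log 0, so zero probabilities survive the transform. The following minimal sanity check of the scalar operations is not part of the original post; it only assumes the class above has been defined:

logspace = Logspace()
print(logspace.eln(0))                  # nan, i.e. LOGZERO
print(logspace.eexp(logspace.eln(0)))   # 0: exp of LOGZERO recovers zero
# elnsum is a numerically stable log-sum-exp: recover 0.3 + 0.2 from the logs
print(logspace.eexp(logspace.elnsum(logspace.eln(0.3), logspace.eln(0.2))))      # ~0.5
# elnproduct adds logs, i.e. multiplies the underlying values
print(logspace.eexp(logspace.elnproduct(logspace.eln(0.3), logspace.eln(0.2))))  # ~0.06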

Test example

logspace = Logspace()
M1 = np.array([1, 0.5])
M2 = np.array([[1.3, 1.5], [1.8, 0.5]])
M3 = np.array([[0.8, 1.5], [1.8, 0.7]])
M4 = np.array([0, 0])

print(logspace.eexpmat(logspace.elnmatprod(M1, M2)))
print(np.dot(logspace.eexpmat(M1), logspace.eexpmat(M2)))

Output (both lines agree, confirming that the log-domain matrix product matches the ordinary product of the exponentiated matrices):

[ 19.94836491 14.90077579] 
[ 19.94836491 14.90077579]

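For reference, the identity behind this check (stated here in standard notation, not taken from the original post): writing eexpmat as elementwise exponentiation, elnmatprod computes

$$\big(\mathrm{elnmatprod}(A,B)\big)_{ij} \;=\; \ln\!\Big(\sum_k e^{A_{ik}+B_{kj}}\Big) \;=\; \Big(\ln\big(e^{A}\, e^{B}\big)\Big)_{ij},$$

so exponentiating the log-domain product recovers the ordinary matrix product of the exponentiated matrices, which is exactly what the two print statements compare.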

Conditional random field functions

from collections import defaultdict

def read_corps(corpsfile='testchunk.data'):
    # http://www.chokkan.org/software/crfsuite/tutorial.html
    #   (this page links to two URLs where the dataset can be downloaded; the dataset is large)
    # http://blog.dpdearing.com/2011/12/opennlp-part-of-speech-pos-tags-penn-english-treebank/
    tagids = defaultdict(lambda: len(tagids))
    tagids["<S>"] = 0

    corps = []
    onesentence = []
    words = ["<S>"]
    tags = [0]
    #wordnumcount = 0
    with open(corpsfile, 'r') as f:
        for line in f:
            if len(line) <= 1:
                pass
            elif line != '. . O\n':
                # '. . O\n' marks the end of a sentence; until then, collect each word line in onesentence
                onesentence.append(line)
            else:  # the sentence is complete: process every word that appeared in it and store the result in corps
                for texts in onesentence:
                    #wordnumcount += 1
                    w_t = texts.strip().split(" ")
                    #print(w_t)
                    try:
                        # strings representing numbers vary a lot; to reduce their interference,
                        # detect them and replace them with the placeholder '#CD#'
                        float(w_t[0].strip().replace(',', ''))
                        #print(w_t)
                        words.append('#CD#')
                    except:
                        words.append(w_t[0].lower())
                    #if w_t[1] in {'``', ',', "''", '$', '#', ')', '('}:
                    #    print(w_t)
                    tags.append(tagids[w_t[1]])
                words.append("<S>")  # words is the list of words making up one sentence
                tags.append(0)       # tags is the list of tags of that sentence, aligned one-to-one with words
                if np.shape(words)[0] > 2:  # skip empty sentences
                    corps.append((words, tags))

                # re-initialize onesentence, words and tags
                onesentence = []
                words = ["<S>"]
                tags = [0]
    #print('Total number of word tokens: ' + str(wordnumcount))
    # total number of word tokens: 40377
    return corps, tagids
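
# --- Illustration (not part of the original post) ---------------------------
# read_corps expects a whitespace-separated file in the style of the CoNLL-2000
# chunking data linked above, one token per line as "word POS chunk-tag", with
# a '. . O' line closing each sentence, e.g. (illustrative sample only):
#
#   Confidence NN B-NP
#   in IN B-PP
#   the DT B-NP
#   pound NN I-NP
#   . . O
#
# A sketch of how the returned values would be used (assumes 'testchunk.data'
# exists locally):
#
#   corps, tagids = read_corps('testchunk.data')
#   print(len(corps))        # number of sentences
#   print(corps[0][0][:5])   # first words of the first sentence, starting with '<S>'
#   print(dict(tagids))      # POS tag -> integer id, with '<S>' mapped to 0
# -----------------------------------------------------------------------------
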
def getfeatureTS(corps):
    featuresets = set()  # set of all features seen so far
    featureT = []  # list of transition features, e.g. ('T', 2, 3) means a transition from state 2 to state 3
    featureS = []  # list of state features, e.g. ('S', 'Confidence', 1)
    for corp in corps:
        for i in range(np.shape(corp[0])[0]):
            if corp[0][i] == '<S>':
                continue
            if ('S', corp[0][i], corp[1][i]) not in featuresets:
                featuresets.add(('S', corp[0][i], corp[1][i]))
                featureS.append(('S', corp[0][i], corp[1][i]))
            if corp[0][i-1] != '<S>':
                if ('T', corp[1][i-1], corp[1][i]) not in featuresets:
                    featuresets.add(('T', corp[1][i-1], corp[1][i]))
                    featureT.append(('T', corp[1][i-1], corp[1][i]))
    featureTS = featureT + featureS
    words2tagids = words2tagidfromfeatureS(featureS)
    return featureTS, words2tagids
def getpriorfeatureE(corps, featureTS):
    # empirical (prior) expectation of each feature
    N = np.shape(corps)[0]      # number of training sentences
    K = np.shape(featureTS)[0]  # number of features
    priorfeatureE = np.zeros(K)

    for corp in corps:
        for i in range(np.shape(corp[0])[0]):
            if corp[0][i] == '<S>':
                continue
            try:
                idex = featureTS.index(('S', corp[0][i], corp[1][i]))
                priorfeatureE[idex] += 1.0
            except:
                pass
            try:
                idex = featureTS.index(('T', corp[1][i-1], corp[1][i]))
                priorfeatureE[idex] += 1.0
            except:
                pass
    priorfeatureE /= N
    #plt.plot(priorfeatureE)
    # The plot of the prior expectations shows that, for both transition features (starting at x = 0)
    # and state features (starting at x = 318), features recorded earlier tend to have larger prior expectations.
    return priorfeatureE
def words2tagidfromfeatureS(featureS):
    # collect, for every word, the list of tag ids it appears with
    words2tagids = {}
    for feature in featureS:
        word = feature[1]
        state = feature[2]
        if word in words2tagids:
            words2tagids[word].append(state)
        else:
            words2tagids[word] = [state]

    # The (commented-out) lennums list gives the distribution of how many distinct tags a word can take:
    #lennums = [[lenlist.count(i) for i in range(1,max(lenlist)+1)]
    #           for lenlist in [[len(words2tagids[i]) for i in words2tagids]]][0]
    #lennums = [3760, 389, 32, 1]
    return words2tagids
def getpostfeatureE(weights, corps, featureTS, words2tagids):
    K = np.shape(featureTS)[0]   # number of features
    postfeatureE = np.zeros(K)   # model (posterior) expectation of each feature
    N = np.shape(corps)[0]
    for corpidx in range(N):
        corp = corps[corpidx][0][1:-1]

        lencorp = np.size(corp)  # length of the sentence, i.e. its number of words
        Mlist = {}
        Mlist['mat'] = [''] * (lencorp + 1)
        Mlist['dim'] = [words2tagids[corp[i]] for i in range(lencorp)]
        Mlist['len'] = [np.size(words2tagids[corp[i]]) for i in range(lencorp)]
        for i in range(lencorp + 1):
            if i == 0:  # first matrix: a row vector built from state features only
                d = Mlist['len'][0]
                Mlist['mat'][i] = np.zeros((1, d))
                for j in range(d):
                    Mlist['mat'][i][0, j] = weights[featureTS.index(('S', corp[0], words2tagids[corp[0]][j]))]
                continue
            if i == lencorp:  # last matrix: a column vector of zeros
                Mlist['mat'][i] = np.zeros((Mlist['len'][-1], 1))
                continue
            # neither the first nor the last matrix: each element combines a state feature and a transition feature
            Mlist['mat'][i] = np.zeros((Mlist['len'][i-1], Mlist['len'][i]))
            for d1 in range(Mlist['len'][i-1]):
                for d2 in range(Mlist['len'][i]):
                    id1 = words2tagids[corp[i-1]][d1]
                    id2 = words2tagids[corp[i]][d2]
                    try:
                        Sweight = weights[featureTS.index(('S', corp[i], id2))]
                    except:
                        Sweight = 0
                    try:
                        Tweight = weights[featureTS.index(('T', id1, id2))]
                    except:
                        Tweight = 0
                    Mlist['mat'][i][d1, d2] = Sweight + Tweight

        #return Mlist, corps[0]
        #return 0

        z = np.array([[0]])
        for i in range(lencorp + 1):
            z = logspace.elnmatprod(z, Mlist['mat'][i])

        Alphalist = [''] * (lencorp + 2)
        Betalist = [''] * (lencorp + 2)
        Alphalist[0] = np.zeros((