Copyright notice: Author: 金良山庄. To get in touch, please leave a comment on the blog or send a private message. Homepage: http://www.jinliangxu.com/


Author: 金良

Log-domain operation functions

import numpy as np

class Logspace:
    def __init__(self):
        # LOGZERO (NaN) represents log(0) in the extended log domain
        self.LOGZERO = np.nan
    def eexp(self, x):
        # extended exp: exp(LOGZERO) = 0
        if np.isnan(x):
            return 0
        else:
            return np.exp(x)
    def eln(self, x):
        # extended log: ln(0) = LOGZERO; negative input is an error
        if x == 0:
            return self.LOGZERO
        elif x > 0:
            return np.log(x)
        else:
            print('Wrong!!!\n\t negative input error')
            return np.nan
    def elnsum(self, elnx, elny):
        # log-domain addition: returns eln(x + y) given eln(x) and eln(y)
        if np.isnan(elnx):
            return elny
        elif np.isnan(elny):
            return elnx
        elif elnx > elny:
            return elnx + self.eln(1 + np.exp(elny - elnx))
        else:
            return elny + self.eln(1 + np.exp(elnx - elny))
    def elnproduct(self, elnx, elny):
        # log-domain multiplication: returns eln(x * y) given eln(x) and eln(y)
        if np.isnan(elnx) or np.isnan(elny):
            return self.LOGZERO
        else:
            return elnx + elny
    def elnmatprod(self, elnx, elny):
        # log-domain matrix product
        # np.size(np.shape(np.array([[0.]]))) is 2, so xsize/ysize distinguish
        # 1-D vectors (size 1) from 2-D matrices (size 2)
        xsize = np.size(np.shape(elnx))
        ysize = np.size(np.shape(elny))

        if xsize == 1 and ysize == 1:        # vector . vector
            r = self.LOGZERO
            for i in range(np.shape(elnx)[0]):
                r = self.elnsum(r, self.elnproduct(elnx[i], elny[i]))
            return r
        elif xsize == 1 and not ysize == 1:  # vector . matrix
            n = np.shape(elny)[1]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx, elny[:, i])
            return r
        elif not xsize == 1 and ysize == 1:  # matrix . vector
            n = np.shape(elnx)[0]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx[i, :], elny)
            return r
        else:                                # matrix . matrix
            m, n = np.shape(elnx)
            p = np.shape(elny)[1]
            r = np.zeros((m, p))
            for i in range(m):
                for j in range(p):
                    r[i][j] = self.elnmatprod(elnx[i, :], elny[:, j])
            return r
    def eexpmat(self, elny):
        # elementwise extended exp of a vector or matrix
        expy = np.copy(elny)
        if np.size(np.shape(elny)) == 1:
            for i in range(np.shape(elny)[0]):
                expy[i] = self.eexp(expy[i])
        else:
            for i in range(np.shape(elny)[0]):
                for j in range(np.shape(elny)[1]):
                    expy[i][j] = self.eexp(expy[i][j])
        return expy
    def elnmat(self, x):
        # elementwise extended log of a vector or matrix
        elnx = np.copy(x)
        if np.size(np.shape(x)) == 1:
            for i in range(np.shape(x)[0]):
                elnx[i] = self.eln(x[i])
        else:
            for i in range(np.shape(x)[0]):
                for j in range(np.shape(x)[1]):
                    elnx[i, j] = self.eln(x[i, j])
        return elnx
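
The class follows the extended-logarithm convention in which NaN stands for log 0, so zero probabilities survive the transform. The following minimal sanity check of the scalar operations is not part of the original post; it only assumes the class above has been defined:

logspace = Logspace()
print(logspace.eln(0))                  # nan, i.e. LOGZERO
print(logspace.eexp(logspace.eln(0)))   # 0: exp of LOGZERO recovers zero
# elnsum is a numerically stable log-sum-exp: recover 0.3 + 0.2 from the logs
print(logspace.eexp(logspace.elnsum(logspace.eln(0.3), logspace.eln(0.2))))      # ~0.5
# elnproduct adds logs, i.e. multiplies the underlying values
print(logspace.eexp(logspace.elnproduct(logspace.eln(0.3), logspace.eln(0.2))))  # ~0.06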

Test example

logspace = Logspace()
M1 = np.array([1, 0.5])
M2 = np.array([[1.3, 1.5], [1.8, 0.5]])
M3 = np.array([[0.8, 1.5], [1.8, 0.7]])
M4 = np.array([0, 0])

print(logspace.eexpmat(logspace.elnmatprod(M1, M2)))
print(np.dot(logspace.eexpmat(M1), logspace.eexpmat(M2)))

Output (both lines agree, confirming that the log-domain matrix product matches the ordinary product of the exponentiated matrices):

[ 19.94836491 14.90077579] 
[ 19.94836491 14.90077579]

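For reference, the identity behind this check (stated here in standard notation, not taken from the original post): writing eexpmat as elementwise exponentiation, elnmatprod computes

$$\big(\mathrm{elnmatprod}(A,B)\big)_{ij} \;=\; \ln\!\Big(\sum_k e^{A_{ik}+B_{kj}}\Big) \;=\; \Big(\ln\big(e^{A}\, e^{B}\big)\Big)_{ij},$$

so exponentiating the log-domain product recovers the ordinary matrix product of the exponentiated matrices, which is exactly what the two print statements compare.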

Conditional random field functions

from collections import defaultdict

def read_corps(corpsfile='testchunk.data'):
    # http://www.chokkan.org/software/crfsuite/tutorial.html
    #   (this page links to two URLs where the dataset can be downloaded; the dataset is large)
    # http://blog.dpdearing.com/2011/12/opennlp-part-of-speech-pos-tags-penn-english-treebank/
    tagids = defaultdict(lambda: len(tagids))
    tagids["<S>"] = 0

    corps = []
    onesentence = []
    words = ["<S>"]
    tags = [0]
    #wordnumcount = 0
    with open(corpsfile, 'r') as f:
        for line in f:
            if len(line) <= 1:
                pass
            elif line != '. . O\n':
                # '. . O\n' marks the end of a sentence; until then, collect each word line in onesentence
                onesentence.append(line)
            else:  # the sentence is complete: process every word that appeared in it and store the result in corps
                for texts in onesentence:
                    #wordnumcount += 1
                    w_t = texts.strip().split(" ")
                    #print(w_t)
                    try:
                        # strings representing numbers vary a lot; to reduce their interference,
                        # detect them and replace them with the placeholder '#CD#'
                        float(w_t[0].strip().replace(',', ''))
                        #print(w_t)
                        words.append('#CD#')
                    except:
                        words.append(w_t[0].lower())
                    #if w_t[1] in {'``', ',', "''", '$', '#', ')', '('}:
                    #    print(w_t)
                    tags.append(tagids[w_t[1]])
                words.append("<S>")  # words is the list of words making up one sentence
                tags.append(0)       # tags is the list of tags of that sentence, aligned one-to-one with words
                if np.shape(words)[0] > 2:  # skip empty sentences
                    corps.append((words, tags))

                # re-initialize onesentence, words and tags
                onesentence = []
                words = ["<S>"]
                tags = [0]
    #print('Total number of word tokens: ' + str(wordnumcount))
    # total number of word tokens: 40377
    return corps, tagids
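
# --- Illustration (not part of the original post) ---------------------------
# read_corps expects a whitespace-separated file in the style of the CoNLL-2000
# chunking data linked above, one token per line as "word POS chunk-tag", with
# a '. . O' line closing each sentence, e.g. (illustrative sample only):
#
#   Confidence NN B-NP
#   in IN B-PP
#   the DT B-NP
#   pound NN I-NP
#   . . O
#
# A sketch of how the returned values would be used (assumes 'testchunk.data'
# exists locally):
#
#   corps, tagids = read_corps('testchunk.data')
#   print(len(corps))        # number of sentences
#   print(corps[0][0][:5])   # first words of the first sentence, starting with '<S>'
#   print(dict(tagids))      # POS tag -> integer id, with '<S>' mapped to 0
# -----------------------------------------------------------------------------
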
def getfeatureTS(corps):
    featuresets = set()  # set of all features seen so far
    featureT = []  # list of transition features, e.g. ('T', 2, 3) means a transition from state 2 to state 3
    featureS = []  # list of state features, e.g. ('S', 'Confidence', 1)
    for corp in corps:
        for i in range(np.shape(corp[0])[0]):
            if corp[0][i] == '<S>':
                continue
            if ('S', corp[0][i], corp[1][i]) not in featuresets:
                featuresets.add(('S', corp[0][i], corp[1][i]))
                featureS.append(('S', corp[0][i], corp[1][i]))
            if corp[0][i-1] != '<S>':
                if ('T', corp[1][i-1], corp[1][i]) not in featuresets:
                    featuresets.add(('T', corp[1][i-1], corp[1][i]))
                    featureT.append(('T', corp[1][i-1], corp[1][i]))
    featureTS = featureT + featureS
    words2tagids = words2tagidfromfeatureS(featureS)
    return featureTS, words2tagids
def getpriorfeatureE(corps, featureTS):
    # empirical (prior) expectation of each feature
    N = np.shape(corps)[0]      # number of training sentences
    K = np.shape(featureTS)[0]  # number of features
    priorfeatureE = np.zeros(K)

    for corp in corps:
        for i in range(np.shape(corp[0])[0]):
            if corp[0][i] == '<S>':
                continue
            try:
                idex = featureTS.index(('S', corp[0][i], corp[1][i]))
                priorfeatureE[idex] += 1.0
            except:
                pass
            try:
                idex = featureTS.index(('T', corp[1][i-1], corp[1][i]))
                priorfeatureE[idex] += 1.0
            except:
                pass
    priorfeatureE /= N
    #plt.plot(priorfeatureE)
    # The plot of the prior expectations shows that, for both transition features (starting at x = 0)
    # and state features (starting at x = 318), features recorded earlier tend to have larger prior expectations.
    return priorfeatureE
def words2tagidfromfeatureS(featureS):
    # collect, for every word, the list of tag ids it appears with
    words2tagids = {}
    for feature in featureS:
        word = feature[1]
        state = feature[2]
        if word in words2tagids:
            words2tagids[word].append(state)
        else:
            words2tagids[word] = [state]

    # The (commented-out) lennums list gives the distribution of how many distinct tags a word can take:
    #lennums = [[lenlist.count(i) for i in range(1,max(lenlist)+1)]
    #           for lenlist in [[len(words2tagids[i]) for i in words2tagids]]][0]
    #lennums = [3760, 389, 32, 1]
    return words2tagids
def getpostfeatureE(weights, corps, featureTS, words2tagids):
    K = np.shape(featureTS)[0]   # number of features
    postfeatureE = np.zeros(K)   # model (posterior) expectation of each feature
    N = np.shape(corps)[0]
    for corpidx in range(N):
        corp = corps[corpidx][0][1:-1]

        lencorp = np.size(corp)  # length of the sentence, i.e. its number of words
        Mlist = {}
        Mlist['mat'] = [''] * (lencorp + 1)
        Mlist['dim'] = [words2tagids[corp[i]] for i in range(lencorp)]
        Mlist['len'] = [np.size(words2tagids[corp[i]]) for i in range(lencorp)]
        for i in range(lencorp + 1):
            if i == 0:  # first matrix: a row vector built from state features only
                d = Mlist['len'][0]
                Mlist['mat'][i] = np.zeros((1, d))
                for j in range(d):
                    Mlist['mat'][i][0, j] = weights[featureTS.index(('S', corp[0], words2tagids[corp[0]][j]))]
                continue
            if i == lencorp:  # last matrix: a column vector of zeros
                Mlist['mat'][i] = np.zeros((Mlist['len'][-1], 1))
                continue
            # neither the first nor the last matrix: each element combines a state feature and a transition feature
            Mlist['mat'][i] = np.zeros((Mlist['len'][i-1], Mlist['len'][i]))
            for d1 in range(Mlist['len'][i-1]):
                for d2 in range(Mlist['len'][i]):
                    id1 = words2tagids[corp[i-1]][d1]
                    id2 = words2tagids[corp[i]][d2]
                    try:
                        Sweight = weights[featureTS.index(('S', corp[i], id2))]
                    except:
                        Sweight = 0
                    try:
                        Tweight = weights[featureTS.index(('T', id1, id2))]
                    except:
                        Tweight = 0
                    Mlist['mat'][i][d1, d2] = Sweight + Tweight

        #return Mlist, corps[0]
        #return 0

        z = np.array([[0]])
        for i in range(lencorp + 1):
            z = logspace.elnmatprod(z, Mlist['mat'][i])

        Alphalist = [''] * (lencorp + 2)
        Betalist = [''] * (lencorp + 2)
        Alphalist[0] = np.zeros((