作业:编写一个程序,给文档生成simhash指纹。可以对词使用任意合理的散列函数。使用该程序对计算机上的重复文档进行检测,得出检测的准确率。检测的准确率随着指纹大小的有什么变化?


目录

  • SimHash基本过程
  • 代码
  • Python中文实现
  • Python英文实现
  • Python实现作业
  • 参考资料


SimHash基本过程

1、文本分词,得到关键词:权重(feature:weight)
对文本进行关键词抽取(分词和计算权重),抽出权重最高的前n(关键词和权重)对,可利用jieba.analyse.extract_tags()实现,即一个文本得到一个长度为n(feature:weight)的集合。
2、hash
对抽取得到的每个词(feature)做一次普通的字符串哈希,得到一个固定长度(例如64位)的二进制串,从而得到(hash:weight)的集合。
3、加权
在获取的hash值的基础上,根据对应的weight值进行加权:hash串中某一位为1则该位取+weight,为0则取-weight。例如一个词经过hash后得到(010111:5),加权之后可以得到列表[-5,5,-5,5,5,5]。
4、合并
将上述得到的各个向量的加权结果进行求和,变成只有一个序列串。如[-5,5,-5,5,5,5]、[-3,-3,-3,3,-3,3]、[1,-1,-1,1,1,1]进行列向累加得到[-7,1,-9,9,3,9],这样,我们对一个文档得到,一个长度为64的列表。
5、降维
对于得到的n-bit签名的累加结果的每个值进行判断,大于0则置为1, 否则置为0,从而得到该语句的simhash值。例如,[-7,1,-9,9,3,9]得到 010111,这样,我们就得到一个文档的 simhash值。最后根据不同语句的simhash值的汉明距离来判断相似度。

代码

Python中文实现

# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np

class simhash:
    """SimHash fingerprint for Chinese text.

    Keywords are extracted with jieba's TF-IDF ranking; each keyword is
    hashed to a 64-bit string, the bits are weighted by the keyword's
    weight (+w for a 1 bit, -w for a 0 bit), summed column-wise and
    binarized into the fingerprint.
    """

    def __init__(self, content):
        # The fingerprint is stored as a binary string such as '0101...'.
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    def simhash(self, content):
        """Return the simhash fingerprint (binary string) of *content*."""
        seg = jieba.cut(content)
        #jieba.analyse.set_stop_words('stopword.txt')
        # Top-10 TF-IDF keywords as (feature, weight) pairs.
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())

        keyList = []
        for feature, weight in keyWords:
            weight = int(weight)
            # Plain 64-bit hash of the keyword.
            binstr = self.string_hash(feature)
            # +weight where the hash bit is 1, -weight where it is 0.
            keyList.append([weight if c == '1' else -weight for c in binstr])
        # BUGFIX: test for "no keywords extracted" BEFORE summing,
        # instead of after (the original summed first, checked second).
        if not keyList:
            return '00'
        listSum = np.sum(np.array(keyList), axis=0)
        # Binarize the accumulated vector: positive -> '1', else '0'.
        return ''.join('1' if i > 0 else '0' for i in listSum)

    def hamming_distance(self, other):
        """Number of differing bits between the two fingerprints."""
        n = int(self.hash, 2) ^ int(other.hash, 2)
        i = 0
        while n:
            n &= (n - 1)  # clear the lowest set bit
            i += 1
        return i

    def similarity(self, other):
        """Similarity in [0, 1] derived from the Hamming distance.

        BUGFIX: the original converted the binary strings with float(),
        i.e. read '010111' as the decimal number 10111.0, and returned a
        meaningless ratio of those decimals. 1 - distance/bits is the
        standard simhash similarity measure.
        """
        bits = max(len(self.hash), len(other.hash))
        if bits == 0:
            return 1.0
        return 1.0 - self.hamming_distance(other) / bits

    # Variable-length version of Python's classic string hash.
    def string_hash(self, source):
        """Hash *source* to a 64-character binary string."""
        if source == "":
            # BUGFIX: return a bit STRING; the original returned the int 0,
            # which the caller cannot iterate bit-by-bit.
            return '0' * 64
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        return bin(x).replace('0b', '').zfill(64)[-64:]
if __name__ == '__main__':
    # Quick demo: two similar Chinese sentences.
    first = simhash('我想洗照片')
    second = simhash('可以洗一张照片吗')
    print("海明距离:", first.hamming_distance(second))
    print("文本相似度:", first.similarity(second))

Python英文实现

class simhash:
    """SimHash fingerprint for a pre-tokenized (e.g. English) text.

    Each token is hashed to a ``hashbits``-bit integer; per bit position
    the counter is incremented for a 1 bit and decremented for a 0 bit,
    and the fingerprint keeps the positions whose counter is >= 0.
    """

    def __init__(self, tokens='', hashbits=128):
        # hashbits controls the fingerprint size in bits.
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        return str(self.hash)

    def simhash(self, tokens):
        """Return the simhash fingerprint (int) of the token iterable."""
        v = [0] * self.hashbits
        for t in (self._string_hash(tok) for tok in tokens):
            for i in range(self.hashbits):
                # +1 where the token hash bit is 1, -1 where it is 0.
                if t & (1 << i):
                    v[i] += 1
                else:
                    v[i] -= 1
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint

    def hamming_distance(self, other):
        """Number of differing bits between the two fingerprints."""
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        tot = 0
        while x:
            tot += 1
            x &= x - 1  # clear the lowest set bit
        return tot

    def similarity(self, other):
        """Similarity in [0, 1] derived from the Hamming distance.

        BUGFIX: the original returned the ratio of the two fingerprint
        integers, which is meaningless and raises ZeroDivisionError when
        both fingerprints are 0. 1 - distance/hashbits is the standard
        simhash similarity measure.
        """
        return 1.0 - self.hamming_distance(other) / self.hashbits

    # Variable-length version of Python's classic string hash.
    def _string_hash(self, source):
        """Hash *source* to a ``hashbits``-bit non-negative integer."""
        if source == "":
            return 0
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** self.hashbits - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        return x


if __name__ == '__main__':
    # Demo: compare three related English sentences.
    sentences = [
        'This is a test string for testing',
        'This is a test string for testing also',
        'This is a test',
    ]
    hash1, hash2, hash3 = (simhash(text.split()) for text in sentences)
    print(hash1, hash2, hash3)
    print(hash1.hamming_distance(hash2), "\t", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "\t", hash1.similarity(hash3))

Python实现作业

# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np
import re

txt1 = r'./test1.txt'
txt2 = r'./test2.txt'

class simhash:
    """SimHash fingerprint generator for the duplicate-detection homework.

    Same pipeline as the basic Chinese version, but the TF-IDF weight is
    scaled by 10 and the fingerprint size is set by ``string_hash``
    (currently 32 bits; change the zfill/slice width to experiment with
    other fingerprint sizes).
    """

    def __init__(self, content):
        # The fingerprint is stored as a binary string such as '0101...'.
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    def simhash(self, content):
        """Return the simhash fingerprint (binary string) of *content*."""
        count = 0
        seg = jieba.cut(content)
        # Top-10 TF-IDF keywords as (feature, weight) pairs.
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())

        keyList = []
        for feature, weight in keyWords:
            # Scale the TF-IDF weight so small weights do not truncate to 0.
            weight = int(weight * 10)
            binstr = self.string_hash(feature)
            # Report the fingerprint size once, for the homework write-up.
            if count == 0:
                print("指纹大小为:", len(binstr))
                count += 1
            # +weight where the hash bit is 1, -weight where it is 0.
            keyList.append([weight if c == '1' else -weight for c in binstr])
        # BUGFIX: test for "no keywords extracted" BEFORE summing,
        # instead of after (the original summed first, checked second).
        if not keyList:
            return '00'
        listSum = np.sum(np.array(keyList), axis=0)
        # Binarize the accumulated vector: positive -> '1', else '0'.
        return ''.join('1' if i > 0 else '0' for i in listSum)

    def hamming_distance(self, other):
        """Number of differing bits between the two fingerprints."""
        n = int(self.hash, 2) ^ int(other.hash, 2)
        i = 0
        while n:
            n &= (n - 1)  # clear the lowest set bit
            i += 1
        return i

    def similarity(self, other):
        """Similarity in [0, 1] derived from the Hamming distance.

        BUGFIX: the original converted the binary strings with float(),
        i.e. read '010111' as the decimal number 10111.0, and returned a
        meaningless ratio (the commented-out a == 0.0 branch shows it
        already misbehaved). 1 - distance/bits is the standard measure;
        the debug print of the bogus floats is dropped with the bug.
        """
        bits = max(len(self.hash), len(other.hash))
        if bits == 0:
            return 1.0
        return 1.0 - self.hamming_distance(other) / bits

    # Variable-length version of Python's classic string hash.
    def string_hash(self, source):
        """Hash *source* to a 32-character binary string.

        Change the zfill(32)[-32:] width to change the fingerprint size.
        """
        if source == "":
            # BUGFIX: return a bit STRING; the original returned the int 0,
            # which the caller cannot iterate bit-by-bit.
            return '0' * 32
        # First character seeds the hash, shifted left by 7 bits.
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        # Fold every character of the keyword into the hash.
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        return bin(x).replace('0b', '').zfill(32)[-32:]

def _read_and_clean(path):
    """Read *path* (GBK-encoded), strip non-word characters, segment with
    jieba, and strip the fixed punctuation list from the joined result.
    """
    punc = './ <>_ - - = ", 。,?!“”:‘’@#¥% … &×()——+【】{};;● &~| \s:'
    with open(path, 'r', encoding='gbk') as f:
        raw = f.read()
    # Drop everything that is not a word character (removes most punctuation).
    text = re.sub(r'[^\w]+', '', raw)
    joined = ''.join(jieba.cut(text))
    # Second pass: strip any character from the fixed list that survived.
    return re.sub(r"[{}]+".format(punc), "", joined)

def txt_line(txt1, txt2):
    """Fingerprint the two files and print their Hamming distance and
    simhash similarity.

    Refactor: the original duplicated the read/clean pipeline inline for
    each file (and computed the hashes inside the second ``with`` block);
    both files now go through the shared ``_read_and_clean`` helper.
    """
    line1 = _read_and_clean(txt1)
    line2 = _read_and_clean(txt2)
    hash1 = simhash(line1)
    hash2 = simhash(line2)
    print("海明距离:", hash1.hamming_distance(hash2))
    print("文本相似度:", hash1.similarity(hash2))

if __name__ == "__main__":
    # Compare the two sample documents configured at module level.
    txt_line(txt1, txt2)