相似度匹配算法 java 数据相似度匹配算法

转载

mob64ca14193248 2023-11-17 15:28:26

文章标签 相似度匹配算法 java 数据块数据 #include 文章分类 Java 后端开发

数据相似性检测算法

1、引言
　　"数据同步算法研究"一文研究了在网络上高效同步数据的方法，其中有个前提是文件A和B非常相似，即两者之间存在大量相同的数据。如果两个文件相似性很低，虽然这种方法依然可以正常工作，但数据同步性能却不会得到提高，甚至会有所降低。因为会产生部分元数据和网络通信消耗，这在两个文件完全不相关时尤为明显。因此，同步数据前需要计算种子文件(seed file)与目标文件之间的相似性，如果相似性大于指定阈值(通常应大于50%)则应用该数据同步算法，否则接传输文件即可。如此，可使得数据同步算法则具有较好的自适应性，在数据具有不同相似性的情形下均可进行高性能的数据同步。另外，在数据相似性检测的基础之上，可对于相似性高的数据进行数据编码处理(如Delta编码)，通过一个文件给另一个文件编码的方式进行数据压缩，这是一种基于相似数据检测与编码的重复数据删除技术。
2、相似性计算　　Unix diff对文档进行逐行对比来检测相似文件，它采用经典的LCS(Longest Common Subsequence，最长公共子串)算法，运用动态规划方法来计算相似性。LCS的含义是同时包含在字符串里的一个最长字符序列，LCS的长度作为这两个字符串相似性的度量。Diff算法以整行作为"字符"来计算最长公共子串，性能上比字符级的LCS算法快很多。这种方法效率很低，而且只适用文本文件的相似比较，不能直接适用于二进制文件。

　　目前通常的做法是将文件相似性问题转换为集合相似性问题，如基于shingle的计算方法和基于bloom filter的计算方法，这两种方法都可适用于任何格式的数据文件。这种方式的核心思想是为每个文件提取组特征值，以特征值集合来计算相似性，从而降低计算复杂性来提高性能。shingle用特征值交集来计算相似性会导致高计算和空间开销，bloom filter技术在计算开销和匹配精度上更具势。Bloom filter所定义的集合元素是文件按照CDC(content-defined chunking)算法所切分数据块的指纹值，其相似性定义如下：
                           |fingerprints(f1) ∩ fingerprints(f2)|
　　Sim(f1, f2) = ---------------------------------------------   (公式1)
                           |fingerprints(f1) ∪ fingerprints(f2)|

　　另外一种方法，是将二进制文件进行切块，使用数据块指纹来表示数据块，然后将数据块映射为"字符"，再应用LCS算法寻找最大公共子串并计算出相似度。其相似性定义如下：
　　                      2 * length(LCS(fingerprints(f1), fingerprints(f2)))
    Sim(f1, f2) = ------------------------------------------------------------------　(公式2)
                          length(fingerprints(f1)) + length(fingerprints(f2))

　　上面两种相似性算法中均采用数据切分技术，数据块可以是定长或变长。为了相似性计算的精确性，实现中采用以数据块长度作为权的加权计算方法。

3、Bloom filter算法　　该文件相似性计算流程如下：
      (1) 采用CDC算法将文件切分成数据块集，并为每个数据块计算MD5指纹；
      (2) 计算两个指纹集合的交集和并集，通过hashtable来实现；
      (3) 按照公式1计算文件相似性，考虑重复数据块和数据块长度来提高计算精确度。
      详细参见附录bsim源码中的file_chunk，chunk_file_process和similarity_detect函数实现。

4、LCS算法　　该文件相似性计算流程如下：
    (1) 采用CDC算法将文件切分成数据块集，并为每个数据块计算MD5指纹；
    (2) 将MD5指纹串映射为"字符"，则文件转换为"字符串"表示；
    (3) 应用LCS算法计算出最长公共子串，并计算其加权长度；
    (4) 按照公式2计算文件相似性，考虑重复数据块和数据块长度来提高计算精确度。
    详细参见附录bsim源码中的file_chunk，chunk_file_process，LCS和similarity_detect函数实现。

5、算法分析比较
　两种算法都对文件进行切分操作，假设文件f1切为m个块，文件f2切分成n个块。Bloom filter算法没有考虑数据块顺序，因此在相似性精确度方面要低于LCS算法，其时间和空间复杂性都是O(m + n)。相反，LCS算法考虑了数据块顺序问题，相似性度量相对精确，然而其时间和空间复杂性是O(mn)，这个大大限制了应用规模。综合来看，Bloom filter算法精确度比LCS算法要低，但计算消耗要小很多，性能和适用性非常好。LCS比较适合精确的文件相似性计算，这些文件往往比较小，50MB以内比较合适。对于重复数据删除和网络数据同步来说，消重效果和性能与数据块顺序性无关，因此Bloom filter算法计算的数据相似性更适用，性能也更高。

附录：bsim.c源码(完整源码请参见deduputil源码)

[cpp] view plain copy print ?

1. /* Copyright (C) 2010 Aigui Liu
2.  *
3.  * This program is free software; you can redistribute it and/or modify
4.  * it under the terms of the GNU General Public License as published by
5.  * the Free Software Foundation; either version 3 of the License, or
6.  * (at your option) any later version.
7.  *
8.  * This program is distributed in the hope that it will be useful,
9.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11.  * GNU General Public License for more details.
12.  *
13.  * You should have received a copy of the GNU General Public License along
14.  * with this program; if not, visit the http://fsf.org website.
15.  */  
16. #include <stdio.h>  
17. #include <stdlib.h>  
18. #include <string.h>  
19. #include <sys/types.h>  
20. #include <sys/stat.h>  
21. #include <fcntl.h>  
22. #include <unistd.h>  
23. #include "hashtable.h"  
24. #include "sync.h"  
25. #define NEITHER       0  
26. #define UP            1  
27. #define LEFT          2  
28. #define UP_AND_LEFT   3  
29. #define MAX(x, y) (((x) > (y)) ? (x) : (y))  
30. #define MIN(x, y) (((x) < (y)) ? (x) : (y))  
31. #define MD5_LEN 17  
32. enum {  
33.     FILE1 = 0,  
34.     FILE2  
35. };  
36. enum {  
37.     LCS_NOT = 0,  
38.     LCS_YES  
39. };  
40. typedef struct {  
41.     uint32_t nr1;  
42.     uint32_t nr2;  
43.     uint32_t len;  
44. } hash_entry;  
45. typedef struct {  
46. char **str;  
47.     uint32_t len;  
48. } lcs_entry;  
49. static uint32_t sim_union = 0;  
50. static uint32_t sim_intersect = 0;  
51. static void usage()  
52. {  
53. "Usage: bsim FILE1 FILE2 CHUNK_ALGO LCS/n/n");  
54. "Similarity detect between FILE1 and FILE2 based on block level./n");  
55. "CHUNK_ALGO:/n");  
56. "  FSP - fixed-size partition/n");  
57. "  CDC - content-defined chunking/n");  
58. "  SBC - slide block chunking/n/n");  
59. "LCS:/n");  
60. "  LCS_NOT - do not use LCS(longest lommon subsequence) algorithms/n");  
61. "  LCS_YES - use LCS algorithms/n/n");  
62. "Report bugs to <Aigui.Liu@>./n");  
63. }  
64. static int parse_arg(char *argname)  
65. {  
66. if (0 == strcmp(argname, "FSP"))  
67. return CHUNK_FSP;  
68. else if (0 == strcmp(argname, "CDC"))  
69. return CHUNK_CDC;  
70. else if (0 == strcmp(argname, "SBC"))  
71. return CHUNK_SBC;  
72. else if (0 == strcmp(argname, "LCS_NOT"))  
73. return LCS_NOT;  
74. else if (0 == strcmp(argname, "LCS_YES"))  
75. return LCS_YES;  
76. else  
77. return -1;  
78. }  
79. static char **alloc_2d_array(int row, int col)  
80. {  
81. int i;  
82. char *p, **pp;  
83. char *)malloc(row * col * sizeof(char));  
84. char **)malloc(row * sizeof(char *));  
85. if (p == NULL || pp == NULL)  
86. return NULL;  
87. for (i = 0; i < row; i++) {  
88.                 pp[i] = p + col * i;  
89.         }  
90. return pp;  
91. }  
92. static void free_2d_array(char **str)  
93. {  
94.     free(str[0]);  
95.     free(str);  
96. }  
97. static void show_md5_hex(unsigned char md5_checksum[16])  
98. {  
99. int i;  
100. for (i = 0; i < 16; i++) {  
101. "%02x", md5_checksum[i]);  
102.         }  
103. "/n");  
104. }  
105. static int chunk_file_process(char *chunk_file, hashtable *htab, int which, int sim_algo, lcs_entry *le)  
106. {  
107. int fd, i, ret = 0;  
108.     ssize_t rwsize;  
109.     chunk_file_header chunk_file_hdr;  
110.     chunk_block_entry chunk_bentry;  
111.     hash_entry *he = NULL;  
112. /* parse chunk file */  
113.     fd = open(chunk_file, O_RDONLY);  
114. if (-1 == fd) {  
115. return -1;  
116.     }  
117.     rwsize = read(fd, &chunk_file_hdr, CHUNK_FILE_HEADER_SZ);  
118. if (rwsize != CHUNK_FILE_HEADER_SZ) {  
119.         ret = -1;  
120. goto _CHUNK_FILE_PROCESS_EXIT;  
121.     }  
122. if (sim_algo == LCS_YES) {  
123.         le->str = alloc_2d_array(chunk_file_hdr.block_nr, MD5_LEN);  
124. if (le->str == NULL) {  
125.             ret = -1;  
126. goto _CHUNK_FILE_PROCESS_EXIT;  
127.         }  
128.         le->len = chunk_file_hdr.block_nr;  
129.     }  
130. for(i = 0; i < chunk_file_hdr.block_nr; i++) {  
131.         rwsize = read(fd, &chunk_bentry, CHUNK_BLOCK_ENTRY_SZ);  
132. if (rwsize != CHUNK_BLOCK_ENTRY_SZ) {  
133.             ret = -1;  
134. goto _CHUNK_FILE_PROCESS_EXIT;  
135.         }  
136. void *)chunk_bentry.md5, htab);  
137. if (he == NULL) {  
138. sizeof(hash_entry));  
139.             he->nr1 = he->nr2 = 0;  
140.             he->len = chunk_bentry.len;  
141.         }  
142.         (which == FILE1) ? he->nr1++ : he->nr2++;  
143. /* insert or update hash entry */  
144. void *)strdup(chunk_bentry.md5), (void *)he, htab);  
145. if (sim_algo == LCS_YES) {  
146.             memcpy(le->str[i], chunk_bentry.md5, MD5_LEN);  
147.         }  
148.     }  
149. _CHUNK_FILE_PROCESS_EXIT:  
150.     close(fd);  
151. return ret;  
152. }  
153. uint32_t LCS(char** a, int n, char** b, int m, hashtable *htab)   
154. {  
155. int** S;  
156. int** R;  
157. int ii;  
158. int jj;  
159. int pos;  
160.         uint32_t len = 0;  
161.     hash_entry *he = NULL;  
162. /* Memory allocation */  
163. int **)malloc( (n+1) * sizeof(int *) );  
164. int **)malloc( (n+1) * sizeof(int *) );  
165. if (S == NULL || R == NULL) {  
166. "malloc for S and R in LCS");  
167.         exit(0);  
168.     }  
169. for(ii = 0; ii <= n; ++ii) {  
170. int*) malloc( (m+1) * sizeof(int) );  
171. int*) malloc( (m+1) * sizeof(int) );  
172. if (S[ii] == NULL || R[ii] == NULL) {  
173. "malloc for S[ii] and R[ii] in LCS");  
174.             exit(0);  
175.         }  
176.         }  
177. /* It is important to use <=, not <.  The next two for-loops are initialization */  
178. for(ii = 0; ii <= n; ++ii) {  
179.                 S[ii][0] = 0;  
180.                 R[ii][0] = UP;  
181.         }  
182. for(jj = 0; jj <= m; ++jj) {  
183.                 S[0][jj] = 0;  
184.                 R[0][jj] = LEFT;  
185.         }  
186. /* This is the main dynamic programming loop that computes the score and */  
187. /* backtracking arrays. */  
188. for(ii = 1; ii <= n; ++ii) {  
189. for(jj = 1; jj <= m; ++jj) {  
190. if (strcmp(a[ii-1], b[jj-1]) == 0) {  
191.                                 S[ii][jj] = S[ii-1][jj-1] + 1;  
192.                                 R[ii][jj] = UP_AND_LEFT;  
193.                         }  
194. else {  
195.                                 S[ii][jj] = S[ii-1][jj-1] + 0;  
196.                                 R[ii][jj] = NEITHER;  
197.                         }  
198. if( S[ii-1][jj] >= S[ii][jj] ) {  
199.                                 S[ii][jj] = S[ii-1][jj];  
200.                                 R[ii][jj] = UP;  
201.                         }  
202. if( S[ii][jj-1] >= S[ii][jj] ) {  
203.                                 S[ii][jj] = S[ii][jj-1];  
204.                                 R[ii][jj] = LEFT;  
205.                         }  
206.                 }  
207.         }  
208. /* The length of the longest substring is S[n][m] */  
209.         ii = n;  
210.         jj = m;  
211.         pos = S[ii][jj];  
212. /* Trace the backtracking matrix. */  
213. while( ii > 0 || jj > 0 ) {  
214. if( R[ii][jj] == UP_AND_LEFT ) {  
215.                         ii--;  
216.                         jj--;  
217. //lcs[pos--] = a[ii];  
218. void *)a[ii], htab);  
219.             len += ((he == NULL) ? 0: he->len);  
220.                 }  
221. else if( R[ii][jj] == UP ) {  
222.                         ii--;  
223.                 }  
224. else if( R[ii][jj] == LEFT ) {  
225.                         jj--;  
226.                 }  
227.         }  
228. for(ii = 0; ii <= n; ++ii ) {  
229.                 free(S[ii]);  
230.                 free(R[ii]);  
231.         }  
232.         free(S);  
233.         free(R);  
234. return len;  
235. }  
236. int hash_callback(void *key, void *data)  
237. {  
238.     hash_entry *he = (hash_entry *)data;  
239.     sim_union += (he->len * (he->nr1 + he->nr2));  
240.     sim_intersect += (he->len * MIN(he->nr1, he->nr2));  
241. }  
242. static float similarity_detect(hashtable *htab, char **str1, int n, char **str2, int m, int sim_algo)  
243. {  
244.     uint32_t lcs_len = 0;  
245.     hash_for_each_do(htab, hash_callback);  
246. if (sim_algo == LCS_YES) {  
247.         lcs_len = LCS(str1, n, str2, m, htab);  
248. return lcs_len * 2.0 / sim_union;  
249. else { /* LCS_NOT */  
250. return sim_intersect * 2.0 / sim_union;  
251.     }  
252. }  
253. int main(int argc, char *argv[])  
254. {  
255. int chunk_algo = CHUNK_CDC;  
256. int sim_algo = LCS_NOT;  
257. char *file1 = NULL;  
258. char *file2 = NULL;  
259.     lcs_entry le1, le2;  
260. char tmpname[NAME_MAX_SZ] = {0};  
261. char template[] = "deduputil_bsim_XXXXXX";  
262.     hashtable *htab = NULL;  
263. int ret = 0;  
264. if (argc < 5) {  
265.         usage();  
266. return -1;  
267.     }  
268. /* parse chunk algorithms */  
269.     file1 = argv[1];  
270.     file2 = argv[2];  
271.     chunk_algo = parse_arg(argv[3]);  
272.     sim_algo = parse_arg(argv[4]);  
273. if (chunk_algo == -1 || sim_algo == -1) {  
274.         usage();  
275. return -1;  
276.     }  
277.     htab = create_hashtable(HASHTABLE_BUCKET_SZ);  
278. if (htab == NULL) {  
279. "create hashtabke failed/n");  
280. return -1;  
281.     }  
282. /* chunk file1 and file2 into blocks */  
283. "/tmp/%s_%d", mktemp(template), getpid());  
284.     ret = file_chunk(file1, tmpname, chunk_algo);  
285. if (0 != ret) {  
286. "chunk %s failed/n", file1);  
287. goto _BENCODE_EXIT;  
288.     }  
289.     le1.str = NULL;  
290.     ret = chunk_file_process(tmpname, htab, FILE1, sim_algo, &le1);  
291. if (ret != 0) {  
292. "pasre %s failed/n", file1);  
293. goto _BENCODE_EXIT;  
294.     }  
295.     ret = file_chunk(file2, tmpname, chunk_algo);  
296. if (0 != ret){  
297. "chunk %s failed/n", file2);  
298. goto _BENCODE_EXIT;  
299.     }  
300.     le2.str = NULL;  
301.     ret = chunk_file_process(tmpname, htab, FILE2, sim_algo, &le2);  
302. if (ret != 0) {  
303. "pasre %s failed/n", file2);  
304. goto _BENCODE_EXIT;  
305.     }  
306. "similarity = %.4f/n", similarity_detect(htab, le1.str, le1.len, le2.str, le2.len, sim_algo));  
307. _BENCODE_EXIT:  
308.     unlink(tmpname);  
309.     hash_free(htab);  
310. if (le1.str) free_2d_array(le1.str);  
311. if (le2.str) free_2d_array(le2.str);  
312. return ret;  
313. }

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。