场景:因早期原因,系统中分别创建了两套部门表。概括来说,登录人分为A/B两种类型,各自可选的部门范围不同。
但是后来发现B类人员可选A类中部门,故对于B来说 部门取并集!
问题:名称相同或相似的部门怎么办?无论是 1.完全重复的,还是 2.名称类似的,都只保留一个,然后修改原先引用它们的数据。
思路:1.先分词 2.然后比较看相似度/匹配度
直接代码:
注意:最后计算相似度的那行代码可能会报错,因为相似度算法是借用他人的,参数类型(Vector 与 List)不一致,后文代码中可以看到这一点。这里没有改,测试时可自行调整。
0.pom引用+settings配置[maven可直接下载]
<!-- pom.xml: jieba-analysis provides the JiebaSegmenter tokenizer used by JieBaUtils. -->
<!-- https://mvnrepository.com/artifact/com.huaban/jieba-analysis -->
<dependency>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
<version>1.0.2</version>
</dependency>
<!-- pom.xml: ikanalyzer provides the IKSegmenter tokenizer used by CheckTheSame. -->
<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<!-- settings.xml: goes inside the <mirrors> element. -->
<!-- mirrorOf="*" makes Maven resolve every repository through this Aliyun public mirror. -->
<mirror>
<id>aliyunmaven</id>
<mirrorOf>*</mirrorOf>
<name>阿里云公共仓库</name>
<url>https://maven.aliyun.com/repository/public</url>
</mirror>
1.jieba
public class JieBaUtils {
private static JiebaSegmenter segmenter = new JiebaSegmenter();
/**
* 单词 参考的他人例子
**/
public static List<String> getSignaleWord(String words) {
//segmenter.process(text, JiebaSegmenter.SegMode.SEARCH) 两者效果一致
List<String> resultList = segmenter.sentenceProcess(words);
return resultList;
}
/**
* 结巴分词 process(str,SegMode.INDEX)
* @param text
* @return
*/
public static Vector<String> participleJieBa(String text) {
List<SegToken> process = segmenter.process(text, JiebaSegmenter.SegMode.INDEX);
List<String> collect = process.stream().map(item -> item.word).collect(Collectors.toList());
return new Vector<>(collect);
}
public static void main(String[] args) {
System.out.println(getSignaleWord("数学形态学的表面原子熔融相的STM图像识别算法"));
System.out.println(participleJieBa("数学形态学的表面原子熔融相的STM图像识别算法"));
//先分词为集合,然后集合中字段比较
System.out.println(IKUtils.getSimilarity( participleJieBa("数学形态学的表面原子熔融相的STM图像识别算法") , getSignaleWord("数学形态学的表面原子熔融相的STM图像识别算法") ));
}
}
2.IKAnalyzer
计算匹配度的代码
public class IKUtils {
public static double YUZHI = 0.2;
/**
* 返回百分比
*
* @param T1
* @param T2
* @return
* @author: Administrator
* @Date: 2015年1月22日
*/
public static double getSimilarity(Vector<String> T1, Vector<String> T2) throws Exception {
int size = 0, size2 = 0;
if (T1 != null && (size = T1.size()) > 0 && T2 != null && (size2 = T2.size()) > 0) {
Map<String, double[]> T = new HashMap<String, double[]>();
//T1和T2的并集T
String index = null;
for (int i = 0; i < size; i++) {
index = T1.get(i);
if (index != null) {
double[] c = T.get(index);
c = new double[2];
c[0] = 1; //T1的语义分数Ci
c[1] = YUZHI;//T2的语义分数Ci
T.put(index, c);
}
}
for (int i = 0; i < size2; i++) {
index = T2.get(i);
if (index != null) {
double[] c = T.get(index);
if (c != null && c.length == 2) {
c[1] = 1; //T2中也存在,T2的语义分数=1
} else {
c = new double[2];
c[0] = YUZHI; //T1的语义分数Ci
c[1] = 1; //T2的语义分数Ci
T.put(index, c);
}
}
}
//开始计算,百分比
Iterator<String> it = T.keySet().iterator();
double s1 = 0, s2 = 0, Ssum = 0; //S1、S2
while (it.hasNext()) {
double[] c = T.get(it.next());
Ssum += c[0] * c[1];
s1 += c[0] * c[0];
s2 += c[1] * c[1];
}
//百分比
return Ssum / Math.sqrt(s1 * s2);
} else {
throw new Exception("传入参数有问题!");
}
}
}
3.分词并测试
package com.controller.util;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.StringReader;
import java.util.Vector;
/**
 * Tokenizes strings with IK Analyzer and reports how similar two strings are
 * by passing their tokens to {@code IKUtils.getSimilarity}.
 */
public class CheckTheSame {
    /**
     * Splits the input into tokens with {@code IKSegmenter}.
     *
     * @param str text to tokenize; may be {@code null}
     * @return the tokens in segmenter order, or {@code null} when the input is
     *         {@code null} or yields no tokens; if reading fails part-way, the
     *         tokens collected so far are returned
     */
    public static Vector<String> participle(String str) {
        if (str == null) {
            // new StringReader(null) would throw a NullPointerException;
            // treat null like any other input that produces no tokens.
            return null;
        }
        Vector<String> tokens = new Vector<String>();
        try {
            StringReader reader = new StringReader(str);
            // false = no smart segmentation (true would enable it).
            IKSegmenter ik = new IKSegmenter(reader, false);
            Lexeme lexeme;
            while ((lexeme = ik.next()) != null) {
                tokens.add(lexeme.getLexemeText());
            }
            if (tokens.isEmpty()) {
                return null;
            }
        } catch (IOException ignored) {
            // Best effort: fall through and return whatever was collected.
        }
        return tokens;
    }

    /**
     * Returns the similarity of the two strings as text.
     *
     * @param strone first string
     * @param strtwo second string
     * @return {@code String.valueOf} of the similarity; {@code "0.0"} when the
     *         similarity could not be computed
     */
    public String getSemblance(String strone, String strtwo) {
        // Tokenize both inputs.
        Vector<String> first = participle(strone);
        Vector<String> second = participle(strtwo);
        // Similarity stays 0 when getSimilarity rejects the tokens.
        double same = 0;
        try {
            same = IKUtils.getSimilarity(first, second);
        } catch (Exception ignored) {
            // Deliberately best effort: an invalid input is reported as 0 similarity.
        }
        return String.valueOf(same);
    }

    /**
     * Demo: prints the similarity of two sample strings.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Tokenize both inputs.
        Vector<String> strs1 = participle("蚂蚁金服");
        Vector<String> strs2 = participle("蚂蚁");
        // Compute the similarity from the tokens.
        double same = 0;
        try {
            same = IKUtils.getSimilarity(strs1, strs2);
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        System.out.println("相似度:" + same);
    }
}