思路:
调用detect方法即可得到重复率(float)
代码如下:
package com.hnisc.cmpas.util;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
public class DuplicateDetection {
private static final String html_seperator="<br>";
private static final String sentence_seperator="[,.;!:,。;!:]";
//将一段文字切割成若干句
public static String[] split(String src)
{
List<String> result=new ArrayList<>();
String []htmls=src.split(html_seperator);
for (String s:htmls)
for (String s2:s.split(sentence_seperator))
result.add(s2);
String []results=new String[result.size()];
result.toArray(results);
return results;
}
//清晰字符串除杂消除干扰
public static String[] clean(String []src)
{
for (int i=0;i<src.length;i++)
src[i]=clean(src[i]);
return src;
}
//清除非数字、字母、中文
public static String clean(String src)
{
return src.replaceAll("[^a-zA-Z0-9\\u4e00-\\u9fa5]","");
}
//检测两个作业内容的相似率
public static float detect(String des,String src)
{
float resultFloat=0.00f;
//分语义行切割作业内容
String desArray[]=split(des);
String srcArray[]=split(src);
//对作业内容中非自然语言进行清洗除杂
desArray=clean(desArray);
srcArray=clean(srcArray);
for (String s:desArray)
resultFloat+=checkSingleLineWithSrcArray(s,srcArray);
resultFloat/=desArray.length;
return resultFloat;
}
//将一行和所有可能行进行比较得出最大的相似率
private static float checkSingleLineWithSrcArray(String line, String []srcArray)
{
float result=0.00f;
for (String s:srcArray)
{
//基于找公共子串进行相似度计算
float temp1=checkSingleLineWithSingleLine(line,s);
if (temp1>result)
result=temp1;
//基于找公共字符数进行相似度计算
float temo2=checkDuplicationWithMatrix(line,s);
if (temo2>result)
result=temo2;
}
return result;
}
//将一行和一行比较,求出公共子序列,以此得出两字符串的相似度
private static float checkSingleLineWithSingleLine(String line,String src)
{
float result;
String s1 = line;
String s2 = src;
String max = s1.length() >= s2.length()?s1:s2;
String min = s1.length() >= s2.length()?s2:s1;
int l = 0;
String s ="";
for(int i=0;i<min.length();i++){
for(int j=i+1;j<=min.length();j++){
if(max.contains(min.substring(i,j)) && j-i>l){
l=j-i;
s=min.substring(i,j);
}
}
}
result=s.length();
result/=line.length();
return result;
}
//将一行和一行比较,使用集合法求出相似度
private static float checkDuplicationWithMatrix(String s1,String s2)
{
float result;
int count=0;
for (int i=0;i<s1.length();i++)
for (int j=0;j<s2.length();j++)
if (s1.charAt(i)==s2.charAt(j))
count++;
result=count;
result/=((s1.length()+s2.length())/2);
return result;
}
//将小数转化为百分数的字符串
public static String transferFloatToPersentString(float f)
{
String result;
f*=10000;
int t1=(int)f;
result=(((float)t1)/100)+"%";
return result;
}
public static void main(String[] args) {
Date start=new Date();
String tar="public class AAAA {<br> public static void main(String[] args) {<br> <br> nineSort();<br> <br> }<br><br> // 打印九九乘法表<br> public static void nineSort(){<br> for(int i = 1; i <= 9; i++) {<br> for (int j = 1; j <= i; j++) {<br> System.out.print(j+\"*\"+i+\"=\"+j*i+\"\\t\");<br> }<br> System.out.println();<br> }<br> }<br>}";
String src="public class AAAA {<br> // 打印九九乘法表<br> public static void nineSortTest(){<br> for(int index1 = 1; index1 <= 9; index1++) {<br> for (int index2 = 1; index2 <= index1; index2++) {<br> System.out.print(index2+\"*\"+index1+\"=\"+index2*index1+\"\\t\");<br> }<br> System.out.println();<br> }<br> }<br> public static void main(String[] args) {<br> <br> nineSortTest();<br> <br> }<br>}";
System.out.println(transferFloatToPersentString(detect(src,tar)));
Date end=new Date();
System.out.println("花费时间:"+(end.getTime()-start.getTime())+"毫秒");
}
}
运行结果: