C++/JAVA 计算两篇文章的相似度
实验介绍及思路
问题描述:
编写程序,计算任意两篇文章的相似度。
基本思路:
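利用余弦相似度来计算其相似度:先统计两篇文章中所有不重复的单词,以这些单词为维度分别得到两篇文章的词频向量 $A$、$B$,再计算两向量夹角的余弦 $\cos\theta = \dfrac{A \cdot B}{\lVert A \rVert\,\lVert B \rVert} = \dfrac{\sum_i A_i B_i}{\sqrt{\sum_i A_i^2}\,\sqrt{\sum_i B_i^2}}$。结果越接近 1,两篇文章越相似。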
完整代码
C++ 代码来啰
/*
*
* Author : YU.J.P
* Time : 2022/04/03
* Project : Experiment One -- calculate article similarity.
*
*/
//计算两篇文章的相似度
#include "bits/stdc++.h"
//#include <iostream>
//#include <cstring>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#define MAXSIZE 1024 //预设最大容量
using namespace std;
/*
 * One entry of a word table: a distinct word and how often it occurs.
 * A table is a plain array of these entries; the first entry whose cnt is -1
 * marks the end of the used part.
 */
struct word
{
    // NUL-terminated word text (at most 31 characters). Zero-initialised so
    // that string functions never read indeterminate bytes from unused slots.
    char str[32] = {};
    // Number of occurrences; -1 means the slot is unused.
    int cnt = -1;
};
using wordList = word;
/* 读取文件中的单词 */
void readFile(string fileName, wordList* words)
{
char temp[32]; //读取单词变量
int netWords = 0, total = 0; //去重单词个数,单词出现次数
int newWordflag = 1; //newWord标志位,判断是否为新单词(查重)
ifstream ifs; //1.创建流对象;
ifs.open(fileName , ios::in);
if (!ifs.is_open()) //2.打开文件 ifs.open("文件路径",打开方式);
{
cout << "\n文件打开失败!" << endl;
}
cout << "\n文件打开成功!" << endl;
char character[] = ",.:\"!"; //英文标点符号
while (ifs >> temp) //3.读数据 4种读取方式
{
for (int j = 0; temp[j] != '\0'; j++) //清除符号
{ //判断是否有符号
if (strchr(character, temp[j]) != NULL)
temp[j] = '\0'; // 如果文本符号没有分隔开,会出错
}
newWordflag = 1;
for (int i = 0; i < netWords; i++)
{
if (strcmp(temp, words[i].str) == 0)//重复的单词
{
newWordflag = 0;
words[i].cnt++;
}
}
if (newWordflag)
{
strcpy_s(words[netWords].str, temp);
words[netWords++].cnt = 1; //新单词数量+1
}
}
for (int i = netWords - 1; i >= 0; i--) //单词出现次数
{
total += words[i].cnt;
}
ifs.close(); //4.关闭文件 ifs.close();
//打印n和s
cout << "netWords = " << netWords << "\ttotal = " << total << endl;
}
/*
 * Build the merged keyword table: every word of article1 followed by the
 * words of article2 that are not already present.
 *
 * target   : output table; assumed to have room for MAXSIZE entries. Every
 *            stored entry gets cnt = 0, and the entry after the last word is
 *            marked with cnt == -1 when there is room for it.
 * article1 : first word table, terminated by cnt == -1 or MAXSIZE entries.
 * article2 : second word table, same convention.
 *
 * If the union does not fit into MAXSIZE entries, the surplus words are
 * dropped and a warning is written to stderr.
 */
void getKeyWord(wordList* target, wordList* article1, wordList* article2)
{
    const std::size_t strCap = sizeof(target[0].str);
    int count = 0; // number of entries stored in target so far

    // Copy the first table as-is (its words are assumed to be distinct already).
    // The index is bounds-checked before the element is read.
    for (int i = 0; i < MAXSIZE && article1[i].cnt != -1; i++)
    {
        std::strncpy(target[count].str, article1[i].str, strCap - 1);
        target[count].str[strCap - 1] = '\0';
        target[count].cnt = 0;
        ++count;
    }

    // Append the words of the second table that are not in target yet.
    for (int j = 0; j < MAXSIZE && article2[j].cnt != -1; j++)
    {
        bool alreadyPresent = false;
        // Search only the entries that have actually been written to target.
        for (int v = 0; v < count && !alreadyPresent; v++)
            alreadyPresent = std::strcmp(article2[j].str, target[v].str) == 0;
        if (alreadyPresent)
            continue;

        if (count >= MAXSIZE)
        {
            std::cerr << "getKeyWord: keyword table is full (" << MAXSIZE
                      << " entries); remaining words are ignored" << std::endl;
            break;
        }
        std::strncpy(target[count].str, article2[j].str, strCap - 1);
        target[count].str[strCap - 1] = '\0';
        target[count].cnt = 0;
        ++count;
    }

    // Terminate the list explicitly instead of relying on the caller's initial state.
    if (count < MAXSIZE)
        target[count].cnt = -1;
}
/*
 * Fill wordsNum with the frequency of every keyword inside one article.
 *
 * target   : keyword table, terminated by cnt == -1 or MAXSIZE entries.
 * article  : word table of the article, same convention.
 * wordsNum : output; wordsNum[i] receives the count of target[i] in the
 *            article, or 0 when the article does not contain it. Assumed to
 *            have room for MAXSIZE entries. The element after the last
 *            keyword is set to -1 when there is room for it.
 */
void statistics(wordList* target, wordList* article, int* wordsNum)
{
    int i = 0;
    // The index is bounds-checked before the element is read.
    for (; i < MAXSIZE && target[i].cnt != -1; i++)
    {
        // Default to "absent" so the result does not depend on how the caller
        // pre-filled the array.
        wordsNum[i] = 0;
        for (int j = 0; j < MAXSIZE && article[j].cnt != -1; j++)
        {
            if (std::strcmp(target[i].str, article[j].str) == 0)
            {
                wordsNum[i] = article[j].cnt; // article contains the keyword
                break;
            }
        }
    }
    if (i < MAXSIZE)
        wordsNum[i] = -1; // terminator
}
/*
 * Cosine similarity of two word-frequency vectors.
 *
 * a, b : arrays of non-negative counts, each terminated by -1. Only the
 *        common prefix up to the first terminator in either array is used.
 *
 * Returns dot(a, b) / (|a| * |b|). When either vector has zero length
 * (all counts are 0, or the vector is empty) the quotient is undefined, so
 * 0 is returned instead of NaN.
 */
double calSimilarity(int* a, int* b)
{
    double dot = 0.0;     // numerator: sum of a[i] * b[i]
    double normSqA = 0.0; // sum of a[i]^2
    double normSqB = 0.0; // sum of b[i]^2
    for (int i = 0; a[i] != -1 && b[i] != -1; i++)
    {
        const double x = a[i];
        const double y = b[i];
        dot += x * y;
        normSqA += x * x;
        normSqB += y * y;
    }
    if (normSqA <= 0.0 || normSqB <= 0.0)
        return 0.0; // avoid 0/0
    return dot / (std::sqrt(normSqA) * std::sqrt(normSqB));
}
/*
 * Print the words of a word table to stdout, five per line, each left-aligned
 * in a 20-character column.
 *
 * articleWords : word table, terminated by cnt == -1 or MAXSIZE entries.
 */
void printWordList(wordList* articleWords)
{
    // Remember the stream's formatting flags so the left alignment set here
    // does not leak into later output.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();

    // The index is bounds-checked before the element is read.
    for (int i = 0; i < MAXSIZE && articleWords[i].cnt != -1; i++)
    {
        // setw applies to the next inserted item only, so it must come before the word.
        std::cout << std::left << std::setw(20) << articleWords[i].str;
        if ((i + 1) % 5 == 0) // line break after every fifth word
            std::cout << std::endl;
    }
    std::cout << std::endl;

    std::cout.flags(savedFlags);
}
/*
 * Print a word-frequency vector to stdout, tab-separated, ten values per line.
 *
 * WordsNum : array of counts, terminated by -1 or MAXSIZE entries.
 */
void printWordsNum(int* WordsNum)
{
    // Check the index before reading the element so a full, unterminated
    // array is never read past its end.
    for (int i = 0; i < MAXSIZE && WordsNum[i] != -1; i++)
    {
        std::cout << WordsNum[i] << "\t";
        if ((i + 1) % 10 == 0) // line break after every tenth value
            std::cout << std::endl;
    }
    std::cout << std::endl;
}
/* MAIN */
int main()
{
cout << "\t计算文章相似度\n" << endl;
wordList articleWords1[MAXSIZE]; //单词数量上限1024
wordList articleWords2[MAXSIZE]; //单词数量上限1024
wordList target[MAXSIZE]; //keyWord
int wordsNum1[MAXSIZE]; //article 1 statistics
int wordsNum2[MAXSIZE]; //article 2 statistics
readFile("article1.txt", articleWords1); //查找单词写入结构体
cout << "\n**************** article1 words ***************" << endl;
printWordList(articleWords1); //打印单词
cout << "\n***********************************************" << endl;
readFile("article2.txt", articleWords2); //查找单词写入结构体
cout << "\n**************** article2 words ***************" << endl;
printWordList(articleWords2); //打印单词
cout << "\n***********************************************" << endl;
getKeyWord(target,articleWords1, articleWords2);
cout << "\n**************** target words ***************" << endl;
printWordList(target); //打印单词
cout << "\n***********************************************" << endl;
memset(wordsNum1, -1, MAXSIZE * sizeof(int)); //初始化元素为-1
statistics(target, articleWords1, wordsNum1); //统计wordsNum1
cout << "\n**************** wordsNum1 ***************" << endl;
printWordsNum(wordsNum1); //打印wordsNum1
cout << "\n******************************************" << endl;
memset(wordsNum2, -1, MAXSIZE * sizeof(int)); //初始化元素为-1
statistics(target, articleWords2, wordsNum2); //统计wordsNum2
cout << "\n**************** wordsNum2 ***************" << endl;
printWordsNum(wordsNum2); //打印wordsNum2
cout << "\n******************************************" << endl;
cout << "两文章的相似度为: " << setw(5) << setprecision(2)
<< setiosflags(ios::fixed) << setiosflags(ios::right)
<< calSimilarity(wordsNum1, wordsNum2) * 100
<< "%" << endl;//小数2位
cout << "\nOver..." << endl;
return 0;
}
JAVA 代码来啰
/*
*
* Time : 2022/04/02
* Author : YU.J.P
* Project : Duplicate checking by using cosine formula.
* version : V1.0.0
*
*/
/* Import... */
import java.io.File; //File类
import java.util.ArrayList; //ArrayList类
import java.util.List; //List类
import java.util.Scanner; //Scanner类
public class Experment_0402 {

    /** Program entry point: runs the similarity check on the two sample articles. */
    public static void main(String[] args) {
        judgeArticle();
    }

    /**
     * Reads txtData/article1.txt and txtData/article2.txt, prints both texts,
     * the merged list of distinct words and each article's word-frequency
     * vector, then prints the cosine similarity of the two vectors as a
     * percentage with two decimals.
     */
    public static void judgeArticle() {
        String str1 = readFile("txtData/article1.txt"); // text of article 1
        String str2 = readFile("txtData/article2.txt"); // text of article 2
        System.out.println("article 1 为:" + str1);
        System.out.println();
        System.out.println("article 2 为:" + str2);
        System.out.println();

        // Split both texts into word lists.
        List<String> list1 = changeToList(str1);
        List<String> list2 = changeToList(str2);

        // Distinct words of both articles, in order of first appearance.
        List<String> strSpilt = fixTwoStringArr(list1, list2);
        System.out.println("去重字符数组为:");
        for (String word : strSpilt) {
            System.out.print(word + " ");
        }

        // Word-frequency vectors over the merged vocabulary.
        int[] wordNum1 = getWordFrequency(strSpilt, list1);
        int[] wordNum2 = getWordFrequency(strSpilt, list2);
        System.out.println("\nstr1文本词频为:");
        for (int count : wordNum1) {
            System.out.print(count + " ");
        }
        System.out.println("\nstr2文本词频为:");
        for (int count : wordNum2) {
            System.out.print(count + " ");
        }

        System.out.printf("\n\narticle1与article2的相似度为:%.2f%%\n", calSimilarity(wordNum1, wordNum2) * 100);
    }

    /**
     * Reads a text file into one string made of its whitespace-separated
     * tokens, each followed by a single space.
     *
     * <p>Any exception raised while opening or reading the file is caught and
     * its message printed to stdout; whatever was read up to that point (an
     * empty string if the file could not be opened) is returned.
     *
     * @param fileName path of the file, e.g. "txtData/article1.txt"
     * @return the tokens joined by spaces
     */
    public static String readFile(String fileName) {
        StringBuilder text = new StringBuilder();
        // try-with-resources closes the scanner even when reading fails.
        try (Scanner in = new Scanner(new File(fileName))) {
            while (in.hasNext()) {
                text.append(in.next()).append(' ');
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        return text.toString();
    }

    /**
     * Splits a string on single spaces into a list of words.
     *
     * <p>Empty tokens (from an empty input, a leading space or consecutive
     * spaces) are skipped so they are not counted as words.
     *
     * @param str the text to split
     * @return the non-empty tokens in their original order
     */
    public static List<String> changeToList(String str) {
        List<String> list = new ArrayList<>();
        for (String token : str.split(" ")) {
            if (!token.isEmpty()) {
                list.add(token);
            }
        }
        return list;
    }

    /**
     * Merges two word lists and removes duplicates.
     *
     * @param list1 words of the first article
     * @param list2 words of the second article
     * @return the distinct words of list1 followed by the words that occur
     *         only in list2, each in order of first appearance
     */
    public static List<String> fixTwoStringArr(List<String> list1, List<String> list2) {
        // LinkedHashSet removes duplicates while keeping insertion order.
        java.util.Set<String> unique = new java.util.LinkedHashSet<>(list1);
        unique.addAll(list2);
        return new ArrayList<>(unique);
    }

    /**
     * Counts how often each word of the vocabulary occurs in a word list.
     *
     * @param target the vocabulary (distinct words)
     * @param list   the words of one article
     * @return an array parallel to target; element j is the number of
     *         occurrences of target.get(j) in list
     */
    public static int[] getWordFrequency(List<String> target, List<String> list) {
        int[] wordFrequency = new int[target.size()];
        for (String word : list) {
            int index = target.indexOf(word); // first matching vocabulary slot
            if (index >= 0) {
                wordFrequency[index]++;
            }
        }
        return wordFrequency;
    }

    /**
     * Cosine similarity of two n-dimensional word-frequency vectors.
     *
     * <p>The loop runs over x.length elements, so y must be at least as long
     * as x. When either vector has zero length (all counts are 0, or the
     * array is empty) the quotient is undefined and 0 is returned instead of
     * NaN.
     *
     * @param x word frequencies of the first article
     * @param y word frequencies of the second article
     * @return dot(x, y) / (|x| * |y|), or 0 if either norm is 0
     */
    public static double calSimilarity(int[] x, int[] y) {
        double dot = 0;     // numerator: sum of x[i] * y[i]
        double normSqX = 0; // sum of x[i]^2
        double normSqY = 0; // sum of y[i]^2
        for (int i = 0; i < x.length; i++) {
            // Multiply as double so large counts cannot overflow int.
            dot += (double) x[i] * y[i];
            normSqX += (double) x[i] * x[i];
            normSqY += (double) y[i] * y[i];
        }
        if (normSqX == 0 || normSqY == 0) {
            return 0; // avoid 0/0
        }
        return dot / (Math.sqrt(normSqX) * Math.sqrt(normSqY));
    }
}