C++/JAVA 计算两篇文章的相似度

实验介绍及思路

问题描述:

编写程序,计算任意两篇文章的相似度。

基本思路:

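先把两篇文章分别统计成词频向量 A、B,再利用余弦相似度 $\cos\theta = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$ 来衡量它们的相似度:结果越接近 1,两篇文章越相似。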

完整代码

C++ 代码来啰

/*
*
* Author	:		YU.J.P
* Time		:		2022/04/03
* Project	:		Experiment One -- calculate article similarity.
*
*/

//计算两篇文章的相似度

#include "bits/stdc++.h"
//#include <iostream>
//#include <cstring>

#define MAXSIZE 1024	//预设最大容量
using namespace std;

/* 单词顺序表 */
typedef struct word 
{
	char str[32];		//单词最大长度设为32
	int cnt = -1;		//单词出现次数,-1表示没有元素
}wordList;

/* 读取文件中的单词 */
void readFile(string fileName, wordList* words)
{
	char temp[32];						//读取单词变量
	int netWords = 0, total = 0;		//去重单词个数,单词出现次数
	int newWordflag = 1;				//newWord标志位,判断是否为新单词(查重) 

	ifstream ifs;						//1.创建流对象;

	ifs.open(fileName , ios::in);		
	if (!ifs.is_open())					//2.打开文件 ifs.open("文件路径",打开方式);
	{
		cout << "\n文件打开失败!" << endl;
	}
	cout << "\n文件打开成功!" << endl;

	char character[] = ",.:\"!";		//英文标点符号
	while (ifs >> temp)					//3.读数据 4种读取方式
	{
		for (int j = 0; temp[j] != '\0'; j++)	//清除符号
		{										//判断是否有符号
			if (strchr(character, temp[j]) != NULL)
				temp[j] = '\0';					// 如果文本符号没有分隔开,会出错
		}

		newWordflag = 1;
		for (int i = 0; i < netWords; i++)
		{
			if (strcmp(temp, words[i].str) == 0)//重复的单词
			{
				newWordflag = 0;
				words[i].cnt++;
			}
		}
		if (newWordflag)
		{
			strcpy_s(words[netWords].str, temp);
			words[netWords++].cnt = 1;			//新单词数量+1 
		}
	}

	for (int i = netWords - 1; i >= 0; i--)		//单词出现次数
	{
		total += words[i].cnt;
	}

	ifs.close();						//4.关闭文件 ifs.close();
												//打印n和s
	cout << "netWords = " << netWords << "\ttotal = " << total << endl;

}

/* 获得合并keyWord数组target */
void getKeyWord(wordList* target,wordList* article1, wordList* article2)
{
	//复制一个数组
	int i;//新单词递增位
	for (i = 0; article1[i].cnt != -1 && i < MAXSIZE; i++)
	{
		strcpy_s(target[i].str,article1[i].str);	//复制单词
		target[i].cnt = 0;							//清零
	}
	//添加另一个数组不同的单词
	for (int j = 0; article2[j].cnt != -1 && j < MAXSIZE; j++)
	{
		int newWordFlag = 1;						//新单词标志位
		for (int v = 0; article2[v].cnt != -1 && v < MAXSIZE; v++)
		{
			if (strcmp(article2[j].str, target[v].str) == 0)
			{										//重复的单词
				newWordFlag = 0;
			}
		}

		if (newWordFlag == 1)
		{
			strcpy_s(target[i].str, article2[j].str);	//复制单词
			target[i++].cnt = 0;						//清零 i++递增
		}
	}
}

/* 统计词频 */
void statistics(wordList* target, wordList* article, int* wordsNum)
{
	for (int i = 0; target[i].cnt != -1 && i < MAXSIZE; i++)
	{								//外循环遍历target
		for (int j = 0; article[j].cnt != -1 && j < MAXSIZE; j++)
		{							//内循环遍历article
			if (strcmp(target[i].str,article[j].str) == 0)//相同
			{						//包含此单词,复制频数
				wordsNum[i] = article[j].cnt;
				break;
			}

		}
		if(wordsNum[i] == -1)		//没有此单词标记为0
			wordsNum[i] = 0;
	}
}


/* 计算相似度 */
double calSimilarity(int* a, int* b)
{
	double squareSum = 0;				//平方和		分子
	double sqrtSumA = 0, sqrtSumB = 0;	//平方和开方	分母

	for (int i = 0; a[i] != -1; i++)
	{
		squareSum += (double)a[i] * (double)b[i];
		sqrtSumA += pow(a[i],2);
		sqrtSumB += pow(b[i], 2);
	}
	sqrtSumA = sqrt(sqrtSumA);
	sqrtSumB = sqrt(sqrtSumB);

	double Cos = squareSum / (sqrtSumA * sqrtSumB);

	return Cos;
}

/* 打印单词结构体中的单词 */
void printWordList(wordList* articleWords)
{
	for (int i = 0; articleWords[i].cnt != -1 && i < MAXSIZE; i++)
	{
		//预设打印宽度20,左对齐
		cout << articleWords[i].str << setw(20) << setiosflags(ios::left);
		if ((i + 1) % 5 == 0)		//打印换行
			cout << endl;
	}
	cout << endl;
}

/* 打印统计单词频数 */
void printWordsNum(int* WordsNum)
{
	for (int i = 0; WordsNum[i] != -1 && i < MAXSIZE; i++)
	{
		cout << WordsNum[i] << "\t";
		if ((i + 1) % 10 == 0)		//打印换行
			cout << endl;
	}
	cout << endl;
}


/* MAIN */
int main()
{
	cout << "\t计算文章相似度\n" << endl;

	wordList articleWords1[MAXSIZE];				//单词数量上限1024
	wordList articleWords2[MAXSIZE];				//单词数量上限1024
	wordList target[MAXSIZE];						//keyWord
	int wordsNum1[MAXSIZE];							//article 1 statistics
	int wordsNum2[MAXSIZE];							//article 2 statistics


	readFile("article1.txt", articleWords1);  //查找单词写入结构体
	cout << "\n**************** article1 words ***************" << endl;
	printWordList(articleWords1);					//打印单词
	cout << "\n***********************************************" << endl;

	readFile("article2.txt", articleWords2);  //查找单词写入结构体
	cout << "\n**************** article2 words ***************" << endl;
	printWordList(articleWords2);					//打印单词
	cout << "\n***********************************************" << endl;

	getKeyWord(target,articleWords1, articleWords2);
	cout << "\n**************** target   words ***************" << endl;
	printWordList(target);							//打印单词
	cout << "\n***********************************************" << endl;

	memset(wordsNum1, -1, MAXSIZE * sizeof(int));					//初始化元素为-1
	statistics(target, articleWords1, wordsNum1);	//统计wordsNum1
	cout << "\n**************** wordsNum1 ***************" << endl;
	printWordsNum(wordsNum1);						//打印wordsNum1
	cout << "\n******************************************" << endl;

	memset(wordsNum2, -1, MAXSIZE * sizeof(int));					//初始化元素为-1
	statistics(target, articleWords2, wordsNum2);	//统计wordsNum2
	cout << "\n**************** wordsNum2 ***************" << endl;
	printWordsNum(wordsNum2);						//打印wordsNum2
	cout << "\n******************************************" << endl;

	cout << "两文章的相似度为: " << setw(5) << setprecision(2) 
		 << setiosflags(ios::fixed) << setiosflags(ios::right) 
		 << calSimilarity(wordsNum1, wordsNum2) * 100 
		 << "%" << endl;//小数2位

	cout << "\nOver..." << endl;
	return 0;
}

JAVA 代码来啰

/*
*
* Time      :   2022/04/02
* Author    :   YU.J.P
* Project   :   Duplicate checking by using cosine formula.
* version   :   V1.0.0
*
*/

/* Import... */
import java.io.File;            //File类
import java.util.ArrayList;     //ArrayList类
import java.util.List;          //List类
import java.util.Scanner;       //Scanner类



public class Experment_0402 {
    /** MAIN */
    public static void main(String[] args) {
        judgeArticle();
    }

    /**
     * discription: 判断两篇文章的相似度。采用余弦相似度来计算两文章的相似度。
     * */
    public static void judgeArticle(){
        String str1 = readFile("txtData/article1.txt");//读入文章1
        String str2 = readFile("txtData/article2.txt");//读入文章2
        System.out.println("article 1 为:" + str1);
        System.out.println();
        System.out.println("article 2 为:" + str2);
        System.out.println();

        //获得str的字符数组
        List<String> list1 = changeToList(str1);
        List<String> list2 = changeToList(str2);

        //获得去重list
        List<String> strSpilt = fixTwoStringArr(list1, list2);
        System.out.println("去重字符数组为:");
        for (int i = 0; i < strSpilt.size(); i++) {
            System.out.print(strSpilt.get(i) + " ");
        }

        //获得词频
        int[] wordNum1 = getWordFrequency(strSpilt, list1);
        int[] wordNum2 = getWordFrequency(strSpilt, list2);
        System.out.println("\nstr1文本词频为:");
        for (int i = 0; i < wordNum1.length; i++) {
            System.out.print(wordNum1[i] + " ");
        }
        System.out.println("\nstr2文本词频为:");
        for (int i = 0; i < wordNum2.length; i++) {
            System.out.print(wordNum2[i] + " ");
        }

        //获得相似度
        System.out.printf("\n\narticle1与article2的相似度为:%.2f%%\n" , calSimilarity(wordNum1, wordNum2)*100);
    }

    /**
     * discription: 读取文件中的文章到字符串
     * @param   fileName  A pathname string
     * @throws
     * */
    public static String readFile(String fileName) {//给地址 如"txtData/article1.txt"
        String str = "", temp = "";//字符串str 临时单词变量temp
        try {
            File file = new File(fileName);//文件对象
            Scanner in = new Scanner(file);//文件输入流
            while (in.hasNext()) {
                temp = in.next();
//                System.out.print(temp + " ");
//                char a = temp.charAt(temp.length() - 1);
//                if (a == '.' || a == '?' || a == '!')//去符号
//                    System.out.println();
                str += temp + " ";//将每次的单词加入str,末尾加空格
            }
            in.close();//关闭输入流
        } catch (Exception e) {//抛出错误
            System.out.println(e.getMessage());
        }
        return str;
    }

    /**
     * discription: 将字符串转化成list
     * @param   str  需要转化为list的字符串
     * @throws
     * */
    public static List<String> changeToList(String str) {
        String[] string = str.split(" ");//分词
        List<String> list = new ArrayList<>();
        for (int i = 0; i < string.length; i++) {
            list.add(string[i]);
        }
        return list;
    }

    /**
     * discription: 合并两个字符串list,去掉重复单词
     * @param   list1  字符串List<String> list1
     * @param   list2  字符串List<String> list2
     * @throws
     * */
    public static List<String> fixTwoStringArr(List<String> list1, List<String> list2) {
        List<String> list = new ArrayList<>();
        //处理list1
        for (int i = 0; i < list1.size(); i++) {
            if (!list.contains(list1.get(i))) {
                list.add(list1.get(i));
            }
        }
        //处理list2
        for (int i = 0; i < list2.size(); i++) {
            if (!list.contains(list2.get(i))) {
                list.add(list2.get(i));
            }
        }
        return list;
    }

    /**
     * description: 获得文本词频
     * @param   target  去重后的对比字符串List<String> target
     * @param   list  传入比较字符串List<String> list
     * @throws
     * */
    public static int[] getWordFrequency (List<String> target, List<String> list) {
        int[] wordFrequency = new int[target.size()];//创建数组
        for (int i = 0; i < list.size(); i++) {//外循环遍历list
            for (int j = 0; j < target.size(); j++) {//内循环遍历target,如果对应位置单词出现则+1
                if (target.get(j).equals(list.get(i))) {
                    wordFrequency[j]++;
                    break;
                }
            }
        }
        return wordFrequency;
    }

    /**
     * discription: 根据n维向量余弦公式得到相似度
     * @param   x  第一个文章的文本词频
     * @param   y  第二个文章的文本词频
     * @throws
     * */
    public static double calSimilarity(int[] x, int[] y) {
        double squareSum = 0;                 //平方和	 分子
        double sqrtSumA = 0, sqrtSumB = 0;    //平方和开方	 分母

        for (int i = 0; i < x.length; i++) {
            squareSum += x[i] * y[i];
            sqrtSumA += Math.pow(x[i], 2);
            sqrtSumB += Math.pow(y[i], 2);
        }
        return squareSum / (Math.sqrt(sqrtSumA) * Math.sqrt(sqrtSumB));
    }

}