// Header file (Preprocess.h):
#ifndef _Preprocess_H
#define _Preprocess_H
#include<iostream>
#include<map>
#include<set>
#include<vector>
#include<string>
#include<iomanip>
#include<fstream>
#include<algorithm>
#include<cmath>
#include<sstream>
#include<limits>
#include <xstring>
#include"ictclas30.h"
#include"boost\tr1\regex.hpp"
#include"boost/algorithm/string.hpp"
#include"windows.h"
/************************************************************************/
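/* The Preprocess class implements the following pipeline:
   segment the text collection -> remove stop words -> build the bag-of-words model
   -> select feature words -> build a VSM (vector space model) for every article
   -> write the data in Weka format (ARFF) -> output the clustering information */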
/************************************************************************/
//一些谓词函数
using namespace std;
/**
 * Text-clustering preprocessing pipeline.
 *
 * Builds an inverted index (word -> list of (article id, term frequency)),
 * selects feature words by document frequency, writes the documents as sparse
 * ARFF vectors for Weka, and afterwards maps Weka's cluster centres back to
 * the articles.
 *
 * NOTE(review): the class owns raw char buffers and has a user-defined
 * destructor but compiler-generated copy operations, so copying an instance
 * would free every buffer twice. Pass it by reference or pointer only.
 */
class Preprocess
{
private:
    char *bagofwordsAddress;   // path of the bag-of-words (inverted index) file
    char *featurewordsAddress; // path of the feature-word file
    char *arffFileAddress;     // path of the ARFF file
    char *infoFromWekaAddress; // path of the file holding the Weka result
    char *articleIdsAddress;   // path of the file listing the clustered article ids
    char *dbconnection;        // database connection string
    char *dbselect;            // database SELECT statement
    char *dbfield;             // database field; not owned and not set by the constructor (kept NULL)
    int beginIndex;            // id of the first article to cluster
    int endIndex;              // id of the last article to cluster
public:
    /** Signature shared by the segmentation strategies (goodWordsinPieceArticle, mySplit). */
    typedef vector<string>(Preprocess::*FUNCSEG)(string,set<string>);

    /**
     * Copies every path / connection argument into its own heap buffer of
     * c_style_stringsize bytes.
     *
     * The arguments are copied through an explicit "%s" format: they are data
     * (paths, SQL), so a '%' inside them must not be interpreted as a
     * conversion specification.
     *
     * @param c_style_stringsize size in bytes of each internal buffer; every
     *                           argument must fit including its terminator
     * @param mydict        bag-of-words file path
     * @param keywordsinfo  feature-word file path
     * @param tobeCluster   ARFF file path
     * @param InfoFromWeka  Weka result file path
     * @param artileIds     article-id file path
     * @param conn          database connection string
     * @param selectsql     SELECT statement used to fetch the articles
     * @param beginIndex    first article id
     * @param endIndex      last article id
     */
    Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex)
    {
        bagofwordsAddress=new char[c_style_stringsize];
        featurewordsAddress=new char[c_style_stringsize];
        arffFileAddress=new char[c_style_stringsize];
        infoFromWekaAddress=new char[c_style_stringsize];
        articleIdsAddress=new char[c_style_stringsize];
        dbconnection=new char[c_style_stringsize];
        dbselect=new char[c_style_stringsize];
        dbfield=NULL; // never allocated here, so it is not released in the destructor either
        this->beginIndex=beginIndex;
        this->endIndex=endIndex;
        sprintf_s(bagofwordsAddress,c_style_stringsize,"%s",mydict);
        sprintf_s(featurewordsAddress,c_style_stringsize,"%s",keywordsinfo);
        sprintf_s(arffFileAddress,c_style_stringsize,"%s",tobeCluster);
        sprintf_s(infoFromWekaAddress,c_style_stringsize,"%s",InfoFromWeka);
        sprintf_s(articleIdsAddress,c_style_stringsize,"%s",artileIds);
        sprintf_s(dbconnection,c_style_stringsize,"%s",conn);
        sprintf_s(dbselect,c_style_stringsize,"%s",selectsql);
    }

    /** Releases the buffers allocated by the constructor. */
    ~Preprocess()
    {
        delete []bagofwordsAddress;
        delete []featurewordsAddress;
        delete []arffFileAddress;
        delete [] infoFromWekaAddress;
        delete []articleIdsAddress;
        delete []dbconnection;
        delete []dbselect;
    }

    /** Strips the characters of val from both ends of str. */
    void trim(string &str,const string val);
    /** Builds the inverted index: key = word, value = list of (article id, term frequency). */
    int ConstructMap(map<string,vector<pair<int,int> > >&mymap,char *dbfield,FUNCSEG seg);
    /** Empties the ARFF file. */
    inline void TruncateArff()
    {
        ofstream ofile;
        ofile.open(arffFileAddress,ios::trunc);
        ofile.close();
    }
    /** Saves the bag-of-words model to disk. */
    void save(map<string,vector<pair<int,int> > >&mymap);
    /** Loads the bag-of-words model from disk. */
    void load(map<string,vector<pair<int,int> > >&mymap);
    /** Prints the bag-of-words model. */
    void print(map<string,vector<pair<int,int> > >&mymap);
    /** Converts a narrow (multi-byte) string to a wide string. */
    wstring myMultibyteToWideChar(string sResult);
    /** Converts a wide string to a narrow (multi-byte) string. */
    string myWideCharToMultibyte(wstring wsResult);
    /** Segments text with ICTCLAS. */
    string ICTsplit(const char *sInput);
    /** Builds the stop-word set. */
    set<string>MakeStopSet();
    /** Segments one article and removes stop words / noise words. */
    vector<string>goodWordsinPieceArticle(string rawtext,set<string> stopwords);
    /** Converts an integer to a string. */
    string do_fraction(int val);
    /** Converts a floating-point number to a string. */
    string do_fraction(double val, int decplaces=5);
    /** Feature-word selection by document frequency. */
    void DFcharicteristicWordSelection(map<string,vector<pair<int,int> > > &mymap,int DFthreshold);
    /** Returns the finally selected feature words. */
    vector<string> GetFinalKeyWords();
    /** Returns (max TF, DF) for every feature word. */
    vector<pair<int,int> >GetfinalKeysMaxTFDF(map<string,vector<pair<int,int> > > &mymap);
    /** Normalises a document vector. */
    vector<pair<int,double> > NormalizationVSM(vector<pair<int,double> > tempVSM);
    /** Builds the document vectors and writes them to the ARFF file. */
    void VSMFormation(map<string,vector<pair<int,int> > > &mymap);
    /** Formats a single document vector as a string. */
    string FormatVSMtoString(vector<pair<int,double> > tempVSM);
    /** Writes the ARFF header. */
    void WriteHeadArff();
    /** Runs the whole "produce the ARFF file" pipeline. */
    void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg);

    /* ---- The functions below implement the clustering post-processing ---- */

    /** Builds the document vectors without turning them into strings. */
    map<int,vector<double> >VSMConstruction(map<string,vector<pair<int,int> > > &mymap);
    /** Reads the cluster centres from the result produced by Weka. */
    map<string,vector<double> > GetClusters();
    /** Dot product of two vectors. */
    double CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2);
    /** Cosine similarity of two vectors. */
    double CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2);
    /** Assigns a cluster label to every article. */
    vector<pair<int,string> >GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters);
    /** Returns the article ids belonging to each cluster. */
    map<string,vector<int> >FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string> >&resultInfo);
    /** Prints title and category of the articles of every cluster. */
    void RetreiveArticleInfoFromDataBase();
    /** Splits a keyword string on blanks. */
    vector<string> mySplit(string s,set<string> stopwords);
};
#endif
// Implementation file for the member functions of the Preprocess class:
#include"stdafx.h"
#include "Preprocess.h"
#pragma comment(lib, "ICTCLAS30.lib")
using namespace std;
/************************************************************************/
/* 去掉字符串首尾空白 */
/************************************************************************/
// Ordering predicate for descending sorts: true when the count stored in
// pair1 is strictly larger than the count stored in pair2.
bool isLonger(const pair<string,int> &pair1, const pair<string,int> &pair2)
{
    const int lhsCount=pair1.second;
    const int rhsCount=pair2.second;
    return rhsCount<lhsCount;
}
// True when the count stored in the pair does not exceed 100.
bool cntAssist(const pair<string,int> &pair1)
{
    const int upperBound=100;
    return !(pair1.second>upperBound);
}
// True when the first component of the pair equals articleId.
bool PredTF(const pair<int,int>& pair1,int articleId)
{
    return articleId==pair1.first;
}
// Unary predicate object: remembers an id and reports whether the first
// component of a pair equals that id.
class PredTFclass
{
public:
    PredTFclass(int id):m(id){}
    bool operator()(const pair<int,int>& pair1)
    {
        return pair1.first==m;
    }
private:
    const int m; // id every pair is compared against
};
// Ordering predicate for descending sorts on the value stored in pair.second.
// The comparison is strict: std::sort requires a strict weak ordering, i.e.
// comparing an element with an equal one must yield false.
bool myCmp(const pair<string,double>&pair1,const pair<string,double>&pair2 )
{
    return pair1.second>pair2.second;
}
// Removes, in place, every leading and trailing character of str that occurs
// in val (val is used as a set of characters, as find_first_not_of and
// find_last_not_of do). A string made only of such characters becomes empty.
void Preprocess:: trim(string &str,const string val)
{
    const string::size_type first=str.find_first_not_of(val);
    if(first==string::npos)
    {
        // Nothing but trimmable characters (or already empty).
        str.clear();
        return;
    }
    // Cut the tail first so that `first` stays a valid index. The cut position
    // is one past the last kept character, independent of how many characters
    // val contains.
    str.erase(str.find_last_not_of(val)+1);
    str.erase(0,first);
}
/************************************************************************/
/* 建立词袋子模型 */
/************************************************************************/
// Builds the inverted index from the database.
//
// Runs the stored SELECT statement over the stored connection string and, for
// every row whose `dbfield` text is non-empty, segments the text with `seg`
// and counts each produced word under the row's "ArticleId".
//
// mymap   : receives word -> list of (article id, occurrences in that article)
// dbfield : name of the column holding the text to segment
// seg     : segmentation member function to apply to each text
// returns : always 0
//
// NOTE(review): COM/ADO failures are not handled here; presumably they surface
// as _com_error exceptions - confirm against the ADO import settings.
int Preprocess::ConstructMap(map<string,vector<pair<int,int> > >&mymap,char *dbfield,FUNCSEG seg)
{
    CoInitialize(NULL);
    _ConnectionPtr pConn(__uuidof(Connection));
    _RecordsetPtr pRst(__uuidof(Recordset));
    pConn->ConnectionString=dbconnection;
    pConn->Open("","","",adConnectUnspecified);
    pRst=pConn->Execute(dbselect,NULL,adCmdText);
    const set<string> stopwords=MakeStopSet();
    while(!pRst->rsEOF)
    {
        string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
        if(!rawtext.empty())
        {
            const vector<string> wordcollection=(this->*seg)(rawtext,stopwords);
            string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
            const int articleid=atoi(tempid.c_str());
            for(vector<string>::const_iterator strit=wordcollection.begin();strit!=wordcollection.end();++strit)
            {
                // One map lookup per word; the reference stays valid while we update it.
                vector<pair<int,int> > &postings=mymap[*strit];
                vector<pair<int,int> >::iterator it=postings.begin();
                while(it!=postings.end()&&it->first!=articleid)
                {
                    ++it;
                }
                if(it==postings.end())
                {
                    // First occurrence of this word in this article.
                    postings.push_back(make_pair(articleid,1));
                }
                else
                {
                    ++(it->second);
                }
            }
        }
        pRst->MoveNext();
    }
    pRst->Close();
    pConn->Close();
    // Drop the COM references before COM itself is shut down.
    pRst.Release();
    pConn.Release();
    CoUninitialize();
    return 0;
}
/************************************************************************/
/* 保存词袋子模型到硬盘 */
/************************************************************************/
// Writes the inverted index to the bag-of-words file as text:
//   line 1            : number of words
//   then, per word    : the word, the number of postings, and one line with
//                       every posting written as "<article id> <count> ; "
void Preprocess::save(map<string,vector<pair<int,int> > >&mymap)
{
    typedef vector<pair<int,int> > Postings;
    ofstream outfile(bagofwordsAddress,ios::binary);
    outfile<<mymap.size()<<endl;
    map<string,Postings>::iterator entry=mymap.begin();
    while(entry!=mymap.end())
    {
        const Postings &postings=entry->second;
        outfile<<entry->first<<endl;
        outfile<<postings.size()<<endl;
        for(Postings::size_type k=0;k<postings.size();++k)
        {
            outfile<<postings[k].first<<" "<<postings[k].second<<" "<<";"<<" ";
        }
        outfile<<endl;
        ++entry;
    }
    outfile.close();
}
/************************************************************************/
/* 加载词典信息到内存 */
/************************************************************************/
// Reads the inverted index written by save() back into mymap.
//
// The global locale is switched to ".936" while the file is read and restored
// afterwards. Reading stops at the first record that cannot be read
// completely, so a missing, truncated or malformed file yields the entries
// read so far instead of looping forever or storing stale values.
void Preprocess::load(map<string,vector<pair<int,int> > >&mymap)
{
    const std::locale previous=std::locale::global(std::locale(".936"));
    {
        // The stream lives in its own scope so it is created and destroyed
        // while the ".936" locale is the global one.
        ifstream infile(bagofwordsAddress,ios::binary);
        int lenMyMap=0;   // word count written by save(); read to skip it, not needed otherwise
        infile>>lenMyMap;
        string key;       // current word
        int lenVector=0;  // number of postings of the current word
        while(infile>>key>>lenVector)
        {
            vector<pair<int,int> > postings;
            bool complete=true;
            for(int i=0;i<lenVector;++i)
            {
                int articleId=0; // article id
                int count=0;     // occurrences in that article
                string semicolon;// the ";" separator written after each posting
                if(!(infile>>articleId>>count>>semicolon))
                {
                    complete=false;
                    break;
                }
                postings.push_back(make_pair(articleId,count));
            }
            if(!complete)
            {
                break; // drop the partial record
            }
            mymap[key]=postings;
        }
    }
    std::locale::global(previous);
}
/************************************************************************/
/* 打印词典信息 */
/************************************************************************/
// Dumps the inverted index to standard output: the number of words, then for
// every word its text, its number of postings and one line listing every
// posting as "<article id>,<count>;".
//
// NOTE(review): the Preprocess class declares a member function `print` with
// this parameter list, but this definition is a free function, so the member
// has no definition in this file - confirm which of the two is intended.
void print(map<string,vector<pair<int,int> > >&mymap)
{
    typedef vector<pair<int,int> > Postings;
    cout<<mymap.size()<<endl;
    map<string,Postings>::iterator entry=mymap.begin();
    while(entry!=mymap.end())
    {
        const Postings &postings=entry->second;
        cout<<entry->first<<endl;
        cout<<postings.size()<<endl;
        for(Postings::size_type k=0;k<postings.size();++k)
        {
            cout<<postings[k].first<<','<<postings[k].second<<";";
        }
        cout<<endl;
        ++entry;
    }
}
/************************************************************************/
/* 构造停用词表 */
/************************************************************************/
// Reads the whitespace-separated stop words from "stopwords.txt" (relative to
// the current working directory) into a set.
//
// The loop is driven by the success of the extraction itself, so a missing or
// unreadable file gives an empty set instead of an endless loop, and no empty
// word is inserted when the end of the file is reached.
set<string> Preprocess::MakeStopSet()
{
    set<string> stopwordsSet;
    ifstream ifile("stopwords.txt");
    string word;
    while(ifile>>word) // operator>> already skips surrounding whitespace
    {
        stopwordsSet.insert(word);
    }
    return stopwordsSet;
}
/************************************************************************/
/* 将整数转化成字符串 */
/************************************************************************/
// Returns the decimal text of an integer.
// The stream's string contains no embedded NUL characters, so it is returned
// directly; no swap with a temporary (which standard C++ rejects) is needed.
string Preprocess::do_fraction(int val)
{
    ostringstream out;
    out<<val;
    return out.str();
}
/************************************************************************/
/* 将浮点数转化成指定精度的字符串 */
/************************************************************************/
// Returns the text of a floating-point number, cut after the decimal point.
//
// The number is formatted with the stream's default settings. If the text
// contains a '.' at index n and is longer than n+decplaces characters, it is
// cut at index n+decplaces, which leaves decplaces-1 digits after the point.
// Text without a '.' is returned unchanged.
//
// The cut is done with erase(); the result is the same as writing a NUL at
// that index and re-reading the C string, without binding a temporary to a
// non-const reference (which standard C++ rejects).
string Preprocess::do_fraction(double val,int decplaces)
{
    ostringstream out;
    out<<val;
    string str=out.str();
    const string::size_type n=str.find('.');
    if((n!=string::npos)&&n+decplaces<str.size())
    {
        str.erase(n+decplaces);
    }
    return str;
}
/************************************************************************/
/* 窄字符串砖宽字符串 */
/************************************************************************/
// Converts a narrow string in the system ANSI code page (CP_ACP) to a wide
// string. Returns an empty string when the input is empty or the conversion
// fails. The result ends at the first NUL character of the converted text.
wstring Preprocess::myMultibyteToWideChar(string sResult)
{
    const int srcLen=static_cast<int>(sResult.size());
    // First call: ask for the required number of wide characters (no terminator).
    const int iWLen=MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), srcLen, 0, 0 );
    if(iWLen<=0)
    {
        return wstring();
    }
    // The vector owns the buffer, so it is released even if building the
    // result throws; the extra element is the terminator.
    vector<wchar_t> buffer(iWLen+1,L'\0');
    MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), srcLen, &buffer[0], iWLen );
    return wstring(&buffer[0]);
}
/************************************************************************/
/* 宽字符串转窄字符串 */
/************************************************************************/
// Converts a wide string to a narrow string in the system ANSI code page
// (CP_ACP), the same code page myMultibyteToWideChar reads. Returns an empty
// string when the conversion fails.
//
// Both calls use the same code page: the first one measures the buffer, so
// converting with a different code page could need a different size.
string Preprocess::myWideCharToMultibyte(wstring wsResult)
{
    // Length -1 means "NUL-terminated input"; the returned size includes the terminator.
    const int iLen= WideCharToMultiByte( CP_ACP, 0, wsResult.c_str(), -1, NULL, 0, NULL, NULL );
    if(iLen<=0)
    {
        return string();
    }
    vector<char> buffer(iLen,'\0'); // owns the memory, released on every path
    const int written=WideCharToMultiByte( CP_ACP, 0, wsResult.c_str(), -1, &buffer[0], iLen, NULL, NULL );
    if(written<=0)
    {
        return string();
    }
    return string(&buffer[0],written-1); // drop the terminator
}
/************************************************************************/
/* 调用ICTclas进行中文分词 */
/************************************************************************/
string Preprocess::ICTsplit(const char *sInput)
{
if(!ICTCLAS_Init())
{
printf("ICTCLAS INIT FAILED!\n");
string strerr(sInput);
return strerr;
}
ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
//导入用户词典后
/*printf("\n导入用户词典后:\n");
int nCount = ICTCLAS_ImportUserDict("userdic.txt");//覆盖以前的用户词典
//保存用户词典
ICTCLAS_SaveTheUsrDic();
printf("导入%d个用户词。\n", nCount);*/
const char* sResult = ICTCLAS_ParagraphProcess(sInput, 0);
string strresult(sResult);
//printf("%s\n", sResult);
//把字符串转化成宽字符串
wstring wsResult=myMultibyteToWideChar(strresult);
boost::wregex wreg(L"\\s+");
wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|"));
strresult=myWideCharToMultibyte(wsResult);
//ofile<<str1;
//ofile.close();
//cout<<str1<<endl;
//ICTCLAS_FileProcess("text.txt","test_result.txt",1);
ICTCLAS_Exit();
return strresult;
}
/************************************************************************/
/* 对每一篇文章去掉噪声词,剩下好词 */
/************************************************************************/
vector<string>Preprocess::goodWordsinPieceArticle(string rawtext,set<string> stopwords)
{
vector<wstring> goodWordstemp;
vector<string> goodWords;
const char* sInput=rawtext.c_str();
string sResult=ICTsplit(sInput);
wstring wsResult=myMultibyteToWideChar(sResult);
boost::wregex wreg(L"\\d+");//去掉中文空格
wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));
//boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
boost::split(goodWordstemp,wsResult,boost::is_any_of("|"));
for(vector<wstring>::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)
{
string temp=myWideCharToMultibyte(*it);
trim(temp," ");
if(!stopwords.count(temp)&&!temp.empty())
{
goodWords.push_back(temp);
}
}
return goodWords;
}
/************************************************************************/
/* DF特征词选择法 */
/************************************************************************/
// Feature selection by document frequency (DF).
//
// The DF of a word is the number of postings it has in mymap. Words are
// ordered by descending DF (stable, so equal DFs keep the map order) and every
// word whose DF is at least DFthreshold is written, one per line, to the
// feature-word file. The number of selected words and the threshold are
// reported on standard output.
void Preprocess::DFcharicteristicWordSelection(map<string,vector<pair<int,int> > > &mymap,int DFthreshold)
{
    typedef vector<pair<string,int> > DfList;
    DfList wordsByDf;
    map<string,vector<pair<int,int> > >::iterator entry=mymap.begin();
    while(entry!=mymap.end())
    {
        wordsByDf.push_back(pair<string,int>(entry->first,static_cast<int>(entry->second.size())));
        ++entry;
    }
    stable_sort(wordsByDf.begin(),wordsByDf.end(),isLonger);
    ofstream outfile(featurewordsAddress);
    int selected=0; // number of feature words written
    for(DfList::size_type k=0;k<wordsByDf.size();++k)
    {
        if(wordsByDf[k].second<DFthreshold)
        {
            continue;
        }
        outfile<<wordsByDf[k].first<<endl;
        ++selected;
    }
    outfile.close();
    cout<<"最后共选择特征词"<<selected<<endl;
    cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl;
}
/************************************************************************/
/* 获得最终选定的构造文档向量模型的特征词 */
/************************************************************************/
// Reads the selected feature words (whitespace separated) from the
// feature-word file, in file order.
//
// The loop is driven by the success of the extraction, so a missing or
// unreadable file gives an empty list instead of an endless loop.
vector<string>Preprocess::GetFinalKeyWords()
{
    vector<string>myKeys;
    ifstream infile(featurewordsAddress);
    string word;
    while(infile>>word) // a successful extraction never yields an empty word
    {
        myKeys.push_back(word);
    }
    return myKeys;
}
/************************************************************************/
/* 获得特征词的maxTF,DF */
/************************************************************************/
// For every feature word (in the order returned by GetFinalKeyWords) returns
// the pair (largest per-article count, number of postings), i.e. (max TF, DF).
//
// Words are looked up with operator[], so a feature word that is missing from
// mymap is inserted with an empty posting list and reported as (0, 0).
vector<pair<int,int> >Preprocess::GetfinalKeysMaxTFDF(map<string,vector<pair<int,int> > > &mymap)
{
    typedef vector<pair<int,int> > Postings;
    const vector<string> keys=GetFinalKeyWords();
    vector<pair<int,int> > stats;
    for(vector<string>::size_type k=0;k<keys.size();++k)
    {
        Postings &postings=mymap[keys[k]];
        int largestTF=0;
        for(Postings::size_type p=0;p<postings.size();++p)
        {
            if(postings[p].second>largestTF)
            {
                largestTF=postings[p].second;
            }
        }
        const int documentFrequency=static_cast<int>(postings.size());
        stats.push_back(make_pair(largestTF,documentFrequency));
    }
    return stats;
}
/************************************************************************/
/* 文档向量模型归一化 */
/************************************************************************/
// Scales the weights (pair.second) of a document vector to unit Euclidean
// length and returns the scaled copy.
//
// A vector whose norm is zero (empty, or all weights zero) is returned
// unchanged: dividing by zero would turn every weight into NaN.
vector<pair<int,double> >Preprocess::NormalizationVSM(vector<pair<int,double> > tempVSM)
{
    double sumOfSquares=0;
    for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
    {
        sumOfSquares+=vsmit->second*vsmit->second;
    }
    const double norm=sqrt(sumOfSquares); // computed once, not per element
    if(norm>0)
    {
        for(vector<pair<int,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
        {
            vsmit->second/=norm;
        }
    }
    return tempVSM;
}
/************************************************************************/
/* 单个文档向量模型字符串化 */
/************************************************************************/
// Formats a document vector as "{<index> <weight>,<index> <weight>,...}".
// Indices are written with do_fraction(int), weights with
// do_fraction(double, 8). An empty vector gives "{}".
string Preprocess::FormatVSMtoString(vector<pair<int,double> > tempVSM)
{
    string ret="{";
    for(vector<pair<int,double> >::size_type k=0;k<tempVSM.size();++k)
    {
        if(k!=0)
        {
            ret+=","; // separator goes between entries only
        }
        ret+=do_fraction(tempVSM[k].first);
        ret+=" ";
        ret+=do_fraction(tempVSM[k].second,8);
    }
    ret+="}";
    return ret;
}
/************************************************************************/
/* 写Arff头文件 */
/************************************************************************/
// Writes the ARFF header to the ARFF file, replacing its previous content:
// the relation line, one "@attribute '<word>' real" line per feature word and
// the "@data" marker.
void Preprocess::WriteHeadArff()
{
    ofstream ofile(arffFileAddress,ios::binary);
    ofile<<"@relation aticle"<<endl;
    ofile<<"\n";
    const vector<string> keys=GetFinalKeyWords();
    for(vector<string>::size_type k=0;k<keys.size();++k)
    {
        ofile<<"@attribute "<<"'"<<keys[k]<<"'"<<" real"<<endl;
    }
    ofile<<"\n"<<endl;
    ofile<<"@data"<<endl;
    ofile.close();
}
/************************************************************************/
/* 将实验数据写成arff @data格式 */
/************************************************************************/
// Writes the @data section of the ARFF file.
//
// For every article id in [beginIndex, endIndex] a sparse vector is built over
// the feature words. A word contributes only when the article actually
// contains it; its weight is
//     (0.5 + 0.5 * tf / maxTF) * log(N / DF)
// where tf is the stored count of the word in the article, maxTF and DF come
// from GetfinalKeysMaxTFDF and N = endIndex - beginIndex + 1. Words with
// maxTF or DF equal to 0 are skipped (they would divide by zero).
// Non-empty vectors are normalised and appended to the ARFF file; the id of
// each written article is written to the article-id file, one per line.
void Preprocess::VSMFormation(map<string,vector<pair<int,int> > > &mymap)
{
    typedef vector<pair<int,int> > Postings;
    const int corpus_N=endIndex-beginIndex+1;
    ofstream ofile1(articleIdsAddress,ios::binary);          // ids of the written articles
    ofstream ofile2(arffFileAddress,ios::binary|ios::app);   // appended after the ARFF header
    const vector<string> myKeys=GetFinalKeyWords();
    const vector<pair<int,int> > maxTFandDF=GetfinalKeysMaxTFDF(mymap);
    // Resolve every feature word once instead of once per article; references
    // to map elements stay valid when other keys are inserted.
    vector<const Postings*> keyPostings;
    for(vector<string>::size_type j=0;j<myKeys.size();j++)
    {
        keyPostings.push_back(&mymap[myKeys[j]]);
    }
    for(int i=beginIndex;i<=endIndex;i++)
    {
        vector<pair<int,double> >tempVSM;
        for(vector<string>::size_type j=0;j<myKeys.size()&&j<maxTFandDF.size();j++)
        {
            const int maxTF=maxTFandDF[j].first;
            const int DF=maxTFandDF[j].second;
            if(maxTF<=0||DF<=0)
            {
                continue;
            }
            const Postings &postings=*keyPostings[j];
            Postings::const_iterator hit=postings.begin();
            while(hit!=postings.end()&&hit->first!=i)
            {
                ++hit;
            }
            if(hit==postings.end())
            {
                continue; // the article does not contain this word
            }
            const double tf=0.5+0.5*hit->second/maxTF;
            const double weight=tf*log((double)corpus_N/DF);
            if(weight!=0)
            {
                tempVSM.push_back(make_pair(static_cast<int>(j),weight));
            }
        }
        if(!tempVSM.empty())
        {
            tempVSM=NormalizationVSM(tempVSM);
            ofile1<<i<<endl;
            ofile2<<FormatVSMtoString(tempVSM)<<endl;
        }
    }
    ofile1.close();
    ofile2.close();
}
// Produces the complete ARFF file.
//
// The inverted index is loaded from disk when isbagOfWordsExist is true;
// otherwise it is built from the database (ConstructMap with dbfield and seg)
// and saved. Then the feature words are selected with DFthreshold, the ARFF
// header and data section are written, and the user is told where to save the
// Weka result.
void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg)
{
    map<string,vector<pair<int,int> > > mymap;
    if(isbagOfWordsExist)
    {
        load(mymap);
    }
    else
    {
        ConstructMap(mymap,dbfield,seg);
        save(mymap);
        cout<<"词袋子信息已经保存到硬盘"<<endl;
    }
    DFcharicteristicWordSelection(mymap,DFthreshold);
    WriteHeadArff();
    VSMFormation(mymap);
    cout<<"arff文件已经形成"<<endl;
    const string wekaResultPath(infoFromWekaAddress);
    cout<<"请您将使用weka聚类,并保存为"<<wekaResultPath<<endl;
}
/*****************以下函数辅助完成聚类功能*********************************************************************8**********************/
/************************************************************************/
/* 建立文档向量模型 */
/************************************************************************/
// Builds the dense document vectors used to compare articles with the cluster
// centres.
//
// For every article id in [beginIndex, endIndex] the vector has one component
// per feature word, in feature-word order. A word the article contains gets
//     (0.5 + 0.5 * tf / maxTF) * log(N / DF)
// (the same weight VSMFormation writes to the ARFF file); a word it does not
// contain, or one with maxTF or DF equal to 0, gets 0. Vectors are normalised
// to unit length. Articles whose vector is entirely zero are left out of the
// result, which matches VSMFormation not writing them to the ARFF file and
// avoids normalising a zero vector.
map<int,vector<double> > Preprocess::VSMConstruction(map<string,vector<pair<int,int> > > &mymap)
{
    typedef vector<pair<int,int> > Postings;
    const int corpus_N=endIndex-beginIndex+1;
    map<int,vector<double> > vsmMatrix;
    const vector<string> myKeys=GetFinalKeyWords();
    const vector<pair<int,int> > maxTFandDF=GetfinalKeysMaxTFDF(mymap);
    // Resolve every feature word once; references to map elements stay valid
    // when other keys are inserted.
    vector<const Postings*> keyPostings;
    for(vector<string>::size_type j=0;j<myKeys.size();j++)
    {
        keyPostings.push_back(&mymap[myKeys[j]]);
    }
    for(int i=beginIndex;i<=endIndex;i++)
    {
        vector<pair<int,double> >tempVSM;
        bool hasWeight=false;
        for(vector<string>::size_type j=0;j<myKeys.size()&&j<maxTFandDF.size();j++)
        {
            const int maxTF=maxTFandDF[j].first;
            const int DF=maxTFandDF[j].second;
            double weight=0.0;
            if(maxTF>0&&DF>0)
            {
                const Postings &postings=*keyPostings[j];
                Postings::const_iterator hit=postings.begin();
                while(hit!=postings.end()&&hit->first!=i)
                {
                    ++hit;
                }
                if(hit!=postings.end())
                {
                    weight=(0.5+0.5*hit->second/maxTF)*log((double)corpus_N/DF);
                }
            }
            if(weight!=0)
            {
                hasWeight=true;
            }
            // Every component is kept so that all vectors share the same layout.
            tempVSM.push_back(make_pair(static_cast<int>(j),weight));
        }
        if(hasWeight)
        {
            tempVSM=NormalizationVSM(tempVSM);
            vector<double> &row=vsmMatrix[i];
            for(vector<pair<int,double> >::iterator it=tempVSM.begin();it!=tempVSM.end();++it)
            {
                row.push_back(it->second);
            }
        }
    }
    return vsmMatrix;
}
/************************************************************************/
/* 获得Weka提供的聚类信息 */
/************************************************************************/
map<string,vector<double> > Preprocess::GetClusters()
{
map<string,vector<double> >clusters;
ifstream ifile(infoFromWekaAddress);
string temp;
while(getline(ifile,temp))
{ boost::smatch matchcluster;
boost::regex regcluster("Cluster\\s+\\d+",boost::regex::icase);
if(boost::regex_search(temp,matchcluster,regcluster))
{
string clustertmp=matchcluster[0].str();
string ordinates="";
getline(ifile,ordinates);
boost::regex regordinates("\\d+(\\.\\d{1,4})?");
boost::smatch matchordinates;
std::string::const_iterator it=ordinates.begin();
std::string::const_iterator end=ordinates.end();
while (boost::regex_search(it,end,matchordinates,regordinates))
{
string digitstemp=matchordinates[0].str();
double digitval=0.0;
std::stringstream ss;
ss<<digitstemp;
ss>>digitval;
clusters[clustertmp].push_back(digitval);
it=matchordinates[0].second;
}
}
}
return clusters;
}
/**计算向量内积*/
// Dot product of two vectors.
// When the vectors differ in length only the components both of them have are
// used, so the shorter vector is never read past its end.
double Preprocess::CalDotProductOfVectors(const vector<double>&vector1,const vector<double>&vector2)
{
    // Written out by hand: windows.h may define min/max as macros.
    const vector<double>::size_type common=vector1.size()<vector2.size()?vector1.size():vector2.size();
    double result = 0.0;
    for (vector<double>::size_type i = 0; i < common; ++i)
    {
        result += vector1[i] * vector2[i];
    }
    return result;
}
/**计算向量余弦相似度*/
// Cosine similarity of two vectors: dot(v1,v2) / sqrt(dot(v1,v1)*dot(v2,v2)).
// Returns 0 when the denominator is not a positive number (a zero vector, an
// empty vector or NaN input), instead of dividing by zero.
double Preprocess::CalCosineofVectors(const vector<double>&vector1,const vector<double>&vector2)
{
    const double denominator=sqrt(CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2));
    if(!(denominator>0)) // also true for NaN
    {
        return 0.0;
    }
    return CalDotProductOfVectors(vector1,vector2)/denominator;
}
/**为每篇文章打上个类别标签*/
// Labels every article with the cluster whose centre is most similar to it.
//
// vsmMatrix : article id -> document vector
// clusters  : cluster name -> centre
// returns   : (article id, cluster name) for every article, in vsmMatrix order;
//             empty when there are no clusters
//
// The best cluster is found with a single scan instead of a full sort. On
// equal similarity the cluster that comes first in `clusters` wins.
vector<pair<int,string> > Preprocess::GenerateClusterInfo(map<int,vector<double> >&vsmMatrix, map<string,vector<double> >&clusters)
{
    vector<pair<int,string> >resultInfo;
    if(clusters.empty())
    {
        return resultInfo; // nothing to assign to
    }
    for(map<int,vector<double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();++it)
    {
        map<string,vector<double> >::iterator best=clusters.begin();
        double bestSimilarity=CalCosineofVectors(it->second,best->second);
        map<string,vector<double> >::iterator clusterit=best;
        for(++clusterit;clusterit!=clusters.end();++clusterit)
        {
            const double similarity=CalCosineofVectors(it->second,clusterit->second);
            // bestSimilarity!=bestSimilarity is true only for NaN: a NaN best
            // is replaced so that it cannot block every later comparison.
            if(similarity>bestSimilarity||bestSimilarity!=bestSimilarity)
            {
                best=clusterit;
                bestSimilarity=similarity;
            }
        }
        resultInfo.push_back(make_pair(it->first,best->first));
    }
    return resultInfo;
}
/************************************************************************/
/* 获取每个类别所包含的文章ID */
/************************************************************************/
// Groups the labelled articles by cluster.
//
// clusters   : cluster name -> centre (only the names are used)
// resultInfo : (article id, cluster name) pairs
// returns    : cluster name -> ids of its articles, in resultInfo order;
//              pairs whose name is not a key of `clusters` are ignored
map<string,vector<int> > Preprocess::FetchArticlesOFClusters(map<string,vector<double> >&clusters,vector<pair<int,string> >&resultInfo)
{
    map<string,vector<int> > articlesInfo;
    for(vector<pair<int,string> >::size_type k=0;k<resultInfo.size();++k)
    {
        const string &label=resultInfo[k].second;
        if(clusters.find(label)==clusters.end())
        {
            continue;
        }
        articlesInfo[label].push_back(resultInfo[k].first);
    }
    return articlesInfo;
}
// Reports which articles ended up in which cluster.
//
// Loads the inverted index, rebuilds the document vectors, reads the cluster
// centres from the Weka result and assigns every article to a cluster. For
// each cluster the cluster name is printed, followed by the title and the
// stored category ("class" column) of each of its articles, fetched from the
// News table. Everything is written both to standard output and to
// F:\cluster\ArticlesInPerCluster.txt.
//
// The SQL text is held in a std::string: it is released automatically on
// every iteration and is not limited to a fixed buffer size.
void Preprocess::RetreiveArticleInfoFromDataBase()
{
    map<string,vector<pair<int,int> > > mymap;
    ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt");
    load(mymap);
    map<int,vector<double> > vsmMatrix=VSMConstruction(mymap);
    map<string,vector<double> > clusters=GetClusters();
    vector<pair<int,string> > resultInfo=GenerateClusterInfo(vsmMatrix,clusters);
    map<string,vector<int> > articlesInfo=FetchArticlesOFClusters(clusters,resultInfo);
    for(map<string,vector<int> >::iterator it=articlesInfo.begin();it!=articlesInfo.end();++it)
    {
        // Build "(id1,id2,...)" for the IN clause; the ids are integers, so no
        // quoting is needed.
        const vector<int> &ids=it->second;
        ostringstream idList;
        idList<<"(";
        for(vector<int>::size_type k=0;k<ids.size();++k)
        {
            if(k!=0)
            {
                idList<<",";
            }
            idList<<ids[k];
        }
        idList<<")";
        string selectsql("Select ArticleTitle,class from News Where ArticleId in ");
        selectsql+=" ";
        selectsql+=idList.str();

        CoInitialize(NULL);
        _ConnectionPtr pConn(__uuidof(Connection));
        _RecordsetPtr pRst(__uuidof(Recordset));
        pConn->ConnectionString=dbconnection;
        pConn->Open("","","",adConnectUnspecified);
        cout <<it->first<<endl;
        ofile<<it->first<<endl;
        pRst=pConn->Execute(selectsql.c_str(),NULL,adCmdText);
        while(!pRst->rsEOF)
        {
            string title=(_bstr_t)pRst->GetCollect("ArticleTitle");
            string categorization=(_bstr_t)pRst->GetCollect("class");
            cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
            ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
            pRst->MoveNext();
        }
        pRst->Close();
        pConn->Close();
        // Drop the COM references before COM itself is shut down.
        pRst.Release();
        pConn.Release();
        CoUninitialize();
    }
    ofile.close();
}
/********按空白把关键词分割开*****************/
// Splits s on blanks (' ') and returns the non-empty pieces in order.
// Runs of blanks count as one separator and leading/trailing blanks are
// ignored, so an empty or all-blank string gives an empty list rather than a
// list holding one empty word.
//
// stopwords is not used: the parameter exists so that the function has the
// signature of the FUNCSEG member-function pointer type.
vector<string>Preprocess:: mySplit(string s,set<string> stopwords)
{
    vector<string> wordCollection;
    string::size_type wordBegin=s.find_first_not_of(' ');
    while(wordBegin!=string::npos)
    {
        const string::size_type wordEnd=s.find(' ',wordBegin);
        if(wordEnd==string::npos)
        {
            wordCollection.push_back(s.substr(wordBegin)); // last word
            break;
        }
        wordCollection.push_back(s.substr(wordBegin,wordEnd-wordBegin));
        wordBegin=s.find_first_not_of(' ',wordEnd);
    }
    return wordCollection;
}