FROM:http://www.cnblogs.com/finallyliuyu/archive/2010/09/03/1817348.html

 头文件:

  #ifndef _Preprocess_H
#define _Preprocess_H
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include"ictclas30.h"
#include"boost\tr1\regex.hpp"
#include"boost/algorithm/string.hpp"
#include"windows.h" //一些谓词函数
using namespace std; class Preprocess
{
//typedef vector(Preprocess::*FUNCSEG)(string,set);
private:
char *bagofwordsAddress;//存放词袋子模型的位置
char * featurewordsAddress;//存放特征词文件的位置;
char *arffFileAddress;//存放ARFF文件的位置
char *infoFromWekaAddress;//存放调用weka后的实验结果
char *articleIdsAddress;//存放被聚类的文章的ID号
char *dbconnection;//数据库的链接字符串
char *dbselect;//数据库select语句
char *dbfield;//数据库字段
int beginIndex;//开始聚类的文章id
int endIndex;//结束聚类的文章id
public:
typedef vector(Preprocess::*FUNCSEG)(string,set);
Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex)
{
bagofwordsAddress=new char[c_style_stringsize];
featurewordsAddress=new char[c_style_stringsize];
arffFileAddress=new char[c_style_stringsize];
infoFromWekaAddress=new char[c_style_stringsize];
articleIdsAddress=new char[c_style_stringsize];
dbconnection=new char[c_style_stringsize];
dbselect=new char[c_style_stringsize];
this->beginIndex=beginIndex;
this->endIndex=endIndex;
sprintf_s(bagofwordsAddress,c_style_stringsize,mydict);
sprintf_s(featurewordsAddress,c_style_stringsize,keywordsinfo);
sprintf_s(arffFileAddress,c_style_stringsize,tobeCluster);
sprintf_s(infoFromWekaAddress,c_style_stringsize,InfoFromWeka);
sprintf_s(articleIdsAddress,c_style_stringsize,artileIds);
sprintf_s(dbconnection,c_style_stringsize,conn);
sprintf_s(dbselect,c_style_stringsize,selectsql); } ~Preprocess()
{
delete []bagofwordsAddress;
delete []featurewordsAddress;
delete []arffFileAddress;
delete [] infoFromWekaAddress;
delete []articleIdsAddress;
delete []dbconnection;
delete []dbselect; }
void trim(string &str,const string val);//去除字符串首尾空白
//构建倒排表: key=word,val= a list of pairs which consists of articleid,and count, count=tf
int ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg);
inline void TruncateArff()
{
ofstream ofile;
ofile.open(arffFileAddress,ios::trunc);
ofile.close();
}
//保存词袋子到硬盘
void save(mapint,int> > >&mymap);
//从内存中加载词袋子模型
void load(mapint,int> > >&mymap);
//打印词袋子模型
void print(mapint,int> > >&mymap);
//窄字符串转化成宽字符串
wstring myMultibyteToWideChar(string sResult);
//宽字符串转化成窄字符串
string myWideCharToMultibyte(wstring wsResult);
//调用ICTclass分词
string ICTsplit(const char *sInput);
//构造停用词表
setMakeStopSet();
//去除停用词,噪声词
vectorgoodWordsinPieceArticle(string rawtext,set stopwords);
//整数转化成字符串
string do_fraction(int val);
//浮点数转化成字符串
string do_fraction(double val, int decplaces=);
//特征词选择算法
void DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold);
//获取最后的特征词
vector GetFinalKeyWords();
//获取特征词的maxTF,DF
vectorint,int> >GetfinalKeysMaxTFDF(mapint,int>>> &mymap);
//文档向量模型规范化
vectorint,double> > NormalizationVSM(vectorint,double> > tempVSM);
//建立文档向量模型并且写到arff文件里
void VSMFormation(mapint,int>>> &mymap); string FormatVSMtoString(vectorint,double> > tempVSM);
//写Arff文件头部
void WriteHeadArff();
void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg); map<</code>int,vector<</code>double> >VSMConstruction(mapint,int>>> &mymap); map<</code>double> > GetClusters(); double CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); double CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); vectorint,string> >GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters); map<</code>int> >FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo);
void RetreiveArticleInfoFromDataBase();
vector mySplit(string s,set stopwords);//分割关键词 }; #endif Preprocess类的函数功能实现文件: #include"stdafx.h"
#include "Preprocess.h" #pragma comment(lib, "ICTCLAS30.lib")
using namespace std;
// Ordering predicate for stable_sort: higher count first (descending DF).
bool isLonger(const std::pair<std::string, int> &pair1, const std::pair<std::string, int> &pair2)
{
    return pair1.second > pair2.second;
}
// True for words whose count is at or below the noise threshold.
// TODO(review): the threshold literal was lost in extraction; 1 is a guess.
bool cntAssist(const std::pair<std::string, int> &pair1)
{
    return pair1.second <= 1;
}
// True when the (articleId, tf) entry belongs to the given article.
bool PredTF(const std::pair<int, int> &pair1, int articleId)
{
    return pair1.first == articleId;
}
// Functor form of PredTF, usable with count_if/find_if.
class PredTFclass
{
private:
    const int m; // article id to match
public:
    PredTFclass(int id) : m(id) {}
    bool operator()(const std::pair<int, int> &pair1) const { return PredTF(pair1, m); }
};
// Ordering predicate: higher cosine similarity first.
// BUG FIX: the original used >=, which is not a strict weak ordering and is
// undefined behavior with std::sort (can crash on MSVC debug runtimes).
bool myCmp(const std::pair<std::string, double> &pair1, const std::pair<std::string, double> &pair2)
{
    return pair1.second > pair2.second;
}
// Strip leading and trailing runs of val from str.
// NOTE: if str consists entirely of val, find_first_not_of returns npos and
// the first erase empties the string; the second erase is then a no-op.
void Preprocess::trim(string &str, const string val)
{
    str.erase(0, str.find_first_not_of(val));
    str.erase(str.find_last_not_of(val) + val.size());
}
int Preprocess::ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg)
{
//setMakeStopSet();
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString=dbconnection;
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute(dbselect,NULL,adCmdText);
setstopwords=MakeStopSet(); while(!pRst->rsEOF)
{ vectorwordcollection;
//string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
if(rawtext!="")
{
wordcollection=(this->*seg)(rawtext,stopwords);
string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
int articleid=atoi(tempid.c_str());
for(vector::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
{
vectorint,int>>::iterator it;
if(mymap[*strit].empty())
{
pair<</code>int,int>mytemppair=make_pair(articleid,);
mymap[*strit].push_back(mytemppair); }
else
{
for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
{
if(it->first==articleid)
{
it->second=++(it->second);
break;
} }
if(it==mymap[*strit].end())
{
pair<</code>int,int>mytemppair=make_pair(articleid,);
mymap[*strit].push_back(mytemppair);
} } } } pRst->MoveNext();
wordcollection.clear();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize(); return ; }
void Preprocess::save(mapint,int> > >&mymap)
{
ofstream outfile(bagofwordsAddress,ios::binary);
outfile<<mymap.size()<<endl;
mapint,int> > >::iterator it;
for (it=mymap.begin();it!=mymap.end();it++)
{ outfile<<it->first<<endl;
vectorint,int>>::iterator subit;
outfile<<it->second.size()<<endl;
for(subit=(it->second).begin();subit!=(it->second).end();++subit)
{
outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" ";
}
outfile<<endl;
}
//outfile.write((char *)&mymap,sizeof(mymap)); outfile.close(); }
void Preprocess::load(mapint,int> > >&mymap)
{
std::locale loc1 = std::locale::global(std::locale(".936"));
{
// 在这里使用std::ifstream 或者 std::fstream
ifstream infile(bagofwordsAddress,ios::binary);
int lenMyMap;//保存词典长度
int lenVector;//保存每个词出现的文章数目
string key;//保存读出的map的键值
int articleId;//文章标号
int count;//在该文章中刚出现的数目
string comma;
string semicolon;
infile>>lenMyMap;
while(!infile.eof())
{
infile>>key;
infile>>lenVector;
vectorint,int> >temp;
for (int i=;i
{
infile>>articleId>>count>>semicolon;
temp.push_back(make_pair(articleId,count));
}
mymap[key]=temp; } infile.close();
}
std::locale::global(std::locale(loc1)); }
void print(mapint,int> > >&mymap)
{
cout<<mymap.size()<<endl;
mapint,int> > >::iterator it;
for (it=mymap.begin();it!=mymap.end();it++)
{ cout<<it->first<<endl;
vectorint,int>>::iterator subit;
cout<<it->second.size()<<endl;
for(subit=(it->second).begin();subit!=(it->second).end();++subit)
{
cout<<subit->first<<','<<subit->second<<";";
}
cout<<endl;
} }
// Read "stopwords.txt" (one token per whitespace-separated field) into a set.
// BUG FIX: the original called trim(temp," ") *before* reading into temp (a
// no-op) and its !eof() loop inserted one empty/duplicate string at end of
// file; read first, trim, and skip empties instead.
set<string> Preprocess::MakeStopSet()
{
    set<string> stopwordsSet;
    ifstream ifile("stopwords.txt");
    string temp;
    while (ifile >> temp)
    {
        trim(temp, " ");
        if (!temp.empty())
        {
            stopwordsSet.insert(temp);
        }
    }
    return stopwordsSet;
}
// Integer -> decimal string.
// The original's str.swap(string(str.c_str())) trick (stripping characters
// after an embedded NUL) is meaningless for an int rendering and relied on a
// non-standard MSVC extension (binding a non-const ref to a temporary); the
// stream's string is returned directly.
string Preprocess::do_fraction(int val)
{
    ostringstream out;
    out << val;
    return out.str();
}
// Double -> decimal string, truncated so that at most decplaces-1 characters
// follow the decimal point (the original wrote '\0' at index n+decplaces and
// swapped through c_str() to cut the tail; erase() is the standard equivalent).
// TODO(review): the comparison literal in the original condition was lost in
// extraction; `n + decplaces < str.size()` is the natural reconstruction.
string Preprocess::do_fraction(double val, int decplaces)
{
    char DECIMAL_POINT = '.';
    ostringstream out;
    out << val;
    string str = out.str();
    size_t n = str.find(DECIMAL_POINT);
    if ((n != string::npos) && (n + decplaces < str.size()))
    {
        str.erase(n + decplaces);
    }
    return str;
}
// Convert an ANSI (CP_ACP) narrow string to a wide string via the Win32 API.
wstring Preprocess::myMultibyteToWideChar(string sResult)
{
    // First call with a null output buffer yields the required length
    // (excluding the terminator).
    int iWLen = MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), sResult.size(), NULL, 0);
    wchar_t *lpwsz = new wchar_t[iWLen + 1];
    MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen);
    lpwsz[iWLen] = L'\0';
    wstring wsResult(lpwsz);
    delete[] lpwsz;
    return wsResult;
}
// Convert a wide string back to an ANSI (CP_ACP) narrow string.
// BUG FIX: the original sized the buffer with CP_ACP but converted with
// CP_OEMCP; the two code pages can disagree on byte counts, risking a buffer
// overrun. Both calls now use CP_ACP, matching myMultibyteToWideChar.
string Preprocess::myWideCharToMultibyte(wstring wsResult)
{
    string sResult;
    // -1 length: converts including the terminator, so iLen counts it too.
    int iLen = WideCharToMultiByte(CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE);
    char *lpsz = new char[iLen];
    WideCharToMultiByte(CP_ACP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE);
    sResult.assign(lpsz, iLen - 1); // drop the trailing NUL
    delete[] lpsz;
    return sResult;
}
// Segment Chinese text with ICTCLAS and return the words '|'-separated.
// On initialization failure the input is returned unmodified.
string Preprocess::ICTsplit(const char *sInput)
{
    if (!ICTCLAS_Init())
    {
        printf("ICTCLAS INIT FAILED!\n");
        string strerr(sInput);
        return strerr;
    }
    ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
    // 0 = no POS tags in the output.
    // NOTE(review): this declaration was swallowed into a comment by the HTML
    // extraction; reconstructed -- confirm the flag value against ictclas30.h.
    const char *sResult = ICTCLAS_ParagraphProcess(sInput, 0);
    string strresult(sResult);
    // ICTCLAS separates words with whitespace; collapse each run into '|' so
    // downstream splitting is unambiguous. Done on the wide form so multi-byte
    // characters are not cut mid-sequence.
    wstring wsResult = myMultibyteToWideChar(strresult);
    boost::wregex wreg(L"\\s+");
    wsResult = boost::regex_replace(wsResult, wreg, wstring(L"|"));
    strresult = myWideCharToMultibyte(wsResult);
    ICTCLAS_Exit();
    return strresult;
}
vectorPreprocess::goodWordsinPieceArticle(string rawtext,set stopwords)
{
vector goodWordstemp;
vector goodWords;
const char* sInput=rawtext.c_str();
string sResult=ICTsplit(sInput);
wstring wsResult=myMultibyteToWideChar(sResult);
boost::wregex wreg(L"\\d+");//去掉中文空格
wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));
//boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
boost::split(goodWordstemp,wsResult,boost::is_any_of("|")); for(vector::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)
{
string temp=myWideCharToMultibyte(*it);
trim(temp," ");
if(!stopwords.count(temp)&&!temp.empty())
{
goodWords.push_back(temp);
} } return goodWords;
}
void Preprocess::DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold)
{
int finalKeyWordsCount=;//计算共取了多少个关键词
vectorint> >tempvector;
for(mapint,int>>>::iterator it=mymap.begin();it!=mymap.end();++it)
{
tempvector.push_back(make_pair(it->first,(it->second).size()));
} stable_sort(tempvector.begin(),tempvector.end(),isLonger);
ofstream outfile(featurewordsAddress);
for(vectorint> >::iterator it=tempvector.begin();it!=tempvector.end();it++)
{
if(it->second>=DFthreshold)
{
//outfile<<it->first<<" "<<it->second<<endl;
outfile<<it->first<<endl;
finalKeyWordsCount++; } }
outfile.close();
cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl; }
vectorPreprocess::GetFinalKeyWords()
{
vectormyKeys;
ifstream infile(featurewordsAddress);
while(!infile.eof())
{
string temp;
infile>>temp;
if(temp!="")
{
myKeys.push_back(temp);
} }
return myKeys;
}
vectorint,int> >Preprocess::GetfinalKeysMaxTFDF(mapint,int>>> &mymap)
{
vectorint,int> >maxTFandDF;
vectormyKeys=GetFinalKeyWords();
for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
{
int DF=mymap[*it].size();
int maxTF=;
for(vectorint,int> >::iterator subit=mymap[*it].begin();subit!=mymap[*it].end();subit++)
{
if(subit->second>maxTF)
{
maxTF=subit->second;
} }
maxTFandDF.push_back(make_pair(maxTF,DF));
//find_if(mymap[*it].begin(),mymap[*it].end(),
}
return maxTFandDF;
}
vectorint,double> >Preprocess::NormalizationVSM(vectorint,double> > tempVSM)
{ double sum=;
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
{
sum+=pow(vsmit->second,);
}
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
{
vsmit->second/=sqrt(sum);
}
return tempVSM; }
// Render a sparse vector in weka's sparse-ARFF syntax: "{idx val,idx val,...}".
// TODO(review): the decimal-places literal passed to do_fraction and the
// comma condition were lost in extraction; 8 places and "comma after every
// element but the last" are the natural reconstructions.
string Preprocess::FormatVSMtoString(vector<pair<int, double> > tempVSM)
{
    string ret = "{";
    int commaindication = 0;
    for (vector<pair<int, double> >::iterator vsmit = tempVSM.begin(); vsmit != tempVSM.end(); ++vsmit)
    {
        ret += do_fraction(vsmit->first) + " " + do_fraction(vsmit->second, 8);
        if (commaindication < (int)tempVSM.size() - 1)
        {
            ret += ",";
        }
        commaindication++;
    }
    ret += "}";
    return ret;
}
void Preprocess::WriteHeadArff()
{
ofstream ofile(arffFileAddress,ios::binary);
ofile<<"@relation aticle"<<endl;
ofile<<"\n";
vector myKeys=GetFinalKeyWords();
for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
{
//string temp="@attribute "+"'"+(*it)+"'"+" real";
string temp="";
temp+="@attribute ";
temp+="'";
temp+=*(it);
temp+="'";
temp+=" real"; ofile<<temp<<endl;
}
ofile<<"\n"<<endl;
ofile<<"@data"<<endl;
ofile.close();
}
void Preprocess::VSMFormation(mapint,int>>> &mymap)
{ int corpus_N=endIndex-beginIndex+;
ofstream ofile1(articleIdsAddress,ios::binary);//保存文章编号的文件
ofstream ofile2(arffFileAddress,ios::binary|ios::app); vector myKeys=GetFinalKeyWords();
vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
for(int i=beginIndex;i<=endIndex;i++)
{ vectorint,double> >tempVSM;
for(vector::size_type j=;j
{
//vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); TF=0.5+0.5*(double)TF/(maxTFandDF[j].first);
TF*=log((double)corpus_N/maxTFandDF[j].second);
if(TF!=)
{
tempVSM.push_back(make_pair(j,TF)); } }
if(!tempVSM.empty())
{
tempVSM=NormalizationVSM(tempVSM);
string vsmStr=FormatVSMtoString(tempVSM);
ofile1<<i<<endl;
ofile2<<vsmStr<<endl;
}
tempVSM.clear(); }
ofile1.close();
ofile2.close(); }
void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg)
{ mapint,int>>> mymap;
if(!isbagOfWordsExist)
{
ConstructMap(mymap,dbfield,seg);
save(mymap);
cout<<"词袋子信息已经保存到硬盘"<<endl;
}
else
{
load(mymap);
}
DFcharicteristicWordSelection(mymap,DFthreshold);
WriteHeadArff();
VSMFormation(mymap);
cout<<"arff文件已经形成"<<endl; string temp(infoFromWekaAddress); cout<<"请您将使用weka聚类,并保存为"<<temp<<endl;
}
// Build an in-memory dense document-vector matrix keyed by article id, for
// comparing articles against weka's cluster centroids.
// NOTE(review): the TF weighting here (0.5 + TF/maxTF) differs from
// VSMFormation's (0.5 + 0.5*TF/maxTF), and vectors are pushed even when the
// weight is zero -- preserved as in the original, but worth confirming.
map<int, vector<double> > Preprocess::VSMConstruction(map<string, vector<pair<int, int> > > &mymap)
{
    int corpus_N = endIndex - beginIndex + 1;
    map<int, vector<double> > vsmMatrix;
    vector<string> myKeys = GetFinalKeyWords();
    vector<pair<int, int> > maxTFandDF = GetfinalKeysMaxTFDF(mymap);
    for (int i = beginIndex; i <= endIndex; i++)
    {
        vector<pair<int, double> > tempVSM;
        for (vector<string>::size_type j = 0; j < myKeys.size(); j++)
        {
            double TF = (double)count_if(mymap[myKeys[j]].begin(), mymap[myKeys[j]].end(), PredTFclass(i));
            TF = 0.5 + (double)TF / (maxTFandDF[j].first);
            TF *= log((double)corpus_N / maxTFandDF[j].second);
            tempVSM.push_back(make_pair((int)j, TF));
        }
        if (!tempVSM.empty())
        {
            tempVSM = NormalizationVSM(tempVSM);
            for (vector<pair<int, double> >::iterator it = tempVSM.begin(); it != tempVSM.end(); it++)
            {
                vsmMatrix[i].push_back(it->second);
            }
        }
        tempVSM.clear();
    }
    return vsmMatrix;
}
// Parse weka's textual clustering output. Each "Cluster N" heading is
// followed (on the next line) by the centroid coordinates; every number on
// that line is collected into clusters["Cluster N"].
map<string, vector<double> > Preprocess::GetClusters()
{
    map<string, vector<double> > clusters;
    ifstream ifile(infoFromWekaAddress);
    string temp;
    while (getline(ifile, temp))
    {
        boost::smatch matchcluster;
        boost::regex regcluster("Cluster\\s+\\d+", boost::regex::icase);
        if (boost::regex_search(temp, matchcluster, regcluster))
        {
            string clustertmp = matchcluster[0].str();
            string ordinates = "";
            getline(ifile, ordinates);
            // Numbers with up to four decimal places, as weka prints them.
            boost::regex regordinates("\\d+(\\.\\d{1,4})?");
            boost::smatch matchordinates;
            std::string::const_iterator it = ordinates.begin();
            std::string::const_iterator end = ordinates.end();
            while (boost::regex_search(it, end, matchordinates, regordinates))
            {
                string digitstemp = matchordinates[0].str();
                double digitval = 0.0;
                std::stringstream ss;
                ss << digitstemp;
                ss >> digitval;
                clusters[clustertmp].push_back(digitval);
                // Resume the scan just past this match.
                it = matchordinates[0].second;
            }
        }
    }
    return clusters;
}
double Preprocess::CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
{
double result = 0.0f;
for (int i = ; i < vector1.size(); i++)
result += vector1[i] * vector2[i];
return result;
}
double Preprocess::CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
{
double numerator=CalDotProductOfVectors(vector1,vector2);
double denominator=CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2);
denominator=sqrt(denominator);
return numerator/denominator;
}
vectorint,string> > Preprocess::GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters)
{
vectorint,string> >resultInfo;
for(map<</code>int,vector<</code>double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();it++)
{
vectordouble> >clusterDistanceAist;
for(map<</code>double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();clusterit++)
{ double temp=CalCosineofVectors(it->second,clusterit->second);
clusterDistanceAist.push_back(make_pair(clusterit->first,temp)); }
sort(clusterDistanceAist.begin(),clusterDistanceAist.end(),myCmp);
vectordouble> >::iterator cDAit=clusterDistanceAist.begin(); resultInfo.push_back(make_pair(it->first,cDAit->first));
clusterDistanceAist.clear();
}
return resultInfo; }
// Invert the (articleId, clusterName) assignment into clusterName -> list of
// article ids. The inner loop over clusters just matches names; keyed lookup
// would suffice, but the original's shape is preserved.
map<string, vector<int> > Preprocess::FetchArticlesOFClusters(map<string, vector<double> > &clusters,
                                                              vector<pair<int, string> > &resultInfo)
{
    map<string, vector<int> > articlesInfo;
    for (vector<pair<int, string> >::iterator retit = resultInfo.begin(); retit != resultInfo.end(); retit++)
    {
        for (map<string, vector<double> >::iterator it = clusters.begin(); it != clusters.end(); it++)
        {
            if (retit->second == it->first)
            {
                articlesInfo[it->first].push_back(retit->first);
            }
        }
    }
    return articlesInfo;
}
void Preprocess::RetreiveArticleInfoFromDataBase()
{
mapint,int>>> mymap;
vectorint,string>>resultInfo;
map<</code>double> >clusters;
map<</code>int,vector<</code>double> >vsmMatrix;
map<</code>int>> articlesInfo;
ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt");
//boost::regex_replace(strresult)
//ConstructMap(mymap,1,500);
//save(mymap);
load(mymap);
vsmMatrix=VSMConstruction(mymap);
clusters=GetClusters();
resultInfo=GenerateClusterInfo(vsmMatrix,clusters);
articlesInfo=FetchArticlesOFClusters(clusters,resultInfo); for(map<</code>int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++)
{
ostringstream out;
string selectassist;
char *selectsql=new char[];
int count=;
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString=dbconnection;
pConn->Open("","","",adConnectUnspecified);
cout <<it->first<<endl;
ofile<<it->first<<endl;
out<<"(";
count=;
for(int i=;isecond.size();i++)
{
out<<(it->second)[i];
if(countsecond.size()-)
{
out<<",";
}
count++; }
out<<")";
selectassist=out.str();
sprintf_s(selectsql,,"%s %s","Select ArticleTitle,class from News Where ArticleId in ",selectassist.c_str()); pRst=pConn->Execute(selectsql,NULL,adCmdText);
while(!pRst->rsEOF)
{
//string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
string title=(_bstr_t)pRst->GetCollect("ArticleTitle");
//string rawtext=(_bstr_t)pRst->GetCollect("ArticleText");
string categorization=(_bstr_t)pRst->GetCollect("class");
cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl; pRst->MoveNext(); }
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize(); } ofile.close(); }
vectorPreprocess:: mySplit(string s,set stopwords)
{
vector wordCollection;
trim(s," "); int nPosBegin=;
int nPosEnd=s.find(' ',nPosBegin);
while(nPosEnd!=string::npos)
{
string temp=s.substr(nPosBegin,nPosEnd-nPosBegin);
trim(temp," ");
wordCollection.push_back(temp);
nPosBegin=s.find_first_not_of(' ',nPosEnd);
nPosEnd=s.find(' ',nPosBegin);
}
string temp=s.substr(nPosBegin,s.size()-nPosBegin);
trim(temp," ");
wordCollection.push_back(temp); return wordCollection; }

K-meams文本聚类算法C++实现的更多相关文章

  1. 10.HanLP实现k均值--文本聚类

    笔记转载于GitHub项目:https://github.com/NLP-LOVE/Introduction-NLP 10. 文本聚类 正所谓物以类聚,人以群分.人们在获取数据时需要整理,将相似的数据 ...

  2. 一步步教你轻松学K-means聚类算法

    一步步教你轻松学K-means聚类算法(白宁超  2018年9月13日09:10:33) 导读:k-均值算法(英文:k-means clustering),属于比较常用的算法之一,文本首先介绍聚类的理 ...

  3. 文本挖掘之文本聚类(MapReduce)

    刘 勇  Email:lyssym@sina.com 简介 针对大数量的文本数据,采用单线程处理时,一方面消耗较长处理时间,另一方面对大量数据的I/O操作也会消耗较长处理时间,同时对内存空间的消耗也是 ...

  4. [python] 使用Jieba工具中文分词及文本聚类概念

    声明:由于担心CSDN博客丢失,在博客园简单对其进行备份,以后两个地方都会写文章的~感谢CSDN和博客园提供的平台.        前面讲述了很多关于Python爬取本体Ontology.消息盒Inf ...

  5. 【十大算法实现之naive bayes】朴素贝叶斯算法之文本分类算法的理解与实现

    关于bayes的基础知识,请参考: 基于朴素贝叶斯分类器的文本聚类算法 (上) http://www.cnblogs.com/phinecos/archive/2008/10/21/1315948.h ...

  6. python聚类算法实战详细笔记 (python3.6+(win10、Linux))

    python聚类算法实战详细笔记 (python3.6+(win10.Linux)) 一.基本概念:     1.计算TF-DIF TF-IDF是一种统计方法,用以评估一字词对于一个文件集或一个语料库 ...

  7. 文本挖掘之文本聚类(DBSCAN)

    刘 勇   Email:lyssym@sina.com 简介 鉴于基于划分的文本聚类方法只能识别球形的聚类,因此本文对基于密度的文本聚类算法展开研究.DBSCAN(Density-Based Spat ...

  8. K-means算法及文本聚类实践

    K-Means是常用的聚类算法,与其他聚类算法相比,其时间复杂度低,聚类的效果也还不错,这里简单介绍一下k-means算法,下图是一个手写体数据集聚类的结果. 基本思想 k-means算法需要事先指定 ...

  9. 基于改进人工蜂群算法的K均值聚类算法(附MATLAB版源代码)

    其实一直以来也没有准备在园子里发这样的文章,相对来说,算法改进放在园子里还是会稍稍显得格格不入.但是最近邮箱收到的几封邮件让我觉得有必要通过我的博客把过去做过的东西分享出去更给更多需要的人.从论文刊登 ...

随机推荐

  1. Oracle导入SQL脚本执行 scott 用户下的表删除了

    执行 .sql 文件时,应在 sqlplus  或 cmd 中执行,速度比plsql 中的command window 中书许多, scott 用户下的表删除了 可以执行如下 @D:\app\Admi ...

  2. saltstack实战4--综合练习1

    规范配置管理 实际工作中可能会有现网环境,基线环境,开发环境. 需要使用saltstack统一管理.机器多了,业务多了,可能配置文件存放的会比较乱,因此可以统一管理起来 我们可以再加2个目录,test ...

  3. 4632 NOIP[2015] 运输计划

    4632 NOIP[2015] 运输计划  时间限制: 1 s  空间限制: 256000 KB  题目等级 : 大师 Master 题解       题目描述 Description 公元 2044 ...

  4. ActionBar 的简单使用

    About ActionBar The action bar is one of the most important design elements you can implement for yo ...

  5. python学习day4--python基础--字典

    字典的常用操作: #字典天然去重,key唯一,如果key相同,只能打印出一个 id_db={ 220456789852963741:{ 'name':"alex", 'age':3 ...

  6. JAVA之经典Student问题1

    通过“三目运算符”求最大值与最小值. class student { //定义学生编号 private String stu; //学生姓名 private String name; //学生书信成绩 ...

  7. JAVA 实现通过URL下载文件到本地库

    /** * TODO 下载文件到本地 * @author nadim * @date Sep 11, 2015 11:45:31 AM * @param fileUrl 远程地址 * @param f ...

  8. MSSQL 数字钱转化为大写

    --说明: --1.本函数范围从 毫 ~ 兆 --2.有四种精度(元,角 ,分,厘 ,毫) --3.有三种进位规则(四舍五入,接舍去,非0就入) --参数说明:dbo.MoneyToCapital( ...

  9. 使用JS调用WebService接口

    <script> $(document).ready(function () { var username = "admin"; var password = &quo ...

  10. ViewTreeObserver类概述

    ViewTreeObserver 版本:Android 3.0 r1 结构 继承关系 public final class ViewTreeObserver extends Object java.l ...