FROM:http://www.cnblogs.com/finallyliuyu/archive/2010/09/03/1817348.html

 头文件:

  #ifndef _Preprocess_H
#define _Preprocess_H
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include"ictclas30.h"
#include"boost\tr1\regex.hpp"
#include"boost/algorithm/string.hpp"
#include"windows.h" //一些谓词函数
using namespace std; class Preprocess
{
//typedef vector(Preprocess::*FUNCSEG)(string,set);
private:
char *bagofwordsAddress;//存放词袋子模型的位置
char * featurewordsAddress;//存放特征词文件的位置;
char *arffFileAddress;//存放ARFF文件的位置
char *infoFromWekaAddress;//存放调用weka后的实验结果
char *articleIdsAddress;//存放被聚类的文章的ID号
char *dbconnection;//数据库的链接字符串
char *dbselect;//数据库select语句
char *dbfield;//数据库字段
int beginIndex;//开始聚类的文章id
int endIndex;//结束聚类的文章id
public:
typedef vector(Preprocess::*FUNCSEG)(string,set);
Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex)
{
bagofwordsAddress=new char[c_style_stringsize];
featurewordsAddress=new char[c_style_stringsize];
arffFileAddress=new char[c_style_stringsize];
infoFromWekaAddress=new char[c_style_stringsize];
articleIdsAddress=new char[c_style_stringsize];
dbconnection=new char[c_style_stringsize];
dbselect=new char[c_style_stringsize];
this->beginIndex=beginIndex;
this->endIndex=endIndex;
sprintf_s(bagofwordsAddress,c_style_stringsize,mydict);
sprintf_s(featurewordsAddress,c_style_stringsize,keywordsinfo);
sprintf_s(arffFileAddress,c_style_stringsize,tobeCluster);
sprintf_s(infoFromWekaAddress,c_style_stringsize,InfoFromWeka);
sprintf_s(articleIdsAddress,c_style_stringsize,artileIds);
sprintf_s(dbconnection,c_style_stringsize,conn);
sprintf_s(dbselect,c_style_stringsize,selectsql); } ~Preprocess()
{
delete []bagofwordsAddress;
delete []featurewordsAddress;
delete []arffFileAddress;
delete [] infoFromWekaAddress;
delete []articleIdsAddress;
delete []dbconnection;
delete []dbselect; }
void trim(string &str,const string val);//去除字符串首尾空白
//构建倒排表: key=word,val= a list of pairs which consists of articleid,and count, count=tf
int ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg);
inline void TruncateArff()
{
ofstream ofile;
ofile.open(arffFileAddress,ios::trunc);
ofile.close();
}
//保存词袋子到硬盘
void save(mapint,int> > >&mymap);
//从内存中加载词袋子模型
void load(mapint,int> > >&mymap);
//打印词袋子模型
void print(mapint,int> > >&mymap);
//窄字符串转化成宽字符串
wstring myMultibyteToWideChar(string sResult);
//宽字符串转化成窄字符串
string myWideCharToMultibyte(wstring wsResult);
//调用ICTclass分词
string ICTsplit(const char *sInput);
//构造停用词表
setMakeStopSet();
//去除停用词,噪声词
vectorgoodWordsinPieceArticle(string rawtext,set stopwords);
//整数转化成字符串
string do_fraction(int val);
//浮点数转化成字符串
string do_fraction(double val, int decplaces=);
//特征词选择算法
void DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold);
//获取最后的特征词
vector GetFinalKeyWords();
//获取特征词的maxTF,DF
vectorint,int> >GetfinalKeysMaxTFDF(mapint,int>>> &mymap);
//文档向量模型规范化
vectorint,double> > NormalizationVSM(vectorint,double> > tempVSM);
//建立文档向量模型并且写到arff文件里
void VSMFormation(mapint,int>>> &mymap); string FormatVSMtoString(vectorint,double> > tempVSM);
//写Arff文件头部
void WriteHeadArff();
void WriteTotalArff(char * dbfield,int DFthreshlod,bool isbagOfwordsexsist,FUNCSEG seg); map<</code>int,vector<</code>double> >VSMConstruction(mapint,int>>> &mymap); map<</code>double> > GetClusters(); double CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); double CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2); vectorint,string> >GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters); map<</code>int> >FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo);
void RetreiveArticleInfoFromDataBase();
vector mySplit(string s,set stopwords);//分割关键词 }; #endif Preprocess类的函数功能实现文件: #include"stdafx.h"
#include "Preprocess.h" #pragma comment(lib, "ICTCLAS30.lib")
using namespace std;
// Ordering predicate for stable_sort: higher count first (descending DF).
bool isLonger(const std::pair<std::string, int> &pair1, const std::pair<std::string, int> &pair2)
{
    return pair1.second > pair2.second;
}
// True for words whose count is at or below the noise threshold.
// TODO(review): the threshold literal was lost in extraction; 1 is a guess.
bool cntAssist(const std::pair<std::string, int> &pair1)
{
    return pair1.second <= 1;
}
// True when the (articleId, tf) entry belongs to the given article.
bool PredTF(const std::pair<int, int> &pair1, int articleId)
{
    return pair1.first == articleId;
}
// Functor form of PredTF, usable with count_if/find_if.
class PredTFclass
{
private:
    const int m; // article id to match
public:
    PredTFclass(int id) : m(id) {}
    bool operator()(const std::pair<int, int> &pair1) const { return PredTF(pair1, m); }
};
// Ordering predicate: higher cosine similarity first.
// BUG FIX: the original used >=, which is not a strict weak ordering and is
// undefined behavior with std::sort (can crash on MSVC debug runtimes).
bool myCmp(const std::pair<std::string, double> &pair1, const std::pair<std::string, double> &pair2)
{
    return pair1.second > pair2.second;
}
// Strip leading and trailing runs of val from str.
// NOTE: if str consists entirely of val, find_first_not_of returns npos and
// the first erase empties the string; the second erase is then a no-op.
void Preprocess::trim(string &str, const string val)
{
    str.erase(0, str.find_first_not_of(val));
    str.erase(str.find_last_not_of(val) + val.size());
}
int Preprocess::ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg)
{
//setMakeStopSet();
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString=dbconnection;
pConn->Open("","","",adConnectUnspecified);
pRst=pConn->Execute(dbselect,NULL,adCmdText);
setstopwords=MakeStopSet(); while(!pRst->rsEOF)
{ vectorwordcollection;
//string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
if(rawtext!="")
{
wordcollection=(this->*seg)(rawtext,stopwords);
string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
int articleid=atoi(tempid.c_str());
for(vector::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
{
vectorint,int>>::iterator it;
if(mymap[*strit].empty())
{
pair<</code>int,int>mytemppair=make_pair(articleid,);
mymap[*strit].push_back(mytemppair); }
else
{
for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
{
if(it->first==articleid)
{
it->second=++(it->second);
break;
} }
if(it==mymap[*strit].end())
{
pair<</code>int,int>mytemppair=make_pair(articleid,);
mymap[*strit].push_back(mytemppair);
} } } } pRst->MoveNext();
wordcollection.clear();
}
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize(); return ; }
void Preprocess::save(mapint,int> > >&mymap)
{
ofstream outfile(bagofwordsAddress,ios::binary);
outfile<<mymap.size()<<endl;
mapint,int> > >::iterator it;
for (it=mymap.begin();it!=mymap.end();it++)
{ outfile<<it->first<<endl;
vectorint,int>>::iterator subit;
outfile<<it->second.size()<<endl;
for(subit=(it->second).begin();subit!=(it->second).end();++subit)
{
outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" ";
}
outfile<<endl;
}
//outfile.write((char *)&mymap,sizeof(mymap)); outfile.close(); }
void Preprocess::load(mapint,int> > >&mymap)
{
std::locale loc1 = std::locale::global(std::locale(".936"));
{
// 在这里使用std::ifstream 或者 std::fstream
ifstream infile(bagofwordsAddress,ios::binary);
int lenMyMap;//保存词典长度
int lenVector;//保存每个词出现的文章数目
string key;//保存读出的map的键值
int articleId;//文章标号
int count;//在该文章中刚出现的数目
string comma;
string semicolon;
infile>>lenMyMap;
while(!infile.eof())
{
infile>>key;
infile>>lenVector;
vectorint,int> >temp;
for (int i=;i
{
infile>>articleId>>count>>semicolon;
temp.push_back(make_pair(articleId,count));
}
mymap[key]=temp; } infile.close();
}
std::locale::global(std::locale(loc1)); }
void print(mapint,int> > >&mymap)
{
cout<<mymap.size()<<endl;
mapint,int> > >::iterator it;
for (it=mymap.begin();it!=mymap.end();it++)
{ cout<<it->first<<endl;
vectorint,int>>::iterator subit;
cout<<it->second.size()<<endl;
for(subit=(it->second).begin();subit!=(it->second).end();++subit)
{
cout<<subit->first<<','<<subit->second<<";";
}
cout<<endl;
} }
// Read "stopwords.txt" (one token per whitespace-separated field) into a set.
// BUG FIX: the original called trim(temp," ") *before* reading into temp (a
// no-op) and its !eof() loop inserted one empty/duplicate string at end of
// file; read first, trim, and skip empties instead.
set<string> Preprocess::MakeStopSet()
{
    set<string> stopwordsSet;
    ifstream ifile("stopwords.txt");
    string temp;
    while (ifile >> temp)
    {
        trim(temp, " ");
        if (!temp.empty())
        {
            stopwordsSet.insert(temp);
        }
    }
    return stopwordsSet;
}
// Integer -> decimal string.
// The original's str.swap(string(str.c_str())) trick (stripping characters
// after an embedded NUL) is meaningless for an int rendering and relied on a
// non-standard MSVC extension (binding a non-const ref to a temporary); the
// stream's string is returned directly.
string Preprocess::do_fraction(int val)
{
    ostringstream out;
    out << val;
    return out.str();
}
// Double -> decimal string, truncated so that at most decplaces-1 characters
// follow the decimal point (the original wrote '\0' at index n+decplaces and
// swapped through c_str() to cut the tail; erase() is the standard equivalent).
// TODO(review): the comparison literal in the original condition was lost in
// extraction; `n + decplaces < str.size()` is the natural reconstruction.
string Preprocess::do_fraction(double val, int decplaces)
{
    char DECIMAL_POINT = '.';
    ostringstream out;
    out << val;
    string str = out.str();
    size_t n = str.find(DECIMAL_POINT);
    if ((n != string::npos) && (n + decplaces < str.size()))
    {
        str.erase(n + decplaces);
    }
    return str;
}
// Convert an ANSI (CP_ACP) narrow string to a wide string via the Win32 API.
wstring Preprocess::myMultibyteToWideChar(string sResult)
{
    // First call with a null output buffer yields the required length
    // (excluding the terminator).
    int iWLen = MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), sResult.size(), NULL, 0);
    wchar_t *lpwsz = new wchar_t[iWLen + 1];
    MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen);
    lpwsz[iWLen] = L'\0';
    wstring wsResult(lpwsz);
    delete[] lpwsz;
    return wsResult;
}
// Convert a wide string back to an ANSI (CP_ACP) narrow string.
// BUG FIX: the original sized the buffer with CP_ACP but converted with
// CP_OEMCP; the two code pages can disagree on byte counts, risking a buffer
// overrun. Both calls now use CP_ACP, matching myMultibyteToWideChar.
string Preprocess::myWideCharToMultibyte(wstring wsResult)
{
    string sResult;
    // -1 length: converts including the terminator, so iLen counts it too.
    int iLen = WideCharToMultiByte(CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE);
    char *lpsz = new char[iLen];
    WideCharToMultiByte(CP_ACP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE);
    sResult.assign(lpsz, iLen - 1); // drop the trailing NUL
    delete[] lpsz;
    return sResult;
}
// Segment Chinese text with ICTCLAS and return the words '|'-separated.
// On initialization failure the input is returned unmodified.
string Preprocess::ICTsplit(const char *sInput)
{
    if (!ICTCLAS_Init())
    {
        printf("ICTCLAS INIT FAILED!\n");
        string strerr(sInput);
        return strerr;
    }
    ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
    // 0 = no POS tags in the output.
    // NOTE(review): this declaration was swallowed into a comment by the HTML
    // extraction; reconstructed -- confirm the flag value against ictclas30.h.
    const char *sResult = ICTCLAS_ParagraphProcess(sInput, 0);
    string strresult(sResult);
    // ICTCLAS separates words with whitespace; collapse each run into '|' so
    // downstream splitting is unambiguous. Done on the wide form so multi-byte
    // characters are not cut mid-sequence.
    wstring wsResult = myMultibyteToWideChar(strresult);
    boost::wregex wreg(L"\\s+");
    wsResult = boost::regex_replace(wsResult, wreg, wstring(L"|"));
    strresult = myWideCharToMultibyte(wsResult);
    ICTCLAS_Exit();
    return strresult;
}
vectorPreprocess::goodWordsinPieceArticle(string rawtext,set stopwords)
{
vector goodWordstemp;
vector goodWords;
const char* sInput=rawtext.c_str();
string sResult=ICTsplit(sInput);
wstring wsResult=myMultibyteToWideChar(sResult);
boost::wregex wreg(L"\\d+");//去掉中文空格
wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));
//boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
boost::split(goodWordstemp,wsResult,boost::is_any_of("|")); for(vector::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)
{
string temp=myWideCharToMultibyte(*it);
trim(temp," ");
if(!stopwords.count(temp)&&!temp.empty())
{
goodWords.push_back(temp);
} } return goodWords;
}
void Preprocess::DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold)
{
int finalKeyWordsCount=;//计算共取了多少个关键词
vectorint> >tempvector;
for(mapint,int>>>::iterator it=mymap.begin();it!=mymap.end();++it)
{
tempvector.push_back(make_pair(it->first,(it->second).size()));
} stable_sort(tempvector.begin(),tempvector.end(),isLonger);
ofstream outfile(featurewordsAddress);
for(vectorint> >::iterator it=tempvector.begin();it!=tempvector.end();it++)
{
if(it->second>=DFthreshold)
{
//outfile<<it->first<<" "<<it->second<<endl;
outfile<<it->first<<endl;
finalKeyWordsCount++; } }
outfile.close();
cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl; }
vectorPreprocess::GetFinalKeyWords()
{
vectormyKeys;
ifstream infile(featurewordsAddress);
while(!infile.eof())
{
string temp;
infile>>temp;
if(temp!="")
{
myKeys.push_back(temp);
} }
return myKeys;
}
vectorint,int> >Preprocess::GetfinalKeysMaxTFDF(mapint,int>>> &mymap)
{
vectorint,int> >maxTFandDF;
vectormyKeys=GetFinalKeyWords();
for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
{
int DF=mymap[*it].size();
int maxTF=;
for(vectorint,int> >::iterator subit=mymap[*it].begin();subit!=mymap[*it].end();subit++)
{
if(subit->second>maxTF)
{
maxTF=subit->second;
} }
maxTFandDF.push_back(make_pair(maxTF,DF));
//find_if(mymap[*it].begin(),mymap[*it].end(),
}
return maxTFandDF;
}
vectorint,double> >Preprocess::NormalizationVSM(vectorint,double> > tempVSM)
{ double sum=;
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
{
sum+=pow(vsmit->second,);
}
for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
{
vsmit->second/=sqrt(sum);
}
return tempVSM; }
// Render a sparse vector in weka's sparse-ARFF syntax: "{idx val,idx val,...}".
// TODO(review): the decimal-places literal passed to do_fraction and the
// comma condition were lost in extraction; 8 places and "comma after every
// element but the last" are the natural reconstructions.
string Preprocess::FormatVSMtoString(vector<pair<int, double> > tempVSM)
{
    string ret = "{";
    int commaindication = 0;
    for (vector<pair<int, double> >::iterator vsmit = tempVSM.begin(); vsmit != tempVSM.end(); ++vsmit)
    {
        ret += do_fraction(vsmit->first) + " " + do_fraction(vsmit->second, 8);
        if (commaindication < (int)tempVSM.size() - 1)
        {
            ret += ",";
        }
        commaindication++;
    }
    ret += "}";
    return ret;
}
void Preprocess::WriteHeadArff()
{
ofstream ofile(arffFileAddress,ios::binary);
ofile<<"@relation aticle"<<endl;
ofile<<"\n";
vector myKeys=GetFinalKeyWords();
for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
{
//string temp="@attribute "+"'"+(*it)+"'"+" real";
string temp="";
temp+="@attribute ";
temp+="'";
temp+=*(it);
temp+="'";
temp+=" real"; ofile<<temp<<endl;
}
ofile<<"\n"<<endl;
ofile<<"@data"<<endl;
ofile.close();
}
void Preprocess::VSMFormation(mapint,int>>> &mymap)
{ int corpus_N=endIndex-beginIndex+;
ofstream ofile1(articleIdsAddress,ios::binary);//保存文章编号的文件
ofstream ofile2(arffFileAddress,ios::binary|ios::app); vector myKeys=GetFinalKeyWords();
vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
for(int i=beginIndex;i<=endIndex;i++)
{ vectorint,double> >tempVSM;
for(vector::size_type j=;j
{
//vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i)); TF=0.5+0.5*(double)TF/(maxTFandDF[j].first);
TF*=log((double)corpus_N/maxTFandDF[j].second);
if(TF!=)
{
tempVSM.push_back(make_pair(j,TF)); } }
if(!tempVSM.empty())
{
tempVSM=NormalizationVSM(tempVSM);
string vsmStr=FormatVSMtoString(tempVSM);
ofile1<<i<<endl;
ofile2<<vsmStr<<endl;
}
tempVSM.clear(); }
ofile1.close();
ofile2.close(); }
void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg)
{ mapint,int>>> mymap;
if(!isbagOfWordsExist)
{
ConstructMap(mymap,dbfield,seg);
save(mymap);
cout<<"词袋子信息已经保存到硬盘"<<endl;
}
else
{
load(mymap);
}
DFcharicteristicWordSelection(mymap,DFthreshold);
WriteHeadArff();
VSMFormation(mymap);
cout<<"arff文件已经形成"<<endl; string temp(infoFromWekaAddress); cout<<"请您将使用weka聚类,并保存为"<<temp<<endl;
}
// Build an in-memory dense document-vector matrix keyed by article id, for
// comparing articles against weka's cluster centroids.
// NOTE(review): the TF weighting here (0.5 + TF/maxTF) differs from
// VSMFormation's (0.5 + 0.5*TF/maxTF), and vectors are pushed even when the
// weight is zero -- preserved as in the original, but worth confirming.
map<int, vector<double> > Preprocess::VSMConstruction(map<string, vector<pair<int, int> > > &mymap)
{
    int corpus_N = endIndex - beginIndex + 1;
    map<int, vector<double> > vsmMatrix;
    vector<string> myKeys = GetFinalKeyWords();
    vector<pair<int, int> > maxTFandDF = GetfinalKeysMaxTFDF(mymap);
    for (int i = beginIndex; i <= endIndex; i++)
    {
        vector<pair<int, double> > tempVSM;
        for (vector<string>::size_type j = 0; j < myKeys.size(); j++)
        {
            double TF = (double)count_if(mymap[myKeys[j]].begin(), mymap[myKeys[j]].end(), PredTFclass(i));
            TF = 0.5 + (double)TF / (maxTFandDF[j].first);
            TF *= log((double)corpus_N / maxTFandDF[j].second);
            tempVSM.push_back(make_pair((int)j, TF));
        }
        if (!tempVSM.empty())
        {
            tempVSM = NormalizationVSM(tempVSM);
            for (vector<pair<int, double> >::iterator it = tempVSM.begin(); it != tempVSM.end(); it++)
            {
                vsmMatrix[i].push_back(it->second);
            }
        }
        tempVSM.clear();
    }
    return vsmMatrix;
}
// Parse weka's textual clustering output. Each "Cluster N" heading is
// followed (on the next line) by the centroid coordinates; every number on
// that line is collected into clusters["Cluster N"].
map<string, vector<double> > Preprocess::GetClusters()
{
    map<string, vector<double> > clusters;
    ifstream ifile(infoFromWekaAddress);
    string temp;
    while (getline(ifile, temp))
    {
        boost::smatch matchcluster;
        boost::regex regcluster("Cluster\\s+\\d+", boost::regex::icase);
        if (boost::regex_search(temp, matchcluster, regcluster))
        {
            string clustertmp = matchcluster[0].str();
            string ordinates = "";
            getline(ifile, ordinates);
            // Numbers with up to four decimal places, as weka prints them.
            boost::regex regordinates("\\d+(\\.\\d{1,4})?");
            boost::smatch matchordinates;
            std::string::const_iterator it = ordinates.begin();
            std::string::const_iterator end = ordinates.end();
            while (boost::regex_search(it, end, matchordinates, regordinates))
            {
                string digitstemp = matchordinates[0].str();
                double digitval = 0.0;
                std::stringstream ss;
                ss << digitstemp;
                ss >> digitval;
                clusters[clustertmp].push_back(digitval);
                // Resume the scan just past this match.
                it = matchordinates[0].second;
            }
        }
    }
    return clusters;
}
double Preprocess::CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
{
double result = 0.0f;
for (int i = ; i < vector1.size(); i++)
result += vector1[i] * vector2[i];
return result;
}
double Preprocess::CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
{
double numerator=CalDotProductOfVectors(vector1,vector2);
double denominator=CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2);
denominator=sqrt(denominator);
return numerator/denominator;
}
vectorint,string> > Preprocess::GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters)
{
vectorint,string> >resultInfo;
for(map<</code>int,vector<</code>double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();it++)
{
vectordouble> >clusterDistanceAist;
for(map<</code>double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();clusterit++)
{ double temp=CalCosineofVectors(it->second,clusterit->second);
clusterDistanceAist.push_back(make_pair(clusterit->first,temp)); }
sort(clusterDistanceAist.begin(),clusterDistanceAist.end(),myCmp);
vectordouble> >::iterator cDAit=clusterDistanceAist.begin(); resultInfo.push_back(make_pair(it->first,cDAit->first));
clusterDistanceAist.clear();
}
return resultInfo; }
// Invert the (articleId, clusterName) assignment into clusterName -> list of
// article ids. The inner loop over clusters just matches names; keyed lookup
// would suffice, but the original's shape is preserved.
map<string, vector<int> > Preprocess::FetchArticlesOFClusters(map<string, vector<double> > &clusters,
                                                              vector<pair<int, string> > &resultInfo)
{
    map<string, vector<int> > articlesInfo;
    for (vector<pair<int, string> >::iterator retit = resultInfo.begin(); retit != resultInfo.end(); retit++)
    {
        for (map<string, vector<double> >::iterator it = clusters.begin(); it != clusters.end(); it++)
        {
            if (retit->second == it->first)
            {
                articlesInfo[it->first].push_back(retit->first);
            }
        }
    }
    return articlesInfo;
}
void Preprocess::RetreiveArticleInfoFromDataBase()
{
mapint,int>>> mymap;
vectorint,string>>resultInfo;
map<</code>double> >clusters;
map<</code>int,vector<</code>double> >vsmMatrix;
map<</code>int>> articlesInfo;
ofstream ofile("F:\\cluster\\ArticlesInPerCluster.txt");
//boost::regex_replace(strresult)
//ConstructMap(mymap,1,500);
//save(mymap);
load(mymap);
vsmMatrix=VSMConstruction(mymap);
clusters=GetClusters();
resultInfo=GenerateClusterInfo(vsmMatrix,clusters);
articlesInfo=FetchArticlesOFClusters(clusters,resultInfo); for(map<</code>int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++)
{
ostringstream out;
string selectassist;
char *selectsql=new char[];
int count=;
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
_RecordsetPtr pRst(__uuidof(Recordset));
pConn->ConnectionString=dbconnection;
pConn->Open("","","",adConnectUnspecified);
cout <<it->first<<endl;
ofile<<it->first<<endl;
out<<"(";
count=;
for(int i=;isecond.size();i++)
{
out<<(it->second)[i];
if(countsecond.size()-)
{
out<<",";
}
count++; }
out<<")";
selectassist=out.str();
sprintf_s(selectsql,,"%s %s","Select ArticleTitle,class from News Where ArticleId in ",selectassist.c_str()); pRst=pConn->Execute(selectsql,NULL,adCmdText);
while(!pRst->rsEOF)
{
//string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
string title=(_bstr_t)pRst->GetCollect("ArticleTitle");
//string rawtext=(_bstr_t)pRst->GetCollect("ArticleText");
string categorization=(_bstr_t)pRst->GetCollect("class");
cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl; pRst->MoveNext(); }
pRst->Close();
pConn->Close();
pRst.Release();
pConn.Release();
CoUninitialize(); } ofile.close(); }
vectorPreprocess:: mySplit(string s,set stopwords)
{
vector wordCollection;
trim(s," "); int nPosBegin=;
int nPosEnd=s.find(' ',nPosBegin);
while(nPosEnd!=string::npos)
{
string temp=s.substr(nPosBegin,nPosEnd-nPosBegin);
trim(temp," ");
wordCollection.push_back(temp);
nPosBegin=s.find_first_not_of(' ',nPosEnd);
nPosEnd=s.find(' ',nPosBegin);
}
string temp=s.substr(nPosBegin,s.size()-nPosBegin);
trim(temp," ");
wordCollection.push_back(temp); return wordCollection; }

K-meams文本聚类算法C++实现的更多相关文章

  1. 10.HanLP实现k均值--文本聚类

    笔记转载于GitHub项目:https://github.com/NLP-LOVE/Introduction-NLP 10. 文本聚类 正所谓物以类聚,人以群分.人们在获取数据时需要整理,将相似的数据 ...

  2. 一步步教你轻松学K-means聚类算法

    一步步教你轻松学K-means聚类算法(白宁超  2018年9月13日09:10:33) 导读:k-均值算法(英文:k-means clustering),属于比较常用的算法之一,文本首先介绍聚类的理 ...

  3. 文本挖掘之文本聚类(MapReduce)

    刘 勇  Email:lyssym@sina.com 简介 针对大数量的文本数据,采用单线程处理时,一方面消耗较长处理时间,另一方面对大量数据的I/O操作也会消耗较长处理时间,同时对内存空间的消耗也是 ...

  4. [python] 使用Jieba工具中文分词及文本聚类概念

    声明:由于担心CSDN博客丢失,在博客园简单对其进行备份,以后两个地方都会写文章的~感谢CSDN和博客园提供的平台.        前面讲述了很多关于Python爬取本体Ontology.消息盒Inf ...

  5. 【十大算法实现之naive bayes】朴素贝叶斯算法之文本分类算法的理解与实现

    关于bayes的基础知识,请参考: 基于朴素贝叶斯分类器的文本聚类算法 (上) http://www.cnblogs.com/phinecos/archive/2008/10/21/1315948.h ...

  6. python聚类算法实战详细笔记 (python3.6+(win10、Linux))

    python聚类算法实战详细笔记 (python3.6+(win10.Linux)) 一.基本概念:     1.计算TF-DIF TF-IDF是一种统计方法,用以评估一字词对于一个文件集或一个语料库 ...

  7. 文本挖掘之文本聚类(DBSCAN)

    刘 勇   Email:lyssym@sina.com 简介 鉴于基于划分的文本聚类方法只能识别球形的聚类,因此本文对基于密度的文本聚类算法展开研究.DBSCAN(Density-Based Spat ...

  8. K-means算法及文本聚类实践

    K-Means是常用的聚类算法,与其他聚类算法相比,其时间复杂度低,聚类的效果也还不错,这里简单介绍一下k-means算法,下图是一个手写体数据集聚类的结果. 基本思想 k-means算法需要事先指定 ...

  9. 基于改进人工蜂群算法的K均值聚类算法(附MATLAB版源代码)

    其实一直以来也没有准备在园子里发这样的文章,相对来说,算法改进放在园子里还是会稍稍显得格格不入.但是最近邮箱收到的几封邮件让我觉得有必要通过我的博客把过去做过的东西分享出去更给更多需要的人.从论文刊登 ...

随机推荐

  1. Oracle导入SQL脚本执行 scott 用户下的表删除了

    执行 .sql 文件时,应在 sqlplus  或 cmd 中执行,速度比plsql 中的command window 中书许多, scott 用户下的表删除了 可以执行如下 @D:\app\Admi ...

  2. saltstack实战4--综合练习1

    规范配置管理 实际工作中可能会有现网环境,基线环境,开发环境. 需要使用saltstack统一管理.机器多了,业务多了,可能配置文件存放的会比较乱,因此可以统一管理起来 我们可以再加2个目录,test ...

  3. 4632 NOIP[2015] 运输计划

    4632 NOIP[2015] 运输计划  时间限制: 1 s  空间限制: 256000 KB  题目等级 : 大师 Master 题解       题目描述 Description 公元 2044 ...

  4. ActionBar 的简单使用

    About ActionBar The action bar is one of the most important design elements you can implement for yo ...

  5. python学习day4--python基础--字典

    字典的常用操作: #字典天然去重,key唯一,如果key相同,只能打印出一个 id_db={ 220456789852963741:{ 'name':"alex", 'age':3 ...

  6. JAVA之经典Student问题1

    通过“三目运算符”求最大值与最小值. class student { //定义学生编号 private String stu; //学生姓名 private String name; //学生书信成绩 ...

  7. JAVA 实现通过URL下载文件到本地库

    /** * TODO 下载文件到本地 * @author nadim * @date Sep 11, 2015 11:45:31 AM * @param fileUrl 远程地址 * @param f ...

  8. MSSQL 数字钱转化为大写

    --说明: --1.本函数范围从 毫 ~ 兆 --2.有四种精度(元,角 ,分,厘 ,毫) --3.有三种进位规则(四舍五入,接舍去,非0就入) --参数说明:dbo.MoneyToCapital( ...

  9. 使用JS调用WebService接口

    <script> $(document).ready(function () { var username = "admin"; var password = &quo ...

  10. ViewTreeObserver类概述

    ViewTreeObserver 版本:Android 3.0 r1 结构 继承关系 public final class ViewTreeObserver extends Object java.l ...