大数据挖掘算法篇之K-Means实例
一、引言
K-Means算法是聚类算法中,应用最为广泛的一种。本文基于欧几里得距离公式:d = sqrt((x1-x2)^2+(y1-y2)^2)计算二维向量间的距离,作为聚类划分的依据,输入数据为二维数据两列数据,输出结果为聚类中心和元素划分结果。输入数据格式如下:
0.0 0.0
1.0 0.0
0.0 1.0
2.0 1.0
1.0 2.0
2.0 2.0
2.0 0.0
0.0 2.0
7.0 6.0
7.0 7.0
7.0 8.0
8.0 6.0
8.0 7.0
8.0 8.0
8.0 9.0
9.0 7.0
9.0 8.0
9.0 9.0
二、欧几里得距离:
/****************************************************************************
* *
* KMEANS *
* *
*****************************************************************************/ #include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#include <math.h> // FUNCTION PROTOTYPES // DEFINES
#define SUCCESS 1
#define FAILURE 0
#define TRUE 1
#define FALSE 0
#define MAXVECTDIM 20
#define MAXPATTERN 20
#define MAXCLUSTER 10

/// Format a double as a fixed-point string with `width` fractional digits.
/// Reproduces the classic fcvt()-style look the program's log shows: a
/// leading ' ' (positive) or '-' (negative) sign column and NO leading zero
/// before the decimal point — f2a(0.0, 4) yields " .0000", f2a(1.0, 4)
/// yields " 1.0000".
/// Fixes vs. original: the original returned a pointer to a LOCAL buffer
/// (dangling) and relied on the deprecated, non-standard fcvt(); this
/// version uses snprintf into a static buffer. The result is therefore only
/// valid until the next call (same lifetime contract fcvt itself had).
char *f2a(double x, int width){
    static char cbuf[64];   // static: original returned a dangling local pointer
    char num[48];
    // Format the magnitude first, then strip a single leading "0" before
    // the '.' to keep the legacy fcvt-style output (" .0000", not " 0.0000").
    snprintf(num, sizeof num, "%.*f", width, fabs(x));
    const char *digits = (num[0] == '0' && num[1] == '.') ? num + 1 : num;
    snprintf(cbuf, sizeof cbuf, "%c%s", (x < 0.0) ? '-' : ' ', digits);
    return cbuf;
}
// ***** Defined structures & classes *****
// One cluster: its center vector plus the indices of its member patterns.
struct aCluster {
    double Center[MAXVECTDIM];   // cluster center coordinates
    int Member[MAXPATTERN];      // indices of patterns belonging to this cluster
    int NumMembers;              // number of valid entries in Member[]
};

// A single pattern vector (declared for completeness; Pattern[][] is what
// the implementation actually uses).
struct aVector {
    double Center[MAXVECTDIM];
    int Size;
};

// Driver class implementing the three steps of the K-means algorithm.
class System {
private:
    double Pattern[MAXPATTERN][MAXVECTDIM+1];  // restored "+1" (extraction dropped the literal)
    aCluster Cluster[MAXCLUSTER];
    int NumPatterns;               // number of patterns read from file
    int SizeVector;                // number of dimensions in each vector
    int NumClusters;               // K, the number of clusters
    void DistributeSamples();      // Step 2 of K-means algorithm
    int CalcNewClustCenters();     // Step 3 of K-means algorithm
    double EucNorm(int, int);      // squared Euclidean distance pattern -> center
    int FindClosestCluster(int);   // index of cluster closest to pattern whose index is arg
public:
    void system();                 // NOTE(review): never defined or called; likely a typo for a constructor
    int LoadPatterns(char *fname); // read pattern data to be clustered
    void InitClusters();           // Step 1 of K-means algorithm
    void RunKMeans();              // overall control of the K-means process
    void ShowClusters();           // show results on screen
    void SaveClusters(char *fname);// save results to file (unimplemented stub)
    void ShowCenters();            // print current cluster centers
};
// Print the current cluster centers (assumes 2-D data) and pause for input.
void System::ShowCenters(){
    int i;
    printf("Cluster centers:\n");
    for (i=0; i<NumClusters; i++) {
        Cluster[i].Member[0]=i;   // NOTE(review): kept from original; overwrites the first member slot
        printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]);
    } /* endfor */
    printf("\n");
    getchar();   // pause so the user can read the output
}
// Read the input file
// Read patterns from `fname`. Expected layout: number of patterns (e.g. 18),
// vector dimension (e.g. 2), number of clusters K (e.g. 2), then one vector
// per line. Returns SUCCESS, or FAILURE if the file cannot be opened.
int System::LoadPatterns(char *fname)
{
    FILE *InFilePtr;
    int i,j;
    double x;
    if((InFilePtr = fopen(fname, "r")) == NULL)
        return FAILURE;
    fscanf(InFilePtr, "%d", &NumPatterns); // Read # of patterns (18 in sample data)
    fscanf(InFilePtr, "%d", &SizeVector);  // Read dimension of vector (2)
    fscanf(InFilePtr, "%d", &NumClusters); // Read # of clusters for K-Means (2)
    for (i=0; i<NumPatterns; i++) {        // For each vector
        for (j=0; j<SizeVector; j++) {     // create a pattern
            fscanf(InFilePtr,"%lg",&x);    // consisting of all elements
            Pattern[i][j]=x;
        } /* endfor */
    } /* endfor */
    fclose(InFilePtr);   // fix: original leaked the FILE handle
    // Echo all loaded patterns (printf assumes 2-D data)
    printf("Input patterns:\n");
    for (i=0; i<NumPatterns; i++) {
        printf("Pattern[%d]=(%2.3f,%2.3f)\n",i,Pattern[i][0],Pattern[i][1]);
    } /* endfor */
    printf("\n--------------------\n");
    getchar();
    return SUCCESS;
}
//***************************************************************************
// InitClusters                                                             *
// Arbitrarily assign a vector to each of the K clusters                    *
// We choose the first K vectors to do this                                 *
//***************************************************************************
// Step 1: seed each of the K cluster centers with the first K patterns.
void System::InitClusters(){
    int i,j;
    printf("Initial cluster centers:\n");
    for (i=0; i<NumClusters; i++) {
        Cluster[i].Member[0]=i;   // seed the membership list with the pattern itself
        for (j=0; j<SizeVector; j++) {
            Cluster[i].Center[j]=Pattern[i][j];
        } /* endfor */
    } /* endfor */
    for (i=0; i<NumClusters; i++) {
        printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]);
    } /* endfor */
    printf("\n");
    getchar();   // pause so the user can read the output
}
// Run the K-means iteration: distribute samples (Step 2) and recompute
// centers (Step 3) until no center moves, i.e. CalcNewClustCenters()
// reports convergence.
void System::RunKMeans(){
    int converged;
    int pass;
    pass=0;
    converged=FALSE;
    // Iterate until convergence
    while (converged==FALSE) {
        printf("PASS=%d\n",pass++);
        DistributeSamples();               // Step 2: assign each pattern to nearest center
        converged=CalcNewClustCenters();   // Step 3: recompute centers; TRUE when stable
        ShowCenters();
        getchar();
    } /* endwhile */
}
// In 2-D the Euclidean distance between two points is
//   d = sqrt((x1-x2)^2 + (y1-y2)^2)
// generalized here over all SizeVector dimensions so every column
// contributes to the distance.
// NOTE: returns the SQUARED distance; callers apply sqrt() for display only
// (the ordering used for cluster assignment is unaffected).
// Also pretty-prints the computation, e.g. "d=sqrt( 1.0000+ 4.0000)".
double System::EucNorm(int p, int c){ // squared norm of (pattern p - center c)
    double dist,x;
    int i;
    char zout[256];   // buffer size restored (extraction dropped it)
    strcpy(zout,"d=sqrt(");
    printf("The distance from pattern %d to cluster %d is calculated as:\n",p,c);
    dist=0;
    for (i=0; i<SizeVector ;i++){
        // Build the printable term for this dimension
        x=(Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]);
        strcat(zout,f2a(x,4));
        if (i<SizeVector-1)   // fix: original only worked for 2-D ("i==0")
            strcat(zout,"+");
        dist += x;   // fix: reuse the square instead of recomputing it
    } /* endfor */
    printf("%s)\n",zout);
    return dist;
}
// Return the index of the cluster whose center is nearest to pattern `pat`.
// Compares squared distances (EucNorm), which preserves the ordering.
int System::FindClosestCluster(int pat){
    int i, ClustID;
    double MinDist, d;
    MinDist =9.9e+99;   // sentinel: larger than any real distance
    ClustID=-1;
    for (i=0; i<NumClusters; i++) {
        d=EucNorm(pat,i);   // squared distance
        printf("Distance from pattern %d to cluster %d is %f\n\n",pat,i,sqrt(d));
        if (d<MinDist) {
            MinDist=d;
            ClustID=i;
        } /* endif */
    } /* endfor */
    if (ClustID<0) {   // only possible if NumClusters == 0
        printf("Aaargh");
        exit(1);   // fix: original "exit()" was missing its status argument
    } /* endif */
    return ClustID;
}
// Step 2: assign every pattern to the cluster with the closest center.
void System::DistributeSamples(){
    int i,pat,Clustid,MemberIndex;
    // Clear membership list for all current clusters
    for (i=0; i<NumClusters;i++){
        Cluster[i].NumMembers=0;
    }
    for (pat=0; pat<NumPatterns; pat++) {
        // Find cluster center to which the pattern is closest
        Clustid= FindClosestCluster(pat);
        printf("patern %d assigned to cluster %d\n\n",pat,Clustid);
        // Record this pattern in the winning cluster's member list
        MemberIndex=Cluster[Clustid].NumMembers;
        Cluster[Clustid].Member[MemberIndex]=pat;
        Cluster[Clustid].NumMembers++;
    } /* endfor */
}
// Step 3: recompute each cluster center as the mean of its member patterns.
// Returns TRUE if no center moved (the algorithm has converged), FALSE
// otherwise. Also builds printable sums for the log, e.g.
//   "Cluster Center1(1/15)( 1.000+ 2.000+...)".
// `xs` holds the first coordinate's sum, `ys` the remaining coordinates'.
int System::CalcNewClustCenters(){
    int ConvFlag,VectID,i,j,k;
    double tmp[MAXVECTDIM];
    char xs[512];   // buffer sizes restored (extraction dropped them)
    char ys[512];
    char nc1[16];
    char nc2[16];
    ConvFlag=TRUE;
    printf("The new cluster centers are now calculated as:\n");
    for (i=0; i<NumClusters; i++) { // for each cluster
        // fix: itoa() is non-standard; snprintf is the portable equivalent
        snprintf(nc1,sizeof nc1,"%d",Cluster[i].NumMembers);
        snprintf(nc2,sizeof nc2,"%d",i);
        strcpy(xs,"Cluster Center");
        strcat(xs,nc2);
        strcat(xs,"(1/");
        strcpy(ys,"(1/");
        strcat(xs,nc1);
        strcat(ys,nc1);
        strcat(xs,")(");
        strcat(ys,")(");
        for (j=0; j<SizeVector; j++) { // clear workspace
            tmp[j]=0.0;
        } /* endfor */
        for (j=0; j<Cluster[i].NumMembers; j++) { // traverse member vectors
            VectID=Cluster[i].Member[j];
            for (k=0; k<SizeVector; k++) { // traverse elements of vector
                tmp[k] += Pattern[VectID][k]; // add (member) pattern element into temp
                if (k==0) {
                    strcat(xs,f2a(Pattern[VectID][k],3));
                } else {
                    strcat(ys,f2a(Pattern[VectID][k],3));
                } /* endif */
            } /* endfor */
            if(j<Cluster[i].NumMembers-1){
                strcat(xs,"+");
                strcat(ys,"+");
            }
            else {
                strcat(xs,")");
                strcat(ys,")");
            }
        } /* endfor */
        if (Cluster[i].NumMembers > 0) { // fix: guard divide-by-zero on an empty cluster
            for (k=0; k<SizeVector; k++) { // traverse elements of vector
                tmp[k]=tmp[k]/Cluster[i].NumMembers;
                if (tmp[k] != Cluster[i].Center[k])
                    ConvFlag=FALSE;      // a center moved: not converged yet
                Cluster[i].Center[k]=tmp[k];
            } /* endfor */
        }
        printf("%s,\n",xs);
        printf("%s\n",ys);
    } /* endfor */
    return ConvFlag;
}
// Print the final 2-D center of every cluster.
void System::ShowClusters(){
    int cl;
    for (cl=0; cl<NumClusters; cl++) {
        printf("\nCLUSTER %d ==>[%f,%f]\n", cl,Cluster[cl].Center[0],Cluster[cl].Center[1]);
    } /* endfor */
}

// Save results to `fname` — declared in the interface but never implemented.
void System::SaveClusters(char *fname){
}
四、主调程序
// Program entry point: load "KM2.DAT", run K-means, show the resulting
// clusters. The argv-based file selection is commented out in the original.
int main(int argc, char *argv[])
{
    System kmeans;
    /*
    if (argc<2) {
        printf("USAGE: KMEANS PATTERN_FILE\n");
        exit(0);
    }*/
    if (kmeans.LoadPatterns("KM2.DAT")==FAILURE ){
        // fix: original printed a garbled argv[] — report the file actually opened
        printf("UNABLE TO READ PATTERN_FILE:%s\n","KM2.DAT");
        exit(1);   // fix: original "exit()" was missing its status argument
    }
    kmeans.InitClusters();
    kmeans.RunKMeans();
    kmeans.ShowClusters();
    return 0;   // fix: "void main" is non-standard C++
}
五、输出结果
Input patterns:
Pattern[]=(0.000,0.000)
Pattern[]=(1.000,0.000)
Pattern[]=(0.000,1.000)
Pattern[]=(2.000,1.000)
Pattern[]=(1.000,2.000)
Pattern[]=(2.000,2.000)
Pattern[]=(2.000,0.000)
Pattern[]=(0.000,2.000)
Pattern[]=(7.000,6.000)
Pattern[]=(7.000,7.000)
Pattern[]=(7.000,8.000)
Pattern[]=(8.000,6.000)
Pattern[]=(8.000,7.000)
Pattern[]=(8.000,8.000)
Pattern[]=(8.000,9.000)
Pattern[]=(9.000,7.000)
Pattern[]=(9.000,8.000)
Pattern[]=(9.000,9.000) -------------------- Initial cluster centers:
ClusterCenter[]=(0.000000,0.000000)
ClusterCenter[]=(1.000000,0.000000) PASS=
The distance from pattern to cluster is calculated as:
d=sqrt( .+ .)
Distance from pattern to cluster is 0.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ .)
Distance from pattern to cluster is 1.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ .)
Distance from pattern to cluster is 1.000000 The distance from pattern to cluster is calculated as:
d=sqrt( .+ .)
Distance from pattern to cluster is 0.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( .+ 1.0000)
Distance from pattern to cluster is 1.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 1.0000)
Distance from pattern to cluster is 1.414214 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 4.0000+ 1.0000)
Distance from pattern to cluster is 2.236068 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 1.0000)
Distance from pattern to cluster is 1.414214 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 4.0000)
Distance from pattern to cluster is 2.236068 The distance from pattern to cluster is calculated as:
d=sqrt( .+ 4.0000)
Distance from pattern to cluster is 2.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 4.0000+ 4.0000)
Distance from pattern to cluster is 2.828427 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 4.0000)
Distance from pattern to cluster is 2.236068 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 4.0000+ .)
Distance from pattern to cluster is 2.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ .)
Distance from pattern to cluster is 1.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( .+ 4.0000)
Distance from pattern to cluster is 2.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 4.0000)
Distance from pattern to cluster is 2.236068 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 36.0000)
Distance from pattern to cluster is 9.219544 The distance from pattern to cluster is calculated as:
d=sqrt( 36.0000+ 36.0000)
Distance from pattern to cluster is 8.485281 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 49.0000)
Distance from pattern to cluster is 9.899495 The distance from pattern to cluster is calculated as:
d=sqrt( 36.0000+ 49.0000)
Distance from pattern to cluster is 9.219544 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 64.0000)
Distance from pattern to cluster is 10.630146 The distance from pattern to cluster is calculated as:
d=sqrt( 36.0000+ 64.0000)
Distance from pattern to cluster is 10.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 36.0000)
Distance from pattern to cluster is 10.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 36.0000)
Distance from pattern to cluster is 9.219544 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 49.0000)
Distance from pattern to cluster is 10.630146 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 49.0000)
Distance from pattern to cluster is 9.899495 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 64.0000)
Distance from pattern to cluster is 11.313708 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 64.0000)
Distance from pattern to cluster is 10.630146 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 81.0000)
Distance from pattern to cluster is 12.041595 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 81.0000)
Distance from pattern to cluster is 11.401754 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 81.0000+ 49.0000)
Distance from pattern to cluster is 11.401754 The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 49.0000)
Distance from pattern to cluster is 10.630146 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 81.0000+ 64.0000)
Distance from pattern to cluster is 12.041595 The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 64.0000)
Distance from pattern to cluster is 11.313708 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 81.0000+ 81.0000)
Distance from pattern to cluster is 12.727922 The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 81.0000)
Distance from pattern to cluster is 12.041595 patern assigned to cluster The new cluster centers are now calculated as:
Cluster Center0(/)( .+ .+ .),
(/)( .+ 1.000+ 2.000)
Cluster Center1(/)( 1.000+ 2.000+ 1.000+ 2.000+ 2.000+ 7.000+ 7.000+ 7.000+
.+ 8.000+ 8.000+ 8.000+ 9.000+ 9.000+ 9.000),
(/)( .+ 1.000+ 2.000+ 2.000+ .+ 6.000+ 7.000+ 8.000+ 6.000+ 7.000+ 8.00
+ 9.000+ 7.000+ 8.000+ 9.000)
Cluster centers:
ClusterCenter[]=(0.000000,1.000000)
ClusterCenter[]=(5.866667,5.333333)
大数据挖掘算法篇之K-Means实例的更多相关文章
- Python聚类算法之基本K均值实例详解
Python聚类算法之基本K均值实例详解 本文实例讲述了Python聚类算法之基本K均值运算技巧.分享给大家供大家参考,具体如下: 基本K均值 :选择 K 个初始质心,其中 K 是用户指定的参数,即所 ...
- 图说十大数据挖掘算法(一)K最近邻算法
如果你之前没有学习过K最近邻算法,那今天几张图,让你明白什么是K最近邻算法. 先来一张图,请分辨它是什么水果 很多同学不假思索,直接回答:“菠萝”!!! 仔细看看同学们,这是菠萝么?那再看下边这这张图 ...
- python实现十大核心算法(桶排没实例)
# author:sevenduke # 2019-06-11 # 一.交换排序 # 排序算法的温故:冒泡排序 def dubblesort(arr): for i in range(0, len(a ...
- 详解十大经典数据挖掘算法之——Apriori
本文始发于个人公众号:TechFlow,原创不易,求个关注 今天是机器学习专题的第19篇文章,我们来看经典的Apriori算法. Apriori算法号称是十大数据挖掘算法之一,在大数据时代威风无两,哪 ...
- 机器学习——十大数据挖掘之一的决策树CART算法
本文始发于个人公众号:TechFlow,原创不易,求个关注 今天是机器学习专题的第23篇文章,我们今天分享的内容是十大数据挖掘算法之一的CART算法. CART算法全称是Classification ...
- 【十大经典数据挖掘算法】k
[十大经典数据挖掘算法]系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 1. 引言 k-means与kNN虽 ...
- 【十大经典数据挖掘算法】PageRank
[十大经典数据挖掘算法]系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 我特地把PageRank作为[十大经 ...
- 【十大经典数据挖掘算法】CART
[十大经典数据挖掘算法]系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 1. 前言 分类与回归树(Class ...
- ICDM评选:数据挖掘十大经典算法
原文地址:http://blog.csdn.net/aladdina/article/details/4141177 国际权威的学术组织the IEEE International Conferenc ...
随机推荐
- [转载]struts1小项目
http://www.blogjava.net/nokiaguy/archive/2009/01/13/251101.html
- 常用docker
随便什么版本的linux 接入daocloud.io 在发现镜像中选择DockerHub 搜索对应的image,然后部署. 手动输入YAML即可 aria: image: cuteribs/aria2 ...
- select * from v$reserved_words
select * from v$reserved_words 查询库中所有关键字
- tp 邮件发送
1.需要phpmail邮件发送包, 2.邮件发送函数function sendMail($to, $title, $content){ require_once('./PHPMailer_v5.1/c ...
- Android下打印堆栈的两种方法
1. for(StackTraceElement i:Thread.currentThread().getStackTrace()){ System.out.println(i); } 2. Log. ...
- .Net EF Core数据库使用SQL server 2008 R2分页报错How to avoid the “Incorrect syntax near 'OFFSET'. Invalid usage of the option NEXT in the FETCH statement.”
一. 问题说明 最近.Net EF core 程序部署到服务器,服务器数据库安装的是SQL server 2008 R2,我本地用的的是SQL server 2014,在用到分页查询时报错如下: H ...
- vue 局部引入js插件
参考:https://blog.csdn.net/zhouzuoluo/article/details/84781490
- uDig配图与GeoServer添加Style
软件介绍: uDig是一个开源的桌面GIS软件,可以进行shp与栅格数据地图文件的编辑和查看,对OpenGIS标准,关于互联网GIS.网络地图服务器和网络功能服务器有特别的加强.通常和GeoServe ...
- TCP状态转化图 TIME_WAIT解析
先上转换图: 重点研究TIME_WAIT状态,根据UNIX网络编程中的思路,TIME_WAIT状态有两个存在的理由: 理由1. 客户端执行主动关闭,假设最终的ACK丢失,服务器将重新发送它的最后那个F ...
- react 部分ES6写法
react+react-router+antd 栗子:https://github.com/Aquarius1993/reactApp 模块: 1. 引入模块 import React from 'r ...