#coding=utf-8
import codecs
import numpy
from numpy import *
import pylab def loadDataSet(fileName):
dataMat = []
fr = codecs.open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
fltLine = map(float, curLine)
dataMat.append(fltLine)
return dataMat def distMeasure(vecA, vecB):
#print vecA
dist = sqrt(sum(power(vecA - vecB, 2)))
return dist def kMeansInitCentroids(X, K):
"""
KMEANSINITCENTROIDS This function initializes K centroids that are to be
used in K-Means on the dataset X
centroids = KMEANSINITCENTROIDS(X, K) returns K initial centroids to be
used with the K-Means on the dataset X.
"""
n = shape(X)[1]
centroids = mat(zeros((K,n)))
for j in range(n):
#print X[:,j]
minJ = min(X[:,j])
rangeJ = float(max(array(X)[:,j]) - minJ)
centroids[:,j] = minJ + rangeJ * random.rand(K,1)
return centroids def findClosestCentroids(X, centroids):
"""
FINDCLOSESTCENTROIDS computes the centroid memberships for every example
idx = FINDCLOSESTCENTROIDS (X, centroids) returns the closest centroids
in idx for a dataset X where each row is a single example. idx = m x 1
vector of centroid assignments (i.e. each entry in range [1..K])
"""
# 数据总量
m = shape(X)[0]
K = shape(centroids)[0]
clusterAssment = mat(zeros((m,2)))#create mat to assign data points
#to a centroid, also holds SE of each point
#centroids = createCent(dataSet, k)
clusterChanged = True
while clusterChanged:
clusterChanged = False
for i in range(m):#for each data point assign it to the closest centroid
minDist = inf; minIndex = -1
# k个中间数据(质心)都与数据i进行欧氏比较,选择距离最近的第minIndex类
for j in range(K):
distJI = distMeasure(centroids[j,:],X[i,:])
if distJI < minDist:
minDist = distJI; minIndex = j
if clusterAssment[i,0] != minIndex: clusterChanged = True
clusterAssment[i,:] = minIndex,minDist**2
return clusterAssment def computeCentroids(X, clusterAssment, K):
"""
COMPUTECENTROIDS returs the new centroids by computing the means of the
data points assigned to each centroid.
centroids = COMPUTECENTROIDS(X, idx, K) returns the new centroids by
computing the means of the data points assigned to each centroid. It is
given a dataset X where each row is a single data point, a vector
idx of centroid assignments (i.e. each entry in range [1..K]) for each
example, and K, the number of centroids. You should return a matrix
centroids, where each row of centroids is the mean of the data points
assigned to it.
"""
n = shape(X)[1]
centroids = mat(zeros((K,n)))
for centroid in range(K):#recalculate centroids
# nonzero会产生两个array,第一个非零的为序号列表
ptsInClust = X[nonzero(clusterAssment[:,0].A==centroid)[0]]#get all the point in this cluster
#print 'ererer:',ptsInClust,'dfdf'
centroids[centroid,:] = mean(ptsInClust, axis=0) #assign centroid to mean
return centroids def show(dataSet, k, centroids, clusterAssment):
from matplotlib import pyplot as plt
numSamples, dim = dataSet.shape
mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
print type(dataSet)
for i in xrange(numSamples):
markIndex = int(clusterAssment[i, 0])
plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])
mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
for i in range(k):
plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize = 12)
plt.show() def runkMeans(X, initial_centroids,max_iters, plot_progress):
"""
RUNKMEANS runs the K-Means algorithm on data matrix X, where each row of X
is a single example
[centroids, idx] = RUNKMEANS(X, initial_centroids, max_iters, ...
plot_progress) runs the K-Means algorithm on data matrix X, where each
row of X is a single example. It uses initial_centroids used as the
initial centroids. max_iters specifies the total number of interactions
of K-Means to execute. plot_progress is a true/false flag that
indicates if the function should also plot its progress as the
learning happens. This is set to false by default. runkMeans returns
centroids, a Kxn matrix of the computed centroids and idx, a m x 1
vector of centroid assignments (i.e. each entry in range [1..K]).
"""
(m,n) = shape(X)
K = shape(initial_centroids)[0]
centroids = initial_centroids
clusterAssment = zeros((m,2)) #Run K-Means
for i in range(max_iters):
clusterAssment = findClosestCentroids(X, centroids)
centroids = computeCentroids(X, clusterAssment, K); return centroids, clusterAssment def main():
K =5
max_iters = 10
dataSet = loadDataSet('E://PythonSpace//TextClustering//data//test2.txt')
X = array(dataSet)
X = (X - mean(X)) / std(X) initial_centroids = kMeansInitCentroids(X, K)
myCentroids, clusterAssment = runkMeans(X, initial_centroids, max_iters,False);
print "-------------------------------------"
show(X, K, myCentroids, clusterAssment) main()

参考了Andrew Ng的Machine Learning Assignment(https://github.com/rieder91/MachineLearning/blob/master/Exercise%207/ex7/runkMeans.m)

以及博文http://www.cnblogs.com/MrLJC/p/4127553.html

运行结果:

k-means算法的Python实现的更多相关文章

  1. Fuzzy C Means 算法及其 Python 实现——写得很清楚,见原文

    Fuzzy C Means 算法及其 Python 实现 转自:http://note4code.com/2015/04/14/fuzzy-c-means-%E7%AE%97%E6%B3%95%E5% ...

  2. 分类算法——k最近邻算法(Python实现)(文末附工程源代码)

    kNN算法原理 k最近邻(k-Nearest Neighbor)算法是比较简单的机器学习算法.它采用测量不同特征值之间的距离方法进行分类,思想很简单:如果一个样本在特征空间中的k个最近邻(最相似)的样 ...

  3. KNN 与 K - Means 算法比较

    KNN K-Means 1.分类算法 聚类算法 2.监督学习 非监督学习 3.数据类型:喂给它的数据集是带label的数据,已经是完全正确的数据 喂给它的数据集是无label的数据,是杂乱无章的,经过 ...

  4. K-means算法

    K-means算法很简单,它属于无监督学习算法中的聚类算法中的一种方法吧,利用欧式距离进行聚合啦. 解决的问题如图所示哈:有一堆没有标签的训练样本,并且它们可以潜在地分为K类,我们怎么把它们划分呢?  ...

  5. Python实现kNN(k邻近算法)

    Python实现kNN(k邻近算法) 运行环境 Pyhton3 numpy科学计算模块 计算过程 st=>start: 开始 op1=>operation: 读入数据 op2=>op ...

  6. 机器学习算法与Python实践之(五)k均值聚类(k-means)

    机器学习算法与Python实践这个系列主要是参考<机器学习实战>这本书.因为自己想学习Python,然后也想对一些机器学习算法加深下了解,所以就想通过Python来实现几个比较常用的机器学 ...

  7. 机器学习算法与Python实践之(六)二分k均值聚类

    http://blog.csdn.net/zouxy09/article/details/17590137 机器学习算法与Python实践之(六)二分k均值聚类 zouxy09@qq.com http ...

  8. 用Python从零开始实现K近邻算法

    KNN算法的定义: KNN通过测量不同样本的特征值之间的距离进行分类.它的思路是:如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类别.K通 ...

  9. 机器学习 Python实践-K近邻算法

    机器学习K近邻算法的实现主要是参考<机器学习实战>这本书. 一.K近邻(KNN)算法 K最近邻(k-Nearest Neighbour,KNN)分类算法,理解的思路是:如果一个样本在特征空 ...

  10. K均值算法-python实现

    测试数据展示: #coding:utf-8__author__ = 'similarface''''实现K均值算法 算法摘要:-----------------------------输入:所有数据点 ...

随机推荐

  1. Unity3d 物体沿着正七边形轨迹移动

    不对之处,敬请谅解. 1.圆内接正七边形半径 public static float r = 10; 2.存储七个顶点的值 Vector3[] ar = new Vector3[7]; 3.圆心角 s ...

  2. Windows下搭建PHP开发环境【总结】

    一.准备工作-下载所需软件 Apache 进入apache服务器官网http://httpd.apache.org/ ,下面是下载的教程:http://jingyan.baidu.com/articl ...

  3. Hibernate 关系映射方式(1)

    来源:本文转载自:http://blog.csdn.net/huangaigang6688/article/details/7761310 Hibernate映射解析——七种映射关系 首先我们了解一个 ...

  4. Photoshop基础,前景背景,图层,选取

    1*前景色背景色 Alt+Delete 键 前景色填充 Ctrl+Delete 键 背景色填充 X 颜色转换 D 颜色互换 两个填充的原因: 2*图层(只要做东西就要建图层)透明的纸进行叠加,尽量多建 ...

  5. byte数组与int,long,short,byte转换 (转载)

    byte数组和short数组转换 public short bytesToShort(byte[] bytes) { return ByteBuffer.wrap(bytes).order(ByteO ...

  6. NumPy基础:数组和矢量计算

    今天被老板fire了,还是继续抄书吧,安抚我受伤的小心脏.知识还是得慢慢积累,一步一个脚印,这样或许才是最快的捷径. ------2015-2-16-------------------------- ...

  7. group_concat()函数总结

    group_concat(),手册上说明:该函数返回带有来自一个组的连接的非NULL值的字符串结果.比较抽象,难以理解. 通俗点理解,其实是这样的:group_concat()会计算哪些行属于同一组, ...

  8. 转载 Deep learning:三(Multivariance Linear Regression练习)

    前言: 本文主要是来练习多变量线性回归问题(其实本文也就3个变量),参考资料见网页:http://openclassroom.stanford.edu/MainFolder/DocumentPage. ...

  9. IE8浏览器兼容问题总结

    IE8+兼容经验小结 January 15, 2014 本文分享下我在项目中积累的IE8+兼容性问题的解决方法.根据我的实践经验,如果你在写HTML/CSS时候是按照W3C推荐的方式写的,然后下面的几 ...

  10. 转:【WebDriver】封装GET方法来解决页面跳转不稳定的问题

    在大多数测试环境中,网络或者测试服务器主机之间并不是永远不出问题的,很多时候一个客户端的一个跳转的请求会因为不稳定的网络或者偶发的其它异常hang死在那里半天不动,直到人工干预动作的出现.      ...