Python机器学习-聚类KMeans


基本原理

代码实现

先给出完整代码,再分别说明

# -*- coding: utf-8 -*-

import traceback
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

def draw_original(X, Y):
    """Scatter-plot the samples, coloured by their ground-truth values.

    Args:
        X: 2-D array of samples; columns 0 and 1 are used as the x and y
            coordinates of each point.
        Y: Ground-truth values, handed to ``plt.scatter`` as the ``c``
            (colour) argument.

    Returns:
        None. Any ``Exception`` raised while plotting is caught and its
        traceback is printed instead of being propagated.
    """
    try:
        plt.subplot(1, 1, 1)
        plt.scatter(X[:, 0], X[:, 1], c=Y)

        plt.title("original clusters")
        plt.xlabel("Feature1")
        plt.ylabel("Feature2")
        plt.show()
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()

def drawing_n_clusters(cluster_list, centroids):
    """Scatter-plot every cluster in its own colour and mark the centroids.

    Args:
        cluster_list: List of clusters; each cluster is itself a list of
            sample vectors (columns 0 and 1 are plotted).
        centroids: 2-D array of cluster centres; columns 0 and 1 are plotted
            as large yellow ``+`` markers.

    Returns:
        None. Any ``Exception`` raised while plotting is caught and its
        traceback is printed instead of being propagated.
    """
    try:
        n_clusters = len(cluster_list)
        # Store every cluster as an array so columns can be sliced.
        k_clusters = [np.array(cluster) for cluster in cluster_list]

        plt.subplot(1, 1, 1)
        color_list = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
        for idx, cluster in enumerate(k_clusters):
            if cluster.size == 0:
                # An empty cluster becomes a 1-D array that cannot be
                # column-sliced; there is nothing to draw for it anyway.
                continue
            # Cycle through the palette so more than 8 clusters do not
            # raise IndexError.
            color = color_list[idx % len(color_list)]
            label_str = "cluster" + str(idx)
            plt.scatter(cluster[:, 0], cluster[:, 1], c=color, label=label_str)

        plt.scatter(centroids[:, 0], centroids[:, 1], marker='+', color='y', s=200)

        plt.title(str(n_clusters) + " clusters")
        plt.xlabel("Feature1")
        plt.ylabel("Feature2")
        plt.legend(loc=2)

        plt.show()
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()

def draw_error(k_list, error_list):
    """Plot the total squared error obtained for each candidate K.

    Args:
        k_list: Sequence of K values; used as x coordinates and x ticks.
        error_list: Error value for each entry of ``k_list``, in the same
            order.

    Returns:
        None. Any ``Exception`` raised while plotting is caught and its
        traceback is printed instead of being propagated.
    """
    try:
        # Canvas set-up.
        plt.figure(figsize=(8, 5), dpi=80)
        plt.subplot(1, 1, 1)

        # Error curve with a marker at every K.
        plt.plot(k_list, error_list, marker='o', c='blue')

        # Title and axes.
        plt.title("Total Error vs. # of Clusters")
        plt.xticks(k_list)
        plt.xlabel("k")
        plt.ylabel("total squared error")

        plt.show()
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()

'''
-------------------------------------------------------------------------------
'''


def load_dataset(input_path):
    """Load a tab-separated file of labelled two-feature samples.

    Every non-blank line must contain at least three tab-separated fields:
    an integer label followed by two float features. Blank lines are
    skipped.

    Args:
        input_path: Path of the text file to read.

    Returns:
        A tuple ``(X, Y)`` in file order, where ``X`` is a list of
        ``[feature1, feature2]`` float lists and ``Y`` is a list of
        single-element ``[label]`` int lists. If the file cannot be read or
        parsed, the traceback is printed and None is returned instead.
    """
    try:
        X = []
        Y = []

        # The context manager closes the file even when a line fails to parse.
        with open(input_path, "r") as infile:
            for line in infile:
                if not line.strip():
                    # Tolerate blank lines, e.g. a trailing newline at EOF.
                    continue
                data = line.rstrip('\r\n').split('\t')
                Y.append([int(data[0])])
                X.append([float(data[1]), float(data[2])])

        # Report completion before returning; after the return statement this
        # message could never be printed.
        print("[INFO]: load_dataset is finished!")
        return X, Y
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()
        return None



def training(X, K):
    """Fit a KMeans clusterer with K clusters on the samples.

    Centres are seeded with k-means++ and the fit is restarted 10 times,
    keeping the best run. Both settings are passed explicitly so the
    behaviour does not depend on the installed scikit-learn's defaults.

    Args:
        X: Sample matrix, one sample vector per row.
        K: Number of clusters.

    Returns:
        A tuple ``(label, loss, centroids)`` taken from the fitted model's
        ``labels_``, ``inertia_`` and ``cluster_centers_`` attributes.
        ``inertia_`` is a sum of squared distances, not a mean. If fitting
        fails, the traceback is printed and None is returned instead.
    """
    try:
        kmeans = KMeans(n_clusters=K, init='k-means++', n_init=10).fit(X)
        return kmeans.labels_, kmeans.inertia_, kmeans.cluster_centers_
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()
        return None

def get_clusters(X, label, K):
    """Group the samples by their assigned cluster index.

    Args:
        X: Samples, indexable by position (list or array).
        label: Cluster index of each sample; ``label[i]`` belongs to
            ``X[i]``.
        K: Number of clusters.

    Returns:
        A list of K lists; entry ``k`` holds the samples labelled ``k`` in
        their original order. If grouping fails (for example a label that is
        out of range), the traceback is printed and None is returned instead.
    """
    try:
        # One bucket per cluster, each holding that cluster's sample vectors.
        cluster_list = [[] for _ in range(K)]
        for idx, cluster_idx in enumerate(label):
            cluster_list[cluster_idx].append(X[idx])
        return cluster_list
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()
        return None

'''
-------------------------------------------------------------------------------
'''

def find_K(input_path="../data/4k2_far.txt",
           output_path="../output/test_for_4k2/inertia.txt",
           k_max=20):
    """Train KMeans for K = 1..k_max, log each loss and plot the error curve.

    One line ``K=<k>,<loss>`` is written to ``output_path`` per K, then the
    losses are plotted against K with ``draw_error``.

    Args:
        input_path: Dataset file passed to ``load_dataset``.
        output_path: Text file that receives the per-K losses; it is
            overwritten.
        k_max: Largest K to try (inclusive).

    Returns:
        None. If loading or training fails, the helper has already printed
        its traceback and this function stops early; any other ``Exception``
        is caught here and its traceback printed.
    """
    try:
        dataset = load_dataset(input_path)
        if dataset is None:
            # load_dataset already reported the failure.
            return
        samples, _ = dataset
        X = np.array(samples)

        k_list = list(range(1, k_max + 1))
        error_list = []
        # The context manager closes the output file even if training fails.
        with open(output_path, "w") as outfile:
            for K in k_list:
                result = training(X, K)
                if result is None:
                    # training already reported the failure.
                    return
                _, loss, _ = result
                error_list.append(loss)
                outfile.write("K=" + str(K) + "," + str(loss) + '\n')

        draw_error(k_list, error_list)

        print("[INFO]: find_K is finished!")
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()

def main():
    """Cluster the sample dataset into K=2 groups and plot the result.

    Loads the dataset, fits KMeans, groups the samples by predicted label
    and draws the clusters together with their centroids.

    Returns:
        None. If a helper fails it has already printed its traceback and
        this function stops early; any other ``Exception`` is caught here
        and its traceback printed.
    """
    try:
        INPUT_PATH = "../data/4k2_far.txt"

        dataset = load_dataset(INPUT_PATH)
        if dataset is None:
            # load_dataset already reported the failure.
            return
        # The ground-truth labels are not needed for clustering.
        samples, _ = dataset
        X = np.array(samples)

        K = 2
        result = training(X, K)
        if result is None:
            # training already reported the failure.
            return
        label, _, centroids = result

        cluster_list = get_clusters(X, label, K)
        if cluster_list is None:
            # get_clusters already reported the failure.
            return
        drawing_n_clusters(cluster_list, centroids)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print (that emitted a stray "None").
        traceback.print_exc()

if __name__ == "__main__":
    # Script entry point: runs the K sweep. Call main() here instead to
    # cluster with a fixed K and plot the clusters.
    find_K()
智能推荐

注意!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。



 
© 2014-2019 ITdaan.com 粤ICP备14056181号  

赞助商广告