import matplotlib.pyplot as plt
import numpy as np
##########加载数据############
def load_data_set(path='myspace/svm_data.txt'):
    """
    Load a whitespace-separated data set of 2-D points with integer labels.

    Each non-blank line of the file must hold at least three fields:
    two feature values followed by an integer class label.  A sample file
    is available at
    https://github.com/varyshare/AiLearning/blob/master/data/6.SVM/testSet.txt

    :param path: location of the text file to read; defaults to the
        path this script has always used.
    :return: a tuple ``(data_arr, label_arr)`` of NumPy arrays
        data_arr -- the two feature values of every sample, one row per sample
        label_arr -- the class label of every sample, in the same order
    """
    data_arr = []
    label_arr = []
    # The context manager closes the file even if a line fails to parse.
    with open(path, 'r') as f:
        for line in f:
            line_arr = line.strip().split()
            if not line_arr:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Use the builtin float: the np.float alias was removed in NumPy 1.24.
            data_arr.append([float(line_arr[0]), float(line_arr[1])])
            label_arr.append(int(line_arr[2]))
    return np.array(data_arr), np.array(label_arr)
x, label = load_data_set()
# Uncomment to scatter-plot the raw points and judge by eye how many
# clusters the data contains.
# plt.scatter(x[:, 0], x[:, 1])

############## k-Means ##############
# Hand-picked initial cluster centres, one row per cluster.
p1 = [6, 4]
p2 = [1, 3]
# dtype=float matters: the initial centres are Python ints, so without it the
# array is integer-typed and every recomputed mean is truncated when stored.
cluster_center = np.array([p1, p2], dtype=float)
# Derive k from the centres so the two can never disagree.
k = cluster_center.shape[0]
# clusters[j] holds the row indices (into x) of the points assigned to centre j.
clusters = [[] for _ in range(k)]
epoch = 3
for _ in range(epoch):
    # Start every pass with empty clusters.
    clusters = [[] for _ in range(k)]
    # Assign each point to the centre with the smallest squared Euclidean distance.
    for i, xi in enumerate(x):
        distances = np.sum((cluster_center - xi) ** 2, axis=1)
        clusters[np.argmin(distances)].append(i)
    # Move each centre to the mean of the points assigned to it.
    for j, members in enumerate(clusters):
        if members:
            cluster_center[j] = x[members].mean(axis=0)
        # An empty cluster keeps its previous centre: averaging zero points
        # would divide by zero and produce NaN.
# One scatter call per cluster so each gets its own colour.
for members in clusters:
    plt.scatter(x[members, 0], x[members, 1])