Machine Learning Algorithms: KMeans Clustering


Algorithm Principle

Clustering partitions a collection into groups so that the objects within each group are similar to one another. K-means is one of the most commonly used clustering methods; it uses point-to-point distance as the similarity measure to decide which cluster each point belongs to.

Before using the method, note the following: (1) handle outliers in the data; (2) normalize the data, e.g. with min-max scaling, (x - min(x)) / (max(x) - min(x)); (3) the clusters should be roughly equal in size; (4) the feature values of different clusters should differ substantially.
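For instance, applying the min-max formula from point (2) to a single feature column looks like this (a minimal sketch with made-up numbers, just for illustration):

```python
# Min-max scaling of one feature column (toy numbers for illustration)
x = [2.0, 5.0, 11.0]
lo, hi = min(x), max(x)
scaled = [(v - lo) / (hi - lo) for v in x]
print(scaled)  # [0.0, 0.3333333333333333, 1.0]
```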

Algorithm Flow

(1) Choose k initial cluster centers.

(2) Compute the distance from each object to each of the k centers and assign the object to the nearest cluster (minimum-distance rule).

(3) Take the mean of the samples in each cluster as the new cluster center.

(4) Repeat steps (2) and (3) until the cluster centers no longer change.

(5) Stop; the result is k clusters (a minimal sketch of this loop follows below).
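Here is a minimal, self-contained sketch of the loop above (illustrative only, not the full implementation given later in the post; `points` and `k` are hypothetical inputs, and squared Euclidean distance is used, which is equivalent to Euclidean for picking the nearest center):

```python
from random import sample

def kmeans_sketch(points, k, n_iter=100):
    """Lloyd iteration in miniature. `points` is a list of equal-length
    numeric lists; returns the k cluster centers."""
    centers = sample(points, k)  # step (1): pick k distinct initial centers
    for _ in range(n_iter):
        # Step (2): assign each point to its nearest center
        clusters = [[] for _ in range(k)]
        for p in points:
            nearest = min(range(k),
                          key=lambda j: sum((a - b) ** 2
                                            for a, b in zip(p, centers[j])))
            clusters[nearest].append(p)
        # Step (3): move each center to the mean of its cluster
        new_centers = [[sum(col) / len(col) for col in zip(*cluster)]
                       if cluster else centers[j]
                       for j, cluster in enumerate(clusters)]
        # Step (4): stop once the centers no longer change
        if new_centers == centers:
            break
        centers = new_centers
    return centers  # step (5): the k cluster centers

# Example: two well-separated groups yield centers near [0, 0.5] and [10, 10.5]
print(kmeans_sketch([[0, 0], [0, 1], [10, 10], [10, 11]], 2))
```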

Code Implementation

```python
# coding=utf-8
from collections import Counter
from copy import deepcopy
from time import time
from random import randint, random


def run_time(fn):
    """Decorator that reports how long the wrapped function takes to run."""
    def fun():
        start = time()
        fn()
        ret = time() - start
        if ret < 1e-6:
            unit = "ns"
            ret *= 1e9
        elif ret < 1e-3:
            unit = "us"
            ret *= 1e6
        elif ret < 1:
            unit = "ms"
            ret *= 1e3
        else:
            unit = "s"
        print("Total run time is %.1f %s\n" % (ret, unit))
    return fun


def load_data():
    """Load the breast cancer dataset; the last CSV column is the label."""
    f = open("boston/breast_cancer.csv")
    X = []
    y = []
    for line in f:
        line = line[:-1].split(',')
        xi = [float(s) for s in line[:-1]]
        yi = line[-1]
        if '.' in yi:
            yi = float(yi)
        else:
            yi = int(yi)
        X.append(xi)
        y.append(yi)
    f.close()
    return X, y


def min_max_scale(X):
    """Scale every feature to the [0, 1] range."""
    m = len(X[0])
    x_max = [-float('inf') for _ in range(m)]
    x_min = [float('inf') for _ in range(m)]
    for row in X:
        x_max = [max(a, b) for a, b in zip(x_max, row)]
        x_min = [min(a, b) for a, b in zip(x_min, row)]
    ret = []
    for row in X:
        tmp = [(x - b) / (a - b) for a, b, x in zip(x_max, x_min, row)]
        ret.append(tmp)
    return ret


def get_euclidean_distance(arr1, arr2):
    """Euclidean distance between two vectors."""
    return sum((x1 - x2) ** 2 for x1, x2 in zip(arr1, arr2)) ** 0.5


def get_cosine_distance(arr1, arr2):
    """Cosine distance, i.e. 1 - cosine similarity, so smaller means more alike."""
    numerator = sum(x1 * x2 for x1, x2 in zip(arr1, arr2))
    denominator = (sum(x1 ** 2 for x1 in arr1) *
                   sum(x2 ** 2 for x2 in arr2)) ** 0.5
    return 1 - numerator / denominator


class KMeans(object):
    """K-means clustering.

    Attributes:
        k -- number of clusters
        n_features -- number of features
        cluster_centers -- cluster centers
        distance_fn -- distance function
        cluster_samples_cnt -- number of samples in each cluster
    """

    def __init__(self):
        self.k = None
        self.n_features = None
        self.cluster_centers = None
        self.distance_fn = None
        self.cluster_samples_cnt = None

    def bin_search(self, target, nums):
        """Binary search: index of the first element of sorted nums greater than target."""
        low = 0
        high = len(nums) - 1
        assert nums[low] <= target < nums[high], "Cannot find target!"
        while 1:
            mid = (low + high) // 2
            if mid == 0 or target >= nums[mid]:
                low = mid + 1
            elif target < nums[mid - 1]:
                high = mid - 1
            else:
                break
        return mid

    def cmp_arr(self, arr1, arr2, eps=1e-8):
        """Check whether two vectors are (numerically) the same vector."""
        return len(arr1) == len(arr2) and \
            all(abs(a - b) < eps for a, b in zip(arr1, arr2))

    def init_cluster_centers(self, X, k, n_features, distance_fn):
        """Initialize centers k-means++ style: samples far from the previous
        center are more likely to be picked as the next center."""
        n = len(X)
        centers = [X[randint(0, n - 1)]]
        for _ in range(k - 1):
            center_pre = centers[-1]
            idxs_dists = ([i, distance_fn(Xi, center_pre)] for i, Xi in enumerate(X))
            # Sort by distance to the previous center
            idxs_dists = sorted(idxs_dists, key=lambda x: x[1])
            dists = [x[1] for x in idxs_dists]
            # Normalize, then turn into a cumulative distribution
            tot = sum(dists)
            dists = [d / tot for d in dists]
            for i in range(1, n):
                dists[i] += dists[i - 1]
            # Sample a new center from the cumulative distribution,
            # rejecting duplicates of existing centers
            while 1:
                num = random()
                # Find the first cumulative distance greater than num
                dist_idx = self.bin_search(num, dists)
                row_idx = idxs_dists[dist_idx][0]
                center_cur = X[row_idx]
                if not any(self.cmp_arr(center_cur, center) for center in centers):
                    break
            centers.append(center_cur)
        return centers

    def get_nearest_center(self, Xi, centers, distance_fn):
        """Index of the center closest to the sample Xi."""
        return min(((i, distance_fn(Xi, center)) for
                    i, center in enumerate(centers)), key=lambda x: x[1])[0]

    def get_nearest_centers(self, X, distance_fn, centers):
        """Nearest center index for every sample in X."""
        return [self.get_nearest_center(Xi, centers, distance_fn) for Xi in X]

    def get_empty_cluster_idxs(self, cluster_samples_cnt, k):
        """Indices of the clusters that contain no samples."""
        return [i for i in range(k) if cluster_samples_cnt[i] == 0]

    def get_furthest_row(self, X, distance_fn, centers, empty_cluster_idxs):
        """Sample of X with the largest total distance to all non-empty cluster centers."""
        def f(Xi, centers):
            return sum(distance_fn(Xi, center) for center in centers)

        non_empty_centers = [center for i, center in enumerate(centers)
                             if i not in empty_cluster_idxs]
        return max(((Xi, f(Xi, non_empty_centers)) for Xi in X),
                   key=lambda x: x[1])[0]

    def process_empty_clusters(self, X, distance_fn, centers, empty_cluster_idxs):
        """Re-seed every empty cluster with the sample furthest from the non-empty centers."""
        for i in empty_cluster_idxs:
            center_cur = self.get_furthest_row(X, distance_fn, centers,
                                               empty_cluster_idxs)
            centers[i] = center_cur
        return centers

    def get_cluster_centers(self, X, k, n_features, y, cluster_samples_cnt):
        """Recompute every center as the mean of the samples assigned to it."""
        ret = [[0 for _ in range(n_features)] for _ in range(k)]
        for Xi, center_num in zip(X, y):
            for j in range(n_features):
                ret[center_num][j] += Xi[j] / cluster_samples_cnt[center_num]
        return ret

    def fit(self, X, k, fn=None, n_iter=100):
        """Train: run n_iter rounds of assignment and center updates.

        fn selects the distance: "eu" (Euclidean, the default) or "cos" (cosine).
        """
        n_features = len(X[0])
        if fn is None:
            distance_fn = get_euclidean_distance
        else:
            error_msg = "Parameter fn must be eu or cos!"
            assert fn in ("eu", "cos"), error_msg
            if fn == "eu":
                distance_fn = get_euclidean_distance
            if fn == "cos":
                distance_fn = get_cosine_distance
        centers = self.init_cluster_centers(X, k, n_features, distance_fn)
        for i in range(n_iter):
            while 1:
                # Assign every sample to its nearest cluster center
                y = self.get_nearest_centers(X, distance_fn, centers)
                # Count the samples in each cluster
                cluster_samples_cnt = Counter(y)
                # Find the empty clusters
                empty_cluster_idxs = self.get_empty_cluster_idxs(cluster_samples_cnt, k)
                # Re-seed the empty clusters, if there are any
                if empty_cluster_idxs:
                    centers = self.process_empty_clusters(X, distance_fn, centers,
                                                          empty_cluster_idxs)
                else:
                    break
            centers_new = self.get_cluster_centers(X, k, n_features, y,
                                                   cluster_samples_cnt)
            centers = deepcopy(centers_new)
            print("Iteration: %d" % i)
        self.k = k
        self.n_features = n_features
        self.distance_fn = distance_fn
        self.cluster_centers = centers
        self.cluster_samples_cnt = cluster_samples_cnt

    def _predict(self, Xi):
        """Cluster index of a single sample."""
        return self.get_nearest_center(Xi, self.cluster_centers, self.distance_fn)

    def predict(self, X):
        """Cluster index of every sample in X."""
        return [self._predict(Xi) for Xi in X]


@run_time
def main():
    print("Testing the performance of Kmeans...")
    # Load and normalize the data
    X, y = load_data()
    X = min_max_scale(X)
    # Train the model
    est = KMeans()
    k = 2
    est.fit(X, k)
    print()
    # Model performance: how strongly each cluster leans positive
    prob_pos = sum(y) / len(y)
    print("Positive probability of X is:%.1f%%.\n" % (prob_pos * 100))
    y_hat = est.predict(X)
    cluster_pos_tot_cnt = {i: [0, 0] for i in range(k)}
    for yi_hat, yi in zip(y_hat, y):
        cluster_pos_tot_cnt[yi_hat][0] += yi
        cluster_pos_tot_cnt[yi_hat][1] += 1
    cluster_prob_pos = {i: v[0] / v[1] for i, v in cluster_pos_tot_cnt.items()}
    for i in range(k):
        tot_cnt = cluster_pos_tot_cnt[i][1]
        prob_pos = cluster_prob_pos[i]
        print("Count of elements in cluster %d is:%d." % (i, tot_cnt))
        print("Positive probability of cluster %d is:%.1f%%.\n" % (i, prob_pos * 100))


if __name__ == "__main__":
    main()
```

Results of 100 Rounds of Clustering on the Breast Cancer Dataset

After 100 rounds of clustering, most of the positive samples end up in one cluster and most of the negative samples in the other, which demonstrates the effectiveness of the clustering algorithm.

GitHub Source

https://github.com/BBuf/machine-learning
