机器学习之聚类算法——聚类效果评估可视化

忘是亡心i 2023-06-28 04:55 68阅读 0赞

我曾在《机器学习之聚类算法应用篇》中介绍过,聚类算法常使用轮廓系数来评估聚类效果。不过有时候并不是轮廓系数越大越好,如下面两幅图所示,图中的红色虚线表示所有样本轮廓系数的平均值:

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzI2ODIyMDI5_size_16_color_FFFFFF_t_70

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzI2ODIyMDI5_size_16_color_FFFFFF_t_70 1

显然,将簇的个数设置为2时得到的轮廓系数最高,达到了0.705,但这并不一定是最好的聚类结果:在这个测试集中,我们实际上有4个簇。为了将各个簇的轮廓系数以可视化的形式展现出来,辅助选择聚类参数,《【机器学习】菜菜的sklearn课堂06 - 聚类算法与KMeans》中提供了对应的代码,能够绘制上面的可视化图,带有详细注释的代码见文末。

另外我已经将两份聚类可视化的代码封装,方便调参以及调用。

cluster_algos.py

第一份代码是做各个聚类算法对比的cluster_algos.py:

  1. # 不同聚类算法结果对比
  2. from sklearn.datasets import make_circles
  3. import matplotlib.pyplot as plt
  4. import numpy as np
  5. from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, Birch, MeanShift, AgglomerativeClustering
  6. from sklearn.metrics import silhouette_score, silhouette_samples
  7. from sklearn.decomposition import PCA
  8. import pandas as pd
  9. import time
  10. import functools
  def time_cost(func):
      """Decorate ``func`` so that each call is timed and reported.

      The wrapped callable runs ``func`` exactly once, prints a label followed
      by the elapsed time in seconds, and returns a ``(result, elapsed)``
      tuple. The label is the first positional argument when there is one.
      """
      @functools.wraps(func)
      def wrapper(*args, **kwargs):
          start = time.perf_counter()
          # Keep the result of the timed call; invoking func a second time
          # would repeat the work and hand back a result that was not timed.
          result = func(*args, **kwargs)
          elapsed = time.perf_counter() - start
          # Fall back to the function name for keyword-only calls.
          label = args[0] if args else func.__name__
          print(label, ':%.2fs' % elapsed)
          return result, elapsed
      return wrapper
  def load_data(file):
      """Read a CSV file and compute a two-component PCA of its values.

      Returns a tuple ``(x, pca_result)``: the values of the loaded frame and
      the output of ``PCA(n_components=2).fit_transform`` on those values.
      """
      assert file != ''
      features = pd.read_csv(file).values
      projected = PCA(n_components=2).fit_transform(features)
      return features, projected
  @time_cost
  def cluster_function(model_name, model, data):
      """Run ``model.fit_predict`` on ``data`` and return the predicted labels.

      ``model_name`` is not used inside the body itself.
      """
      return model.fit_predict(data)
  if __name__ == '__main__':
      # Candidate clustering models, keyed by the name shown in each subplot title.
      model_list = {
          "AgglomerativeClustering": AgglomerativeClustering(n_clusters = 4),
          "KMeans": KMeans(n_clusters = 4, random_state=10),
          "DBSCAN": DBSCAN(eps=0.1),
          "Birch": Birch(n_clusters=4),
          "SpectralClustering": SpectralClustering(n_clusters = 4, random_state=10),
          "MeanShift": MeanShift()
      }
      x, pca_result = load_data('./data.csv')
      fig = plt.figure(figsize=(15,10))
      # One subplot per model on a 2x3 grid; subplot positions start at 1.
      for position, (name, estimator) in enumerate(model_list.items(), start=1):
          fig.add_subplot(2, 3, position)
          labels, elapsed = cluster_function(name, estimator, x)
          plt.scatter(pca_result[:,0], pca_result[:,1], marker='.', c=labels)
          plt.title("{}({})".format(name, silhouette_score(x, labels)))
          # Elapsed time goes in the bottom-right corner, in axes coordinates.
          timing_text = ('%.2fs' % (elapsed)).lstrip('0')
          plt.text(.99, .01, timing_text, transform=plt.gca().transAxes, horizontalalignment='right')
      plt.show()

其生成结果如下图所示:

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzI2ODIyMDI5_size_16_color_FFFFFF_t_70 2

clusters_test

第二份代码用于测试在给定聚类算法的情况下,簇个数的选择对聚类结果的影响:

  1. # 不同簇个数聚类结果对比
  2. from sklearn.datasets import make_circles
  3. import matplotlib.pyplot as plt
  4. import numpy as np
  5. from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, Birch, MeanShift, AgglomerativeClustering
  6. from sklearn.metrics import silhouette_score, silhouette_samples
  7. from sklearn.decomposition import PCA
  8. import pandas as pd
  9. import time
  10. import functools
  11. import matplotlib.cm as cm
  def cluster_test(model_name, model, X, clusters_list=(2, 3, 4, 5, 6, 7)):
      """Plot a silhouette analysis of ``model`` for each candidate cluster count.

      For every value in ``clusters_list`` the model is given that
      ``n_clusters``, fitted on ``X`` through ``cluster_function``, and shown in
      a two-panel figure: per-cluster silhouette values on the left and a
      scatter of the module-level ``pca_result`` coloured by label on the right.

      If ``model`` has no ``n_clusters`` attribute it is fitted and plotted once
      for the first entry, and the function returns when the second entry is
      reached.

      Parameters:
          model_name: label used in the printed message and the figure title.
          model: clustering estimator; its ``labels_`` are read after fitting.
          X: data passed to the model and to the silhouette metrics.
          clusters_list: cluster counts to try, in order (immutable default).
      """
      for n_clusters in clusters_list:
          if hasattr(model, "n_clusters"):
              model.set_params(n_clusters=n_clusters)
          elif len(clusters_list) >= 2 and n_clusters == clusters_list[1]:
              print("{} do not have parameter 'n_clusters', return automatically.".format(model_name))
              return

          fig, (ax1, ax2) = plt.subplots(1, 2)
          fig.set_size_inches(18, 7)
          ax1.set_xlim([-0.1, 1])
          # Room for every sample plus a 10-row gap around each cluster band.
          ax1.set_ylim([0, X.shape[0] + (n_clusters + 1) * 10])

          clusterer, t = cluster_function(model_name, model, X)
          cluster_labels = clusterer.labels_
          silhouette_avg = silhouette_score(X, cluster_labels)
          print("For n_clusters = ", n_clusters, " the average silhoutte_score is ", silhouette_avg)
          sample_silhouette_values = silhouette_samples(X, cluster_labels)

          # Left panel: one sorted band of silhouette values per cluster.
          y_lower = 10
          for i in range(n_clusters):
              ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
              ith_cluster_silhouette_values.sort()
              size_cluster_i = ith_cluster_silhouette_values.shape[0]
              y_upper = y_lower + size_cluster_i
              color = cm.nipy_spectral(float(i) / n_clusters)
              ax1.fill_betweenx(np.arange(y_lower, y_upper), ith_cluster_silhouette_values, facecolor=color, alpha=0.7)
              # Label the band with its cluster index at mid-height.
              ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
              y_lower = y_upper + 10

          ax1.set_title("The silhouette plot for the various clusters")
          ax1.set_xlabel("The silhouette coefficient values")
          ax1.set_ylabel("Cluster label")
          # Dashed red line marks the average silhouette score.
          ax1.axvline(x=silhouette_avg, color='red', linestyle="--")
          ax1.set_yticks([])
          ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

          # Right panel: points from the module-level pca_result, coloured by label.
          colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
          ax2.scatter(pca_result[:, 0], pca_result[:, 1], marker='o', s=8, c=colors)
          if hasattr(clusterer, 'cluster_centers_'):
              # NOTE(review): the centres use the first two columns of
              # cluster_centers_, while the points come from pca_result;
              # confirm both are in the same coordinate space.
              centers = clusterer.cluster_centers_
              ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c='red', alpha=1, s=200)
          # Anchor the timing label to ax2 explicitly instead of relying on
          # whichever axes pyplot currently considers active.
          ax2.text(.99, .01, ('%.2fs' % (t)).lstrip('0'), transform=ax2.transAxes, size=12, horizontalalignment='right')
          ax2.set_title("The visualization of the clustered data")
          ax2.set_xlabel("Feature space for the 1st feature")
          ax2.set_ylabel("Feature space for the 2nd feature")

          plt.suptitle("Silhouette analysis for {} clustering on sample data with n_clusters = {} ({})".format(model_name, n_clusters, silhouette_avg), fontsize=14, fontweight="bold")
          plt.show()
  def time_cost(func):
      """Decorate ``func`` so that each call also reports its duration.

      The wrapped callable runs ``func`` exactly once and returns a
      ``(result, elapsed_seconds)`` tuple.
      """
      @functools.wraps(func)
      def wrapper(*args, **kwargs):
          start = time.perf_counter()
          # Keep the result of the timed call; invoking func a second time
          # would repeat the work and hand back a result that was not timed.
          result = func(*args, **kwargs)
          elapsed = time.perf_counter() - start
          return result, elapsed
      return wrapper
  @time_cost
  def cluster_function(model_name, model, data):
      """Fit ``model`` on ``data`` and return whatever ``model.fit`` returns.

      ``model_name`` is not used inside the body itself.
      """
      return model.fit(data)
  def load_data(file):
      """Load a CSV file and reduce its values to two PCA components.

      Returns ``(x, pca_result)``: the values of the loaded frame, and the
      result of fitting a two-component PCA on them and transforming them.
      """
      assert file != ''
      raw_values = pd.read_csv(file).values
      reducer = PCA(n_components=2)
      return raw_values, reducer.fit_transform(raw_values)
  # Load the data, then run the silhouette analysis for agglomerative clustering.
  x, pca_result = load_data('data.csv')
  cluster_test(
      "AgglomerativeClustering",
      AgglomerativeClustering(),
      x,
  )

绘图代码详细解释

此原始代码来自【机器学习】菜菜的sklearn课堂06 - 聚类算法与KMeans

  import numpy as np
  import matplotlib.cm as cm
  # pyplot is required below (plt.subplots / plt.suptitle / plt.show).
  import matplotlib.pyplot as plt
  from sklearn.metrics import silhouette_score, silhouette_samples
  from sklearn.datasets import make_blobs
  from sklearn.cluster import KMeans

  X, y = make_blobs(n_samples=500, n_features=2, centers=4, random_state = 1)

  for n_clusters in [2,3,4,5,6,7]:
      # Create a figure with two subplots laid out in one row.
      fig, (ax1, ax2) = plt.subplots(1, 2)
      # Figure size.
      fig.set_size_inches(18, 7)
      ax1.set_xlim([-0.1, 1])
      # Room for every sample plus a 10-row gap around each cluster band.
      ax1.set_ylim([0, X.shape[0] + (n_clusters + 1) * 10])

      # Cluster the data.
      clusterer = KMeans(n_clusters = n_clusters, random_state = 10).fit(X)
      cluster_labels = clusterer.labels_
      # Average silhouette score over all samples.
      silhouette_avg = silhouette_score(X, cluster_labels)
      print("For n_clusters = ", n_clusters, " the average silhoutte_score is ", silhouette_avg)
      # silhouette_samples returns one silhouette value per sample; these
      # values are plotted along the x axis.
      sample_silhouette_values = silhouette_samples(X, cluster_labels)

      # Starting position on the y axis.
      y_lower = 10
      # Loop over every cluster.
      for i in range(n_clusters):
          # Select the silhouette values of the samples in cluster i ...
          ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
          # ... and sort them in place.
          ith_cluster_silhouette_values.sort()
          # Number of samples in this cluster.
          size_cluster_i = ith_cluster_silhouette_values.shape[0]
          # End of this cluster's band on the y axis.
          y_upper = y_lower + size_cluster_i
          # Pick a colour from the colormap using a fraction in [0, 1).
          color = cm.nipy_spectral(float(i) / n_clusters)
          ax1.fill_betweenx(np.arange(y_lower, y_upper), ith_cluster_silhouette_values, facecolor = color, alpha = 0.7)
          # Label the band with its cluster index.
          ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
          # Starting y position for the next cluster.
          y_lower = y_upper + 10

      # Title and axis labels for the first subplot.
      ax1.set_title("The silhouette plot for the various clusters")
      ax1.set_xlabel("The silhouette coefficient values")
      ax1.set_ylabel("Cluster label")
      # Draw the average silhouette score as a dashed vertical line.
      ax1.axvline(x = silhouette_avg, color = 'red', linestyle = "--")
      # Hide the y-axis ticks.
      ax1.set_yticks([])
      # Set the x-axis ticks.
      ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

      # Second subplot: the samples coloured by cluster label.
      colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
      ax2.scatter(X[:,0], X[:,1], marker = 'o', s = 8, c = colors)
      # Add the fitted centroids to the plot.
      centers = clusterer.cluster_centers_
      ax2.scatter(centers[:, 0], centers[:, 1], marker = 'x', c = 'red', alpha = 1, s = 200)
      # Title and axis labels for the second subplot.
      ax2.set_title("The visualization of the clustered data")
      ax2.set_xlabel("Feature space for the 1st feature")
      ax2.set_ylabel("Feature space for the 2nd feature")

      # Title for the whole figure.
      plt.suptitle("Silhouette analysis for KMeans clustering on sample data with n_clusters = {}".format(n_clusters), fontsize = 14, fontweight="bold")
      plt.show()

(完)

发表评论

表情:
评论列表 (有 0 条评论,68人围观)

还没有评论,来说两句吧...

相关阅读

    相关 机器学习算法KMeans

    算法原理 聚类指的是把集合,分组成多个类,每个类中的对象都是彼此相似的。K-means是聚类中最常用的方法之一,它是基于点与点距离的相似度来计算最佳类别归属。 在使用该

    相关 效果评估指标总结

    前言 实际工作中经常会用到一些聚类算法对一些数据进行聚类处理,如何评估每次聚类效果的好坏?可选的方法有1、根据一些聚类效果的指标来评估;2、直接打点。今天就主要总结下这段