分类算法实例四:用Logistic算法和KNN算法进行鸢尾花数据分类

墨蓝 2023-07-13 13:37 240阅读 0赞
  1. import numpy as np
  2. import matplotlib as mpl
  3. import matplotlib.pyplot as plt
  4. import pandas as pd
  5. import warnings
  6. import sklearn
  7. from sklearn.linear_model import LogisticRegressionCV
  8. from sklearn.linear_model.coordinate_descent import ConvergenceWarning
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.preprocessing import StandardScaler
  11. from sklearn.neighbors import KNeighborsClassifier
  12. from sklearn.preprocessing import label_binarize
  13. from sklearn import metrics
  14. # 设置字符集,防止中文乱码
  15. mpl.rcParams['font.sans-serif']=[u'simHei']
  16. mpl.rcParams['axes.unicode_minus']=False
  17. # 拦截异常
  18. warnings.filterwarnings(action = 'ignore', category=ConvergenceWarning)
  19. # 数据加载
  20. path = "datas/iris.data"
  21. names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'cla']
  22. df = pd.read_csv(path, header=None, names=names)
  23. df['cla'].value_counts()
  24. df.head()

20200312132438124.png

  1. def parseRecord(record):
  2. result=[]
  3. r = zip(names,record)
  4. for name,v in r:
  5. if name == 'cla':
  6. if v == 'Iris-setosa':
  7. result.append(1)
  8. elif v == 'Iris-versicolor':
  9. result.append(2)
  10. elif v == 'Iris-virginica':
  11. result.append(3)
  12. else:
  13. result.append(np.nan)
  14. else:
  15. result.append(float(v))
  16. return result
  17. # 1. 数据转换为数字以及分割
  18. # 数据转换
  19. datas = df.apply(lambda r: parseRecord(r), axis=1)
  20. # 异常数据删除
  21. datas = datas.dropna(how='any')
  22. # 数据分割
  23. X = datas[names[0:-1]]
  24. Y = datas[names[-1]]
  25. # 数据抽样(训练数据和测试数据分割)
  26. X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
  27. print ("原始数据条数:%d;训练数据条数:%d;特征个数:%d;测试样本条数:%d" % (len(X), len(X_train), X_train.shape[1], X_test.shape[0]))

20200312133056855.png

  1. # 2. 数据标准化
  2. ss = StandardScaler()
  3. X_train = ss.fit_transform(X_train)
  4. X_test = ss.transform(X_test)
  5. # 3. 特征选择
  6. # 4. 降维处理
  7. # 5. 模型构建
  8. lr = LogisticRegressionCV(Cs=np.logspace(-4,1,50), cv=3, fit_intercept=True, penalty='l2', solver='lbfgs', tol=0.01, multi_class='multinomial')
  9. # solver可选:‘newton-cg’, 'lbfgs', 'liblinear', 'sag'(mini-batch),默认为liblinear
  10. lr.fit(X_train, Y_train)

20200312134313510.png

# 6. Evaluate the logistic model.
# One-hot encode the true labels so a micro-averaged ROC can be computed.
y_test_hot = label_binarize(Y_test,classes=(1,2,3))
# Per-class decision-function scores for the test set.
lr_y_score = lr.decision_function(X_test)
# Micro-averaged ROC: flatten labels and scores across all three classes.
lr_fpr, lr_tpr, lr_threasholds = metrics.roc_curve(y_test_hot.ravel(),lr_y_score.ravel())
# lr_threasholds holds the score thresholds corresponding to each ROC point.
# Area under the micro-averaged ROC curve.
lr_auc = metrics.auc(lr_fpr, lr_tpr)
# NOTE(review): score() is mean accuracy on the TRAINING set here, not an R²
# value despite the "R值" label — confirm intent.
print ("Logistic算法R值:", lr.score(X_train, Y_train))
print ("Logistic算法AUC值:", lr_auc)
# 7. Predict test-set class labels.
lr_y_predict = lr.predict(X_test)

20200312134703137.png

# KNN implementation.
# a. Build the model: a 3-nearest-neighbour classifier.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
# b. Evaluate.
# One-hot encode the true labels for the micro-averaged ROC.
y_test_hot = label_binarize(Y_test,classes=(1,2,3))
# KNN has no decision function; use class-membership probabilities as scores.
knn_y_score = knn.predict_proba(X_test)
# Micro-averaged ROC across the three classes.
knn_fpr, knn_tpr, knn_threasholds = metrics.roc_curve(y_test_hot.ravel(),knn_y_score.ravel())
# Area under the micro-averaged ROC curve.
knn_auc = metrics.auc(knn_fpr, knn_tpr)
# NOTE(review): score() is mean accuracy on the TRAINING set, not R² — confirm.
print ("KNN算法R值:", knn.score(X_train, Y_train))
print ("KNN算法AUC值:", knn_auc)
# c. Predict test-set class labels.
knn_y_predict = knn.predict(X_test)
knn_y_score  # notebook-style display of the probability matrix (no-op in a script)

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dvbmd4aWZhY2FpX2JlbGlldmU_size_16_color_FFFFFF_t_70

  1. # 画图1:ROC曲线画图
  2. plt.figure(figsize=(8, 6), facecolor='w')
  3. plt.plot(lr_fpr,lr_tpr,c='r',lw=2,label=u'Logistic算法,AUC=%.3f' % lr_auc)
  4. plt.plot(knn_fpr,knn_tpr,c='g',lw=2,label=u'KNN算法,AUC=%.3f' % knn_auc)
  5. plt.plot((0,1),(0,1),c='#a0a0a0',lw=2,ls='--')
  6. # 设置X轴的最大和最小值
  7. plt.xlim(-0.01, 1.02)
  8. # 设置y轴的最大和最小值
  9. plt.ylim(-0.01, 1.02)
  10. plt.xticks(np.arange(0, 1.1, 0.1))
  11. plt.yticks(np.arange(0, 1.1, 0.1))
  12. plt.xlabel('False Positive Rate(FPR)', fontsize=16)
  13. plt.ylabel('True Positive Rate(TPR)', fontsize=16)
  14. plt.grid(b=True, ls=':')
  15. plt.legend(loc='lower right', fancybox=True, framealpha=0.8, fontsize=12)
  16. plt.title(u'鸢尾花数据Logistic和KNN算法的ROC/AUC', fontsize=18)
  17. plt.show()

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dvbmd4aWZhY2FpX2JlbGlldmU_size_16_color_FFFFFF_t_70 1

  1. # 画图2:预测结果画图
  2. x_test_len = range(len(X_test))
  3. plt.figure(figsize=(12, 9), facecolor='w')
  4. plt.ylim(0.5,3.5)
  5. plt.plot(x_test_len, Y_test, 'ro',markersize = 6, zorder=3, label=u'真实值')
  6. plt.plot(x_test_len, lr_y_predict, 'go', markersize = 10, zorder=2, label=u'Logis算法预测值,$R^2$=%.3f' % lr.score(X_test, Y_test))
  7. plt.plot(x_test_len, knn_y_predict, 'yo', markersize = 16, zorder=1, label=u'KNN算法预测值,$R^2$=%.3f' % knn.score(X_test, Y_test))
  8. plt.legend(loc = 'lower right')
  9. plt.xlabel(u'数据编号', fontsize=18)
  10. plt.ylabel(u'种类', fontsize=18)
  11. plt.title(u'鸢尾花数据分类', fontsize=20)
  12. plt.show()

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dvbmd4aWZhY2FpX2JlbGlldmU_size_16_color_FFFFFF_t_70 2

发表评论

表情:
评论列表 (有 0 条评论,240人围观)

还没有评论,来说两句吧...

相关阅读

    相关 [分类] KNN算法

    KNN算法 KNN,K-NearestNeighbor——K最近邻(不常用) 是什么? 是机器学习中有监督机器学习下的一种简单的分类算法. K最近邻,就是k个

    相关 KNN分类算法

    KNN分类算法 最简单最初级的分类器,就是将全部的训练数据所对应的类别都记录下来,当测试对象的属性和某个训练对象的属性完全匹配时,便可以对其进行分类 K近邻(k-nea