分类算法——基于heart数据集实现

🕗 发布于 2024-11-27 20:47 分类数据挖掘 人工智能

1 heart数据集——描述性统计分析

import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset
heart = pd.read_csv(r"heart.csv", sep=',')

# Check the columns in the DataFrame
print(heart.columns)

# Count the samples per class once and reuse the result below
a = heart.loc[:, 'y'].value_counts()
print(a)

# Order the counts by class value (0 = no heart disease, 1 = heart disease)
# instead of by frequency, so the tick labels below can never be swapped
# if class 1 happens to be the more frequent one.
class_counts = a.reindex([0, 1], fill_value=0)
positions = range(len(class_counts))

# Draw the bars a single time, one colour per class
plt.bar(positions, class_counts.values, color=['#FF0000', '#00FF00'])

# Label the two bars and keep the tick labels horizontal (rotation=0)
plt.xticks(positions, ['No heart disease', 'Yes heart disease'], rotation=0)

# Write each count just above its bar
for x, count in zip(positions, class_counts.values):
    plt.text(x, count, '%s' % count, ha='center', va='bottom')

# Set the title
plt.title('Heart disease distribution')
plt.show()

Index(['sbp', 'tobacco', 'ldl', 'adiposity', 'age', 'y'], dtype='object')
y
0    302
1    160
Name: count, dtype: int64

在这里插入图片描述

2 交叉验证（10折），选择最优的k值进行KNN分类

# 10-fold cross-validation to choose the number of neighbours (k) for KNN
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Features are the first five columns; the label is column 'y'
X = heart.iloc[:, 0:5]
y = heart.loc[:, 'y']

# NOTE(review): the features are fed to KNN unscaled. KNN is distance-based,
# so standardizing the columns first may change which k wins -- confirm
# whether that is intended.
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

# Pick the k with the highest mean accuracy. Looking the value up in k_range
# (instead of "index + 1") stays correct if the range start or step changes.
# On ties the smallest k wins because list.index returns the first match.
best_score = max(k_scores)
k = k_range[k_scores.index(best_score)]
print('Optimal k: %d' % k)

# Plot the accuracy curve once and mark the selected k on it
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.scatter(k, best_score, color='red')

# Annotate the marker with (k, accuracy)
plt.text(k, best_score, '(%d, %.2f)' % (k, best_score), ha='center', va='bottom')
plt.show()

Optimal k: 22

在这里插入图片描述

KNN分类器

# Train a KNN classifier with the k selected above and score it on a hold-out split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# NOTE(review): k was chosen by cross-validating on the full dataset, which
# includes these test rows -- the accuracy below may be optimistic.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

# Score the predictions made for the held-out rows
y_pred = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % test_accuracy)

#绘制决策区域
from matplotlib.colors import ListedColormap
import numpy as np
from sklearn.decomposition import PCA

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions in the plane of the first two PCs.

    A 2-component PCA is fitted on ``X``. A regular grid is laid over the
    projected points, every grid node is mapped back to the original feature
    space with ``inverse_transform`` and passed to ``classifier.predict``.
    The predictions are drawn as filled contours on the current matplotlib
    axes, with the projected samples scattered on top, one marker per class.

    Parameters
    ----------
    X : array-like, 2-D
        Samples in the original feature space (rows are samples).
    y : array-like, 1-D
        Class label for each row of ``X``. At most five distinct labels are
        supported, because five markers/colours are defined.
    classifier : object
        Fitted estimator whose ``predict`` accepts arrays in the original
        feature space.
    test_idx : sequence of int or None, optional
        Row positions in ``X`` to highlight as the test set. ``None`` or an
        empty sequence disables the highlight.
    resolution : float, optional
        Grid step in PCA units. The number of grid nodes grows with
        ``(extent / resolution) ** 2``, so a small step on wide-ranging data
        is slow and memory-hungry.

    Returns
    -------
    None
        The function only draws; the caller adds labels/legend and shows it.
    """
    # Work on plain arrays so boolean masks and positional indexing behave
    # the same whether the caller passes ndarrays, DataFrames or Series.
    X = np.asarray(X)
    y = np.asarray(y)

    # Reduce dimensionality to 2D using PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # setup marker generator and color map
    classes = np.unique(y)
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface on a grid padded by one unit on every side
    x1_min, x1_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1
    x2_min, x2_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    grid_2d = np.array([xx1.ravel(), xx2.ravel()]).T
    # The classifier lives in the original feature space, so lift the 2-D
    # grid nodes back before predicting.
    Z = classifier.predict(pca.inverse_transform(grid_2d))
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # scatter the projected samples, one marker/colour per class
    for idx, cl in enumerate(classes):
        mask = y == cl
        plt.scatter(x=X_pca[mask, 0], y=X_pca[mask, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)

    # highlight test samples
    # "is not None" instead of truthiness: an index array has no unambiguous
    # truth value and would raise in a bare "if test_idx:".
    if test_idx is not None:
        test_rows = np.asarray(test_idx)
        if test_rows.size:
            X_test = X_pca[test_rows, :2]
            # Hollow circles ring the test points without hiding the class
            # marker underneath.
            plt.scatter(X_test[:, 0], X_test[:, 1],
                        facecolors='none', edgecolors='black',
                        alpha=1.0, linewidth=1, marker='o',
                        s=55, label='test set')
        
# Stack the train rows first and the test rows after them, so the test rows
# occupy the tail positions of the combined arrays, then draw the KNN
# decision regions in PCA space.
n_train, n_test = len(y_train), len(y_test)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(
    X=X_combined,
    y=y_combined,
    classifier=knn,
    test_idx=range(n_train, n_train + n_test),
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='upper left')
plt.show()

Accuracy: 0.69

在这里插入图片描述

朴素贝叶斯分类器

#朴素贝叶斯分类器
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from matplotlib.colors import ListedColormap

# Read the heart dataset from disk
heart = pd.read_csv(r"heart.csv", sep=',')

# The first five columns are the features; column 'y' is the label
X, y = heart.iloc[:, 0:5], heart.loc[:, 'y']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Create the Gaussian Naive Bayes model and fit it on the training portion
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Score the predictions made for the held-out rows
y_pred = gnb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % test_accuracy)

# Define the function to plot decision regions
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over a 2-component PCA projection.

    PCA is fitted on ``X``. A grid with step ``resolution`` covers the
    projected points (padded by one unit per side). Each grid node is mapped
    back to the original feature space and classified with
    ``classifier.predict``. The predictions are drawn as filled contours on
    the current matplotlib axes and the projected samples are scattered on
    top, one marker/colour per distinct label in ``y``.

    ``test_idx`` is accepted so existing call sites keep working, but it is
    not used: this version does not highlight test samples.
    """
    marker_cycle = ('s', 'x', 'o', '^', 'v')
    palette = ('red', 'blue', 'lightgreen', 'gray', 'cyan')

    # Project the samples onto the first two principal components
    projector = PCA(n_components=2)
    projected = projector.fit_transform(X)
    pc1, pc2 = projected[:, 0], projected[:, 1]

    # One colour per distinct label
    labels = np.unique(y)
    cmap = ListedColormap(palette[:len(labels)])

    # Regular grid over the projected samples, padded by one unit per side
    pc1_axis = np.arange(pc1.min() - 1, pc1.max() + 1, resolution)
    pc2_axis = np.arange(pc2.min() - 1, pc2.max() + 1, resolution)
    xx1, xx2 = np.meshgrid(pc1_axis, pc2_axis)

    # The classifier expects original-space features, so lift the grid back
    grid_2d = np.array([xx1.ravel(), xx2.ravel()]).T
    grid_full = projector.inverse_transform(grid_2d)
    Z = classifier.predict(grid_full).reshape(xx1.shape)

    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Scatter the projected samples class by class
    for position, label in enumerate(labels):
        members = y == label
        plt.scatter(x=pc1[members], y=pc2[members],
                    alpha=0.8, c=[cmap(position)],
                    marker=marker_cycle[position], label=label)

# Stack the train rows first and the test rows after them, then draw the
# Gaussian Naive Bayes decision regions in PCA space.
n_train, n_test = len(y_train), len(y_test)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(
    X=X_combined,
    y=y_combined,
    classifier=gnb,
    test_idx=range(n_train, n_train + n_test),
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='upper left')
plt.show()

Accuracy: 0.70

在这里插入图片描述

SVM分类器

#使用SVM进行分类
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
    
from sklearn.svm import SVC

# Read the heart dataset from disk
heart = pd.read_csv(r"heart.csv", sep=',')

# The first five columns are the features; column 'y' is the label
X, y = heart.iloc[:, 0:5], heart.loc[:, 'y']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Linear-kernel support vector classifier, fitted on the training portion
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train, y_train)

# Score the predictions made for the held-out rows
y_pred = svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % test_accuracy)

Accuracy: 0.66


# Stack the train rows first and the test rows after them, then draw the
# SVM decision regions in PCA space.
n_train, n_test = len(y_train), len(y_test)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(
    X=X_combined,
    y=y_combined,
    classifier=svm,
    test_idx=range(n_train, n_train + n_test),
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='upper left')
plt.show()

在这里插入图片描述

决策树分类器

# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

# Load the dataset
heart = pd.read_csv(r"heart.csv", sep=',')

# Select features (first five columns) and target (column 'y')
X = heart.iloc[:, 0:5]
y = heart.loc[:, 'y']

# Split the dataset into training and testing sets (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize and fit the Decision Tree classifier (depth capped at 3)
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train, y_train)

# Predict and print accuracy
y_pred = tree.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

# Export the decision tree to a Graphviz .dot file.
# The column names are passed as a plain list: some scikit-learn releases
# validate feature_names as a list and reject a pandas Index.
feature_names = list(X.columns)
export_graphviz(tree, out_file='tree.dot', feature_names=feature_names)

# Render the dot file to PNG and show it.
# A bare Image(...) expression is only rendered when it is the last
# expression of a notebook cell; display() shows it from anywhere.
from IPython.display import display
graph = pydotplus.graph_from_dot_file('tree.dot')
display(Image(graph.create_png()))

# Plot decision regions using PCA-transformed features
# (train rows first, test rows after them)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X=X_combined, y=y_combined, classifier=tree, test_idx=range(len(y_train), len(y_train) + len(y_test)))
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='upper left')
plt.show()

Accuracy: 0.68

在这里插入图片描述

决策树可视化


# Draw the fitted decision tree
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
# Pass the column names as a plain list: some scikit-learn releases validate
# feature_names as a list and reject a pandas Index.
plot_tree(tree, filled=True, feature_names=list(X.columns), class_names=['0', '1'])
plt.show()

在这里插入图片描述

原文地址：https://blog.csdn.net/2301_76574743/article/details/144000642

免责声明：本站文章内容转载自网络资源，如侵犯了原著者的合法权益，可联系本站删除。更多内容请关注自学内容网（zxcms.com）！

上一篇：【Linux】vim的使用
下一篇：Spring Boot 与 Spring Cloud Alibaba 版本兼容对照

集成Sleuth实现链路追踪
【代码】集成Sleuth实现链路追踪。
阅读更多2025-01-24
Nuxt：利用public-ip这个npm包来获取公网IP
Nuxt：利用public-ip这个npm包来获取公网IP
阅读更多2025-01-24
【Linux】文件操作、系统IO相关操作、inode和输入输出重定向
文件在磁盘里，磁盘是永久性存储介质，因此文件在磁盘上的存储是永久性的，磁盘是外设（即是输出设备也是输入设备），对磁盘上所有文件的操作本质都是对外设的输入和输出，简称IO
阅读更多2025-01-24
10个非常基础的 Javascript 问题
根据MDN，JavaScript（通常缩写为JS）是一种轻量级的，解释性的，面向对象的语言，具有一流的功能，并且最著名的是Web页面的脚本语言，但它也用于许多非浏览器环境中。所有声明（函数，var，l
阅读更多2025-01-24
分布式与微服务：构建现代应用的关键架构
分布式系统和微服务架构是现代计算机系统的重要组成部分，它们为构建高可用、高可扩展和高性能的应用提供了强大的解决方案。通过上述 Java 示例，我们展示了不同场景下的实现方式，并深入探讨了其底层原理、性
阅读更多2025-01-24
青少年CTF练习平台 PHP的后门
根据提示，PHP/8.1.0-dev漏洞，修改请求头利用此漏洞。注意 zerodium后面拼接函数发送。
阅读更多2025-01-24
mock可视化&生成前端代码
介绍：mock是我们前后端分离的必要一环、ts、axios编写起来也很麻烦。我们就可以使用以下插件，来解决我们的问题。目前支持vite和webpack。欢迎小伙伴们提issues、我们共建。提升我们的
阅读更多2025-01-24
科比断腱之战所穿球鞋将被拍卖预计成交价会超过60万美元
虽然他在篮球运动方面有着得天独厚的天赋，但真正让他在竞争中脱颖而出的是他每天都坚持不懈地让自己变得更好。这双比赛穿的球鞋体现了他非凡的毅力，正是这种毅力使他成为史上最伟大的球员之一。在2013年4月1
阅读更多2025-01-24
分布式光纤应变监测是一种高精度、分布式的监测技术
主体结构应变监测：在高层建筑的混凝土柱、核心筒、梁等主要承重结构中安装分布式光纤应变传感器，能够实时监测建筑物在自身重力、风荷载、地震作用下的应变状态。例如，在高层建筑受到强风时，建筑物的迎风面和背风
阅读更多2025-01-24
30、Java中的异常(Exception)
自定义异常是开发者根据特定需求创建的异常类，它们通常继承自 Java 的 Exception 类或 RuntimeException 类。自定义异常使得异常处理更加灵活和具体，能够更好地描述程序中可能
阅读更多2025-01-24

分类算法——基于heart数据集实现

1 heart数据集——描述性统计分析

2 交叉验证（10折），选择最优的k值进行KNN分类

KNN分类器

朴素贝叶斯分类器

SVM分类器

决策树分类器

决策树可视化

相关文章