自学内容网 自学内容网

LSTM预测:糖尿病的发生情况

本文为🔗365天深度学习训练营内部文章

原作者:K同学啊

 本期,做个二维结构化数据的分类预测。提到结构化数据,常用的分类算法有:逻辑回归(二分类)、KNN、SVM、决策树、贝叶斯、随机森林、XGBoost等。本次我们采用LSTM(长短期记忆网络)进行分类预测。

一 数据导入 

import torch.nn as nn
import torch.nn.functional as F
import torchvision,torch
from torch.utils.data import TensorDataset,DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
import warnings
warnings.filterwarnings('ignore')
# Select the compute device: CUDA when PyTorch reports a usable GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Read the spreadsheet, then preview the first rows and the (rows, columns) shape.
df = pd.read_excel('dia.xls')
print(df.head())
print(df.shape)

# Data-quality report: missing values per column, then the count of duplicated rows.
print('数据缺失值-------------------')
print(df.isna().sum())
print('数据重复值-------------------')
print(df.duplicated().sum())
卡号  性别  年龄  高密度脂蛋白胆固醇  低密度脂蛋白胆固醇  极低密度脂蛋白胆固醇  甘油三酯  总胆固醇  脉搏  舒张压   
0  18054421   0  38       1.25       2.99        1.07  0.64  5.31  83   83  \
1  18054422   0  31       1.15       1.99        0.84  0.50  3.98  85   63   
2  18054423   0  27       1.29       2.21        0.69  0.60  4.19  73   61   
3  18054424   0  33       0.93       2.01        0.66  0.84  3.60  83   60   
4  18054425   0  36       1.17       2.83        0.83  0.73  4.83  85   67   

   高血压史   尿素氮     尿酸  肌酐  体重检查结果  是否糖尿病  
0     0  4.99  243.3  50       1      0  
1     0  4.72  391.0  47       1      0  
2     0  5.87  325.7  51       1      0  
3     0  2.40  203.2  40       2      0  
4     0  4.09  236.8  43       0      0  
(1006, 16)
数据缺失值-------------------
卡号            0
性别            0
年龄            0
高密度脂蛋白胆固醇     0
低密度脂蛋白胆固醇     0
极低密度脂蛋白胆固醇    0
甘油三酯          0
总胆固醇          0
脉搏            0
舒张压           0
高血压史          0
尿素氮           0
尿酸            0
肌酐            0
体重检查结果        0
是否糖尿病         0
dtype: int64
数据重复值-------------------
0

 二 探索性数据分析

# Draw one box plot per feature, grouped by the diabetes label, on a 3x5 grid.
# The label, the card number and the sex column are left out of the plotted features.
columns = df.columns.drop(['是否糖尿病','卡号','性别'])
plt.figure(figsize=(15,10))
for slot,col in enumerate(columns,start=1):
    ax = plt.subplot(3,5,slot)
    sns.boxplot(data=df,x='是否糖尿病',y=col,ax=ax)
    ax.set_title(f'{col}的箱线图')
    ax.set_ylabel('数值')
    ax.grid(True)
plt.tight_layout()
plt.show()

三 相关性分析 

# Pairwise correlation of every column except the card number, shown as an annotated heat map.
df_corr = df.drop(columns=['卡号']).corr()
plt.figure(figsize=(8,6))
plt.title('相关性热图')
sns.heatmap(data=df_corr,annot=True)
plt.show()

四 划分数据集 

将与“是否糖尿病”负相关的特征(高密度脂蛋白胆固醇)剔除,只保留正相关的特征;“卡号”仅为编号,同样不作为输入特征。

# Separate label and features.
# Features are every column except the label, the HDL-cholesterol column and the card number.
y = df['是否糖尿病']
X = df.drop(columns=['是否糖尿病','高密度脂蛋白胆固醇','卡号'])

# Convert to tensors: float32 features and int64 class indices.
X = torch.tensor(X.to_numpy(),dtype=torch.float32)
y = torch.tensor(y.to_numpy(),dtype=torch.int64)

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.3,random_state=1)

# Wrap each split in a DataLoader that yields batches of 64 rows in stored order.
train_ds = TensorDataset(train_X,train_y)
test_ds = TensorDataset(test_X,test_y)
train_dl = DataLoader(train_ds,batch_size=64,shuffle=False)
test_dl = DataLoader(test_ds,batch_size=64,shuffle=False)

五 构建LSTM模型 

# 定义模型
class model_lstm(nn.Module):
    """Two chained single-layer LSTMs followed by a linear classification head.

    Args:
        input_size: Number of features per time step (default 13).
        hidden_size: Hidden width shared by both LSTM layers (default 200).
        num_classes: Number of output scores produced by the head (default 2).
    """

    def __init__(self,input_size=13,hidden_size=200,num_classes=2):
        super(model_lstm,self).__init__()
        self.lstm0 = nn.LSTM(input_size=input_size,hidden_size=hidden_size,num_layers=1,batch_first=True)
        self.lstm1 = nn.LSTM(input_size=hidden_size,hidden_size=hidden_size,num_layers=1,batch_first=True)
        self.fc0 = nn.Linear(hidden_size,num_classes)

    def forward(self,x):
        """Return class scores for ``x``.

        Args:
            x: Either a 2-D tensor ``(batch, input_size)`` holding one feature
                row per sample, or a 3-D tensor ``(batch, seq_len, input_size)``.

        Returns:
            ``(batch, num_classes)`` for 2-D input, or
            ``(batch, seq_len, num_classes)`` for 3-D input.
        """
        # nn.LSTM interprets a 2-D tensor as ONE unbatched sequence, which would
        # turn the rows of a mini-batch into consecutive time steps and make each
        # sample's prediction depend on the other rows and their order.
        # Give every row its own length-1 sequence instead.
        is_tabular = x.dim() == 2
        if is_tabular:
            x = x.unsqueeze(1)

        out,hidden1 = self.lstm0(x)
        # The final (h, c) state of the first LSTM seeds the second one.
        out,_ = self.lstm1(out,hidden1)
        out = self.fc0(out)

        if is_tabular:
            # Drop the length-1 sequence axis again: (batch, 1, C) -> (batch, C).
            out = out.squeeze(1)
        return out

# Build the network, place it on the selected device, and print its layer summary.
model = model_lstm()
model = model.to(device)
print(model)

六 训练并评估 

# 训练循环
def train(dataloader,model,loss_fn,optimizer):
    """Run one optimisation pass over ``dataloader``.

    The model's train/eval mode is not changed here; the caller is expected
    to call ``model.train()`` beforehand.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches that exposes
            ``.dataset``; ``len(dataloader.dataset)`` is the sample count and
            ``len(dataloader)`` the batch count.
        model: Module called as ``model(inputs)``; its output must hold one
            score per class along dim 1 (``argmax(1)`` is taken).
        loss_fn: Callable ``loss_fn(pred, labels)`` returning a scalar tensor.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.

    Returns:
        ``(train_acc, train_loss)``: the fraction of samples classified
        correctly (using each batch's predictions made before that batch's
        update) and the loss averaged over batches.
    """
    size = len(dataloader.dataset)     # number of samples
    num_batches = len(dataloader)      # number of batches

    # Move batches to wherever the model's parameters live instead of relying
    # on a module-level `device` variable that may not match the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    train_acc,train_loss = 0,0  # running totals

    for inputs,labels in dataloader:
        if model_device is not None:
            inputs,labels = inputs.to(model_device),labels.to(model_device)

        # Forward pass and loss
        pred = model(inputs)
        loss = loss_fn(pred,labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the correct-prediction count and the batch loss
        train_acc += (pred.argmax(1) == labels).type(torch.float).sum().item()
        train_loss += loss.item()

    train_acc /= size
    train_loss /= num_batches

    return train_acc,train_loss
# 测试循环
def valid(dataloader,model,loss_fn):
    """Evaluate ``model`` on ``dataloader`` without updating any parameters.

    The model's train/eval mode is not changed here; the caller is expected
    to call ``model.eval()`` beforehand.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches that exposes
            ``.dataset``; ``len(dataloader.dataset)`` is the sample count and
            ``len(dataloader)`` the batch count.
        model: Module called as ``model(inputs)``; its output must hold one
            score per class along dim 1 (``argmax(1)`` is taken).
        loss_fn: Callable ``loss_fn(pred, labels)`` returning a scalar tensor.

    Returns:
        ``(test_acc, test_loss)``: the fraction of samples classified
        correctly and the loss averaged over batches.
    """
    size = len(dataloader.dataset)  # number of evaluation samples
    num_batches = len(dataloader)   # number of batches

    # Move batches to wherever the model's parameters live instead of relying
    # on a module-level `device` variable that may not match the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    test_loss, test_acc = 0, 0  # running totals

    # Gradients are not needed for evaluation; disabling them saves memory.
    with torch.no_grad():
        for inputs,target in dataloader:
            if model_device is not None:
                inputs,target = inputs.to(model_device),target.to(model_device)

            # Forward pass and loss
            target_pred = model(inputs)
            loss = loss_fn(target_pred,target)

            test_loss += loss.item()
            test_acc += (target_pred.argmax(1) == target).type(torch.float).sum().item()

    test_acc /= size
    test_loss /= num_batches

    return test_acc,test_loss
# Optimisation setup: cross-entropy loss and Adam with a fixed learning rate.
loss_fn = nn.CrossEntropyLoss()
learn_rate = 1e-4
opt = torch.optim.Adam(model.parameters(),lr=learn_rate)
epochs = 30

# Per-epoch metric history.
train_loss,train_acc,test_loss,test_acc = [],[],[],[]

# Progress-line layout, built once and reused for every epoch.
template = ('Epoch:{:2d},Train_acc:{:.1f}%,Train_loss:{:.3f},Test_acc:{:.1f}%,Test_loss:{:.3f},lr:{:.2E}')

for epoch in range(epochs):
    # One training pass in train mode, then one evaluation pass in eval mode.
    model.train()
    acc_tr,loss_tr = train(train_dl,model,loss_fn,opt)

    model.eval()
    acc_te,loss_te = valid(test_dl,model,loss_fn)

    # Record this epoch's metrics.
    for history,value in ((train_acc,acc_tr),(train_loss,loss_tr),(test_acc,acc_te),(test_loss,loss_te)):
        history.append(value)

    # Current learning rate of the first (only) parameter group.
    lr = opt.param_groups[0]['lr']

    print(template.format(epoch+1,acc_tr*100,loss_tr,acc_te*100,loss_te,lr))

print("="*20,'Done',"="*20)
Epoch: 1,Train_acc:56.5%,Train_loss:0.687,Test_acc:53.3%,Test_loss:0.686,lr:1.00E-04
Epoch: 2,Train_acc:56.7%,Train_loss:0.683,Test_acc:53.3%,Test_loss:0.687,lr:1.00E-04
Epoch: 3,Train_acc:56.5%,Train_loss:0.682,Test_acc:53.3%,Test_loss:0.686,lr:1.00E-04
Epoch: 4,Train_acc:56.5%,Train_loss:0.681,Test_acc:53.3%,Test_loss:0.685,lr:1.00E-04
Epoch: 5,Train_acc:56.7%,Train_loss:0.681,Test_acc:53.3%,Test_loss:0.685,lr:1.00E-04
Epoch: 6,Train_acc:56.8%,Train_loss:0.679,Test_acc:53.3%,Test_loss:0.684,lr:1.00E-04
Epoch: 7,Train_acc:57.0%,Train_loss:0.678,Test_acc:53.3%,Test_loss:0.683,lr:1.00E-04
Epoch: 8,Train_acc:56.8%,Train_loss:0.676,Test_acc:53.3%,Test_loss:0.681,lr:1.00E-04
Epoch: 9,Train_acc:56.5%,Train_loss:0.674,Test_acc:53.3%,Test_loss:0.679,lr:1.00E-04
Epoch:10,Train_acc:56.7%,Train_loss:0.671,Test_acc:53.3%,Test_loss:0.676,lr:1.00E-04
Epoch:11,Train_acc:57.0%,Train_loss:0.668,Test_acc:53.3%,Test_loss:0.673,lr:1.00E-04
Epoch:12,Train_acc:57.4%,Train_loss:0.665,Test_acc:53.3%,Test_loss:0.669,lr:1.00E-04
Epoch:13,Train_acc:57.7%,Train_loss:0.660,Test_acc:53.6%,Test_loss:0.664,lr:1.00E-04
Epoch:14,Train_acc:58.8%,Train_loss:0.655,Test_acc:53.6%,Test_loss:0.660,lr:1.00E-04
Epoch:15,Train_acc:59.5%,Train_loss:0.649,Test_acc:54.0%,Test_loss:0.655,lr:1.00E-04
Epoch:16,Train_acc:59.9%,Train_loss:0.643,Test_acc:56.3%,Test_loss:0.650,lr:1.00E-04
Epoch:17,Train_acc:61.8%,Train_loss:0.636,Test_acc:57.9%,Test_loss:0.644,lr:1.00E-04
Epoch:18,Train_acc:63.6%,Train_loss:0.628,Test_acc:60.3%,Test_loss:0.637,lr:1.00E-04
Epoch:19,Train_acc:65.3%,Train_loss:0.618,Test_acc:61.6%,Test_loss:0.630,lr:1.00E-04
Epoch:20,Train_acc:65.8%,Train_loss:0.607,Test_acc:63.2%,Test_loss:0.623,lr:1.00E-04
Epoch:21,Train_acc:66.6%,Train_loss:0.596,Test_acc:63.9%,Test_loss:0.616,lr:1.00E-04
Epoch:22,Train_acc:67.8%,Train_loss:0.584,Test_acc:64.6%,Test_loss:0.609,lr:1.00E-04
Epoch:23,Train_acc:70.3%,Train_loss:0.572,Test_acc:64.2%,Test_loss:0.602,lr:1.00E-04
Epoch:24,Train_acc:71.4%,Train_loss:0.560,Test_acc:66.6%,Test_loss:0.595,lr:1.00E-04
Epoch:25,Train_acc:72.4%,Train_loss:0.549,Test_acc:66.9%,Test_loss:0.590,lr:1.00E-04
Epoch:26,Train_acc:73.9%,Train_loss:0.538,Test_acc:66.6%,Test_loss:0.584,lr:1.00E-04
Epoch:27,Train_acc:74.3%,Train_loss:0.528,Test_acc:66.9%,Test_loss:0.579,lr:1.00E-04
Epoch:28,Train_acc:74.7%,Train_loss:0.518,Test_acc:67.5%,Test_loss:0.574,lr:1.00E-04
Epoch:29,Train_acc:76.0%,Train_loss:0.508,Test_acc:69.5%,Test_loss:0.570,lr:1.00E-04
Epoch:30,Train_acc:76.8%,Train_loss:0.499,Test_acc:70.5%,Test_loss:0.566,lr:1.00E-04
==================== Done ====================

七 可视化

# Plot the accuracy and loss curves side by side.
# The x-axis is derived from the recorded history rather than a hard-coded 30,
# so the x and y lengths still match if the number of epochs is changed.
epochs_range = range(len(train_acc))
plt.figure(figsize=(14,4))

# Left panel: accuracy per epoch
plt.subplot(1,2,1)
plt.plot(epochs_range,train_acc,label='training accuracy')
plt.plot(epochs_range,test_acc,label='validation accuracy')
plt.legend(loc='lower right')
plt.title('training and validation accuracy')

# Right panel: loss per epoch
plt.subplot(1,2,2)
plt.plot(epochs_range,train_loss,label='training loss')
plt.plot(epochs_range,test_loss,label='validation loss')
plt.legend(loc='upper right')
plt.title('training and validation loss')
plt.show()

总结:

 

1. 处理长序列数据

LSTM通过门控机制(输入门、遗忘门和输出门)有选择地保留和遗忘信息,显著缓解了传统RNN在长序列学习中的梯度消失问题(梯度爆炸通常仍需借助梯度裁剪等手段控制)。这使得LSTM在处理长序列数据时能保持较好的性能。

2. 捕捉时间依赖性

LSTM能够捕捉序列中长期和短期的时间依赖性,对于许多应用(如语言建模、语音识别和视频分析等)来说,这种能力至关重要。它能够记住之前的信息并利用这些信息来影响当前的输出。

3. 适用于多种输入类型

LSTM不仅可以处理一维的时间序列数据,还可以处理多维的序列数据,适用于图像序列、文本序列等多种数据类型。这使得LSTM在许多不同的领域具有广泛的应用。


原文地址:https://blog.csdn.net/2301_76606951/article/details/143062605

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!