自学内容网 自学内容网

使用 Python 对数据集(从 CSV 文件读取)进行数据预处理、异常值检测、数据重构,并使用多种机器学习模型进行评估和调优

代码主要实现了对一个数据集(从DM_Project_24.csv文件读取)的数据预处理、异常值检测、数据重构以及使用多种机器学习模型进行评估和调优的功能

{
 "cells": [
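  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-10-23T03:53:51.550153600Z",
     "start_time": "2024-10-23T03:53:51.542117400Z"
    }
   },
   "outputs": [],
   "source": [
    "# Pipeline outline\n",
    "# 1. Imputation of missing values\n",
    "# 2. Normalization (min-max and standard scaling)\n",
    "# 3. Outlier detection (DBSCAN, Isolation Forest, LOF)\n",
    "# 4. Model evaluation and hyperparameter tuning"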
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "np.random.seed(42)\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:48.194406300Z",
     "start_time": "2024-10-23T11:04:48.177847900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 316,
   "outputs": [],
   "source": [
    "# Read the CSV into a float array, skipping the header row.\n",
    "# Missing entries come back as NaN, which the following cells count and impute.\n",
    "data = np.genfromtxt(\n",
    "    'DM_Project_24.csv',\n",
    "    delimiter=\",\",\n",
    "    skip_header=1,\n",
    ")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:48.750530300Z",
     "start_time": "2024-10-23T11:04:48.650819800Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 317,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total Missing Values is:9499\n",
      "Total Missing Values on Label is :0\n",
      "Number of missing values by feature:\n",
      " [ 67 101  86  83 101 108  78  90  88  91  81 110  83  94  78  80 107  85\n",
      "  87 100  96  90  84  96  83  90  90  78  82  95 107  98  80  82  97  83\n",
      "  83  86  98  75  83  79  81  95  90  71 100  84  88  77  98  98  91  99\n",
      " 104  73  85 103  78 104  84  87  81  85  89  89  94  81  83  88  95  75\n",
      "  95  82  81  90  87  84  84  87  86  99  92  85  80  98 102 106  81 101\n",
      "  88 101  95 101  95  89  95  98  89  82 109  79  81 151 143   0]\n",
      "Numeber of label 0: 1406\n",
      "Number of label 1: 194\n"
     ]
    }
   ],
   "source": [
    "# Boolean mask of NaN entries, computed once and reused by every count below\n",
    "nan_mask = np.isnan(data)\n",
    "\n",
    "# Overall number of missing entries\n",
    "total_Value = nan_mask.sum()\n",
    "print(f'Total Missing Values is:{total_Value}')\n",
    "\n",
    "# Missing entries in the label column (the last column)\n",
    "missing_on_label = nan_mask[:, -1].sum()\n",
    "print(f'Total Missing Values on Label is :{missing_on_label}')\n",
    "\n",
    "# Missing entries per column\n",
    "missing_on_feature = nan_mask.sum(axis=0)\n",
    "print(f'Number of missing values by feature:\\n {missing_on_feature}')\n",
    "\n",
    "# Class balance: how many rows carry label 0 and label 1\n",
    "num_label_zero = (data[:, -1] == 0).sum()\n",
    "num_label_one = (data[:, -1] == 1).sum()\n",
    "print(f'Numeber of label 0: {num_label_zero}')\n",
    "print(f'Number of label 1: {num_label_one}')\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:49.245104700Z",
     "start_time": "2024-10-23T11:04:49.212110900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 318,
   "outputs": [],
   "source": [
    "# Column layout: the first N_NUMERICAL columns are treated as numerical features,\n",
    "# the remaining feature columns as nominal, and the last column is the label.\n",
    "N_NUMERICAL = 103\n",
    "\n",
    "col_features = data[:, :-1]\n",
    "col_label = data[:, -1]\n",
    "\n",
    "# Fail early if the file has too few columns for this layout\n",
    "assert col_features.shape[1] > N_NUMERICAL, 'expected nominal columns after the numerical block'\n",
    "\n",
    "col_numerical = col_features[:, :N_NUMERICAL]\n",
    "col_nominal = col_features[:, N_NUMERICAL:]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:49.774289600Z",
     "start_time": "2024-10-23T11:04:49.742662500Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 319,
   "outputs": [
    {
     "data": {
      "text/plain": "(1600, 2)"
     },
     "execution_count": 319,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fill missing values: median for the numerical columns, most frequent value for the nominal ones\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "numerical_imputer = SimpleImputer(strategy='median', missing_values=np.nan)\n",
    "impute_col_numerical = numerical_imputer.fit_transform(col_numerical)\n",
    "\n",
    "nominal_imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)\n",
    "impute_col_nominal = nominal_imputer.fit_transform(col_nominal)\n",
    "\n",
    "# NOTE(review): both imputers are fitted on the full dataset, so rows that later act as\n",
    "# cross-validation test folds contribute to the fill values.\n",
    "impute_col_nominal.shape"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:50.262671100Z",
     "start_time": "2024-10-23T11:04:50.230889900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 320,
   "outputs": [],
   "source": [
    "# Normalisation: build a min-max scaled and a standardised copy of the imputed numerical features\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "\n",
    "min_max_scaler = MinMaxScaler()\n",
    "minmax_col_numerical = min_max_scaler.fit_transform(impute_col_numerical)\n",
    "\n",
    "standard_scaler = StandardScaler()\n",
    "std_col_numerical = standard_scaler.fit_transform(impute_col_numerical)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:50.965009600Z",
     "start_time": "2024-10-23T11:04:50.949314900Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Z-score"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 320,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:52.112475400Z",
     "start_time": "2024-10-23T11:04:52.082097100Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "# DBSCAN"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 321,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of samples in remove_minmax_numerical: 1555\n",
      "Number of samples in remove_std_numerical: 1552\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cluster import DBSCAN\n",
    "\n",
    "# DBSCAN assigns the label -1 to noise points; those rows are treated as outliers\n",
    "# and removed from the numerical features, nominal features and labels alike.\n",
    "\n",
    "# Min-max scaled features\n",
    "dbscan = DBSCAN(min_samples=5, eps=0.2)\n",
    "db_minmax_outliers = dbscan.fit_predict(minmax_col_numerical)\n",
    "db_minmax_inliers_mask = ~(db_minmax_outliers == -1)  # True for rows that are not noise\n",
    "db_minmax_labels = col_label[db_minmax_inliers_mask]\n",
    "remove_minmax_db_nominal = impute_col_nominal[db_minmax_inliers_mask]\n",
    "remove_minmax_db_numerical = minmax_col_numerical[db_minmax_inliers_mask]\n",
    "print(f\"Number of samples in remove_minmax_numerical: {remove_minmax_db_numerical.shape[0]}\")\n",
    "\n",
    "# Standardised features (a larger eps is used on this scale)\n",
    "dbscan = DBSCAN(min_samples=5, eps=4)\n",
    "db_std_outliers = dbscan.fit_predict(std_col_numerical)\n",
    "db_std_inliers_mask = ~(db_std_outliers == -1)  # True for rows that are not noise\n",
    "db_std_labels = col_label[db_std_inliers_mask]\n",
    "remove_std_db_nominal = impute_col_nominal[db_std_inliers_mask]\n",
    "remove_std_db_numerical = std_col_numerical[db_std_inliers_mask]\n",
    "print(f\"Number of samples in remove_std_numerical: {remove_std_db_numerical.shape[0]}\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:04:53.459760700Z",
     "start_time": "2024-10-23T11:04:53.365329400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mKeyboardInterrupt\u001B[0m                         Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[186], line 19\u001B[0m\n\u001B[0;32m     16\u001B[0m dbscan \u001B[38;5;241m=\u001B[39m DBSCAN(eps\u001B[38;5;241m=\u001B[39meps, min_samples\u001B[38;5;241m=\u001B[39mmin_samples)\n\u001B[0;32m     18\u001B[0m \u001B[38;5;66;03m# 拟合模型并进行聚类\u001B[39;00m\n\u001B[1;32m---> 19\u001B[0m clusters \u001B[38;5;241m=\u001B[39m \u001B[43mdbscan\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit_predict\u001B[49m\u001B[43m(\u001B[49m\u001B[43mminmax_col_numerical\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m     21\u001B[0m \u001B[38;5;66;03m# 如果形成的簇数量大于 1,则计算轮廓系数\u001B[39;00m\n\u001B[0;32m     22\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(\u001B[38;5;28mset\u001B[39m(clusters)) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\cluster\\_dbscan.py:454\u001B[0m, in \u001B[0;36mDBSCAN.fit_predict\u001B[1;34m(self, X, y, sample_weight)\u001B[0m\n\u001B[0;32m    429\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfit_predict\u001B[39m(\u001B[38;5;28mself\u001B[39m, X, y\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m, sample_weight\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m):\n\u001B[0;32m    430\u001B[0m \u001B[38;5;250m    \u001B[39m\u001B[38;5;124;03m\"\"\"Compute clusters from a data or distance matrix and predict labels.\u001B[39;00m\n\u001B[0;32m    431\u001B[0m \n\u001B[0;32m    432\u001B[0m \u001B[38;5;124;03m    Parameters\u001B[39;00m\n\u001B[1;32m   (...)\u001B[0m\n\u001B[0;32m    452\u001B[0m \u001B[38;5;124;03m        Cluster labels. Noisy samples are given the label -1.\u001B[39;00m\n\u001B[0;32m    453\u001B[0m \u001B[38;5;124;03m    \"\"\"\u001B[39;00m\n\u001B[1;32m--> 454\u001B[0m     \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit\u001B[49m\u001B[43m(\u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msample_weight\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msample_weight\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m    455\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlabels_\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\base.py:1152\u001B[0m, in \u001B[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001B[1;34m(estimator, *args, **kwargs)\u001B[0m\n\u001B[0;32m   1145\u001B[0m     estimator\u001B[38;5;241m.\u001B[39m_validate_params()\n\u001B[0;32m   1147\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m config_context(\n\u001B[0;32m   1148\u001B[0m     skip_parameter_validation\u001B[38;5;241m=\u001B[39m(\n\u001B[0;32m   1149\u001B[0m         prefer_skip_nested_validation \u001B[38;5;129;01mor\u001B[39;00m global_skip_validation\n\u001B[0;32m   1150\u001B[0m     )\n\u001B[0;32m   1151\u001B[0m ):\n\u001B[1;32m-> 1152\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfit_method\u001B[49m\u001B[43m(\u001B[49m\u001B[43mestimator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\cluster\\_dbscan.py:418\u001B[0m, in \u001B[0;36mDBSCAN.fit\u001B[1;34m(self, X, y, sample_weight)\u001B[0m\n\u001B[0;32m    415\u001B[0m core_samples \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39masarray(n_neighbors \u001B[38;5;241m>\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmin_samples, dtype\u001B[38;5;241m=\u001B[39mnp\u001B[38;5;241m.\u001B[39muint8)\n\u001B[0;32m    416\u001B[0m dbscan_inner(core_samples, neighborhoods, labels)\n\u001B[1;32m--> 418\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcore_sample_indices_ \u001B[38;5;241m=\u001B[39m \u001B[43mnp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mwhere\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcore_samples\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n\u001B[0;32m    419\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlabels_ \u001B[38;5;241m=\u001B[39m labels\n\u001B[0;32m    421\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcore_sample_indices_):\n\u001B[0;32m    422\u001B[0m     \u001B[38;5;66;03m# fix for scipy sparse indexing issue\u001B[39;00m\n",
      "File \u001B[1;32m<__array_function__ internals>:200\u001B[0m, in \u001B[0;36mwhere\u001B[1;34m(*args, **kwargs)\u001B[0m\n",
      "\u001B[1;31mKeyboardInterrupt\u001B[0m: "
     ]
    }
   ],
   "source": [],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T05:53:51.644205Z",
     "start_time": "2024-10-23T05:53:43.489842700Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Outlier detection using Isolation Forest"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of samples in remove_minmax_numerical: 1508\n",
      "Number of samples in remove_std_numerical: 1508\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import IsolationForest\n",
    "\n",
    "# fit_predict returns 1 for inliers and -1 for outliers; only inlier rows are kept.\n",
    "# random_state is pinned so the set of removed rows does not depend on how much of\n",
    "# NumPy's global random stream earlier cells (or re-runs of this cell) have consumed.\n",
    "iso_forest = IsolationForest(n_estimators=118, contamination=0.0575, max_features=1, random_state=42)\n",
    "\n",
    "# Min-max scaled features\n",
    "if_minmax_outliers = iso_forest.fit_predict(minmax_col_numerical)\n",
    "if_minmax_inliers_mask = if_minmax_outliers == 1\n",
    "remove_minmax_if_numerical = minmax_col_numerical[if_minmax_inliers_mask]\n",
    "remove_minmax_if_nominal = impute_col_nominal[if_minmax_inliers_mask]\n",
    "if_minmax_labels = col_label[if_minmax_inliers_mask]\n",
    "print(f\"Number of samples in remove_minmax_numerical: {remove_minmax_if_numerical.shape[0]}\")\n",
    "\n",
    "# Standardised features (the same estimator object is refitted)\n",
    "if_std_outliers = iso_forest.fit_predict(std_col_numerical)\n",
    "if_std_inliers_mask = if_std_outliers == 1\n",
    "remove_std_if_numerical = std_col_numerical[if_std_inliers_mask]\n",
    "remove_std_if_nominal = impute_col_nominal[if_std_inliers_mask]\n",
    "# The name if_std_lable (sic) is kept because the data reconstruction cell reads it\n",
    "if_std_lable = col_label[if_std_inliers_mask]\n",
    "print(f\"Number of samples in remove_std_numerical: {remove_std_if_numerical.shape[0]}\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T06:36:27.531422400Z",
     "start_time": "2024-10-23T06:36:27.337119700Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Outlier detection using LOF"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MinMax删除异常值后的数据点数量: 1549\n",
      "std删除异常值后的数据点数量: 1555\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import LocalOutlierFactor\n",
    "from collections import Counter\n",
    "\n",
    "# Local Outlier Factor: fit_predict returns 1 for inliers and -1 for outliers\n",
    "lofer = LocalOutlierFactor(n_neighbors=95)\n",
    "\n",
    "# Min-max scaled features: keep inlier rows only\n",
    "minmax_outliers = lofer.fit_predict(minmax_col_numerical)\n",
    "minmax_outlier_count = Counter(minmax_outliers)  # tally of 1 / -1 predictions\n",
    "lof_minmax_inliers_mask = minmax_outliers == 1\n",
    "minmax_lof_labels = col_label[lof_minmax_inliers_mask]\n",
    "remove_minmax_lof_nominal = impute_col_nominal[lof_minmax_inliers_mask]\n",
    "remove_minmax_lof_numerical = minmax_col_numerical[lof_minmax_inliers_mask]\n",
    "print(f\"MinMax删除异常值后的数据点数量: {remove_minmax_lof_numerical.shape[0]}\")\n",
    "\n",
    "# Standardised features: keep inlier rows only\n",
    "std_outliers = lofer.fit_predict(std_col_numerical)\n",
    "std_outlier_count = Counter(std_outliers)  # tally of 1 / -1 predictions\n",
    "lof_std_inliers_mask = std_outliers == 1\n",
    "std_labels = col_label[lof_std_inliers_mask]\n",
    "remove_std_lof_nominal = impute_col_nominal[lof_std_inliers_mask]\n",
    "remove_std_numerical = std_col_numerical[lof_std_inliers_mask]\n",
    "print(f\"std删除异常值后的数据点数量: {remove_std_numerical.shape[0]}\")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T06:36:33.175575200Z",
     "start_time": "2024-10-23T06:36:33.005551900Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Reconstruct Data"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 322,
   "outputs": [],
   "source": [
    "# Reassemble one array per preprocessing variant, laid out as\n",
    "# [numerical features | nominal features | label].\n",
    "label_column = col_label.reshape(-1, 1)\n",
    "\n",
    "# Imputation only\n",
    "data_impute_without_label = np.hstack((impute_col_numerical, impute_col_nominal))\n",
    "impute_data = np.hstack((data_impute_without_label, label_column))\n",
    "\n",
    "# Imputation + min-max scaling\n",
    "data_minmax_without_label = np.hstack((minmax_col_numerical, impute_col_nominal))\n",
    "minmax_data = np.hstack((data_minmax_without_label, label_column))\n",
    "\n",
    "# Imputation + standard scaling\n",
    "data_std_without_label = np.hstack((std_col_numerical, impute_col_nominal))\n",
    "std_data = np.hstack((data_std_without_label, label_column))\n",
    "\n",
    "# Imputation + min-max scaling + DBSCAN outlier removal\n",
    "data_db_without_lable = np.hstack((remove_minmax_db_numerical, remove_minmax_db_nominal))\n",
    "db_minmax_data = np.hstack((data_db_without_lable, db_minmax_labels[:, np.newaxis]))\n",
    "\n",
    "# Imputation + standard scaling + DBSCAN outlier removal\n",
    "data_db_std_without_lable = np.hstack((remove_std_db_numerical, remove_std_db_nominal))\n",
    "db_std_data = np.hstack((data_db_std_without_lable, db_std_labels[:, np.newaxis]))\n",
    "\n",
    "# Imputation + min-max scaling + Isolation Forest outlier removal\n",
    "data_if_without_lable = np.hstack((remove_minmax_if_numerical, remove_minmax_if_nominal))\n",
    "if_minmax_data = np.hstack((data_if_without_lable, if_minmax_labels[:, np.newaxis]))\n",
    "\n",
    "# Imputation + standard scaling + Isolation Forest outlier removal\n",
    "data_if_std_without_lable = np.hstack((remove_std_if_numerical, remove_std_if_nominal))\n",
    "if_std_data = np.hstack((data_if_std_without_lable, if_std_lable[:, np.newaxis]))\n",
    "\n",
    "# Imputation + min-max scaling + LOF outlier removal\n",
    "data_without_lable = np.hstack((remove_minmax_lof_numerical, remove_minmax_lof_nominal))\n",
    "lof_minmax_data = np.hstack((data_without_lable, minmax_lof_labels[:, np.newaxis]))\n",
    "\n",
    "# Imputation + standard scaling + LOF outlier removal\n",
    "# (data_without_lable is deliberately reassigned, as in the min-max variant above)\n",
    "data_without_lable = np.hstack((remove_std_numerical, remove_std_lof_nominal))\n",
    "lof_std_data = np.hstack((data_without_lable, std_labels[:, np.newaxis]))\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:05:00.838247800Z",
     "start_time": "2024-10-23T11:05:00.806645100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 322,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:05:01.567003100Z",
     "start_time": "2024-10-23T11:05:01.549144800Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.672668604141718\n"
     ]
    }
   ],
   "source": [
    "# Decision tree baseline: mean F1 over 5-fold cross-validation\n",
    "from sklearn import tree\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# Dataset variant under evaluation: standard-scaled, Isolation Forest outliers removed.\n",
    "# X and y defined here are also read by the baseline cells that follow.\n",
    "X = if_std_data[:, :-1]\n",
    "y = if_std_data[:, -1]\n",
    "\n",
    "dt = tree.DecisionTreeClassifier(random_state=42)\n",
    "dt_score = cross_val_score(dt, X, y, scoring=\"f1\", cv=5)\n",
    "# Average over the folds actually returned instead of dividing by a hard-coded 5,\n",
    "# so the figure stays correct if cv is changed.\n",
    "print(dt_score.mean())"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:05:42.246363800Z",
     "start_time": "2024-10-23T11:05:40.384686100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 297,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.07141067616800681\n"
     ]
    }
   ],
   "source": [
    "# k-nearest-neighbours baseline with default settings: mean F1 over 5-fold cross-validation.\n",
    "# NOTE(review): X, y and cross_val_score come from an earlier cell; run that cell first.\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "neigh = KNeighborsClassifier()\n",
    "knn_score = cross_val_score(neigh, X, y, scoring=\"f1\", cv=5)\n",
    "# Average over the folds actually returned instead of dividing by a hard-coded 5\n",
    "print(knn_score.mean())"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T06:44:26.602704300Z",
     "start_time": "2024-10-23T06:44:26.478424600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 298,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7311561426901243\n"
     ]
    }
   ],
   "source": [
    "# Random forest baseline: mean F1 over 5-fold cross-validation.\n",
    "# NOTE(review): X, y and cross_val_score come from an earlier cell; run that cell first.\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "# random_state is pinned (as for the decision tree) so the score is repeatable\n",
    "# regardless of how much of NumPy's global random stream was consumed beforehand.\n",
    "rfc = RandomForestClassifier(random_state=42)\n",
    "rfc_score = cross_val_score(rfc, X, y, scoring=\"f1\", cv=5)\n",
    "# Average over the folds actually returned instead of dividing by a hard-coded 5\n",
    "print(rfc_score.mean())"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T06:44:35.550518Z",
     "start_time": "2024-10-23T06:44:26.602704300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.21726542893236117\n"
     ]
    }
   ],
   "source": [
    "# Gaussian naive Bayes baseline: mean F1 over 5-fold cross-validation.\n",
    "# NOTE(review): X, y and cross_val_score come from an earlier cell; run that cell first.\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "gnb = GaussianNB()\n",
    "gnb_score = cross_val_score(gnb, X, y, scoring=\"f1\", cv=5)\n",
    "# Average over the folds actually returned instead of dividing by a hard-coded 5\n",
    "print(gnb_score.mean())"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T06:44:35.603433700Z",
     "start_time": "2024-10-23T06:44:35.552596900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 330,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}\n",
      "Best F1 Score: 0.773772741931235\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import make_scorer, f1_score\n",
    "\n",
    "# Dataset variant used for tuning: min-max scaled, Isolation Forest outliers removed\n",
    "X = if_minmax_data[:, :-1]\n",
    "y = if_minmax_data[:, -1]\n",
    "\n",
    "# Binary F1 is the model-selection metric\n",
    "f1_scorer = make_scorer(f1_score, average='binary')\n",
    "\n",
    "# Base tree with a fixed cost-complexity pruning strength; the grid varies the rest\n",
    "dt = tree.DecisionTreeClassifier(ccp_alpha=0.01, random_state=42)\n",
    "\n",
    "# Previously recorded best: criterion gini, max_depth 3, min_samples_leaf 1,\n",
    "# min_samples_split 2, splitter best (F1 0.7716)\n",
    "param_grid = {\n",
    "    'criterion': ['gini', 'entropy', 'log_loss'],\n",
    "    'splitter': ['best'],\n",
    "    'max_depth': [3, 5, 7, 10, 15, 20, 30],\n",
    "    'min_samples_split': [2, 3, 4, 5, 9, 10, 15],\n",
    "    'min_samples_leaf': [1, 2, 3, 4, 6, 10],\n",
    "}\n",
    "\n",
    "# Exhaustive search with 10-fold cross-validation, using all CPU cores\n",
    "grid_search = GridSearchCV(\n",
    "    estimator=dt,\n",
    "    param_grid=param_grid,\n",
    "    scoring=f1_scorer,\n",
    "    cv=10,\n",
    "    n_jobs=-1,\n",
    ")\n",
    "grid_search.fit(X, y)\n",
    "\n",
    "print(f\"Best parameters: {grid_search.best_params_}\")\n",
    "print(f\"Best F1 Score: {grid_search.best_score_}\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T11:41:29.914445Z",
     "start_time": "2024-10-23T11:40:09.879004100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 303,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters for KNN: {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}\n",
      "Best F1 Score for KNN: 0.7780195374467731\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "# NOTE(review): X and y are whatever the most recently executed cell assigned;\n",
    "# confirm which dataset variant is intended before trusting the score.\n",
    "neigh = KNeighborsClassifier()\n",
    "\n",
    "# NOTE(review): the grid looks redundant - 'p' only matters for the minkowski metric\n",
    "# (p=1 is Manhattan, p=2 is Euclidean) and 'algorithm' should only affect speed.\n",
    "param_grid_knn = {\n",
    "    'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],\n",
    "    'weights': ['uniform', 'distance'],\n",
    "    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n",
    "    'metric': ['minkowski', 'euclidean', 'manhattan'],\n",
    "    'p': [1, 2],\n",
    "}\n",
    "\n",
    "# Exhaustive search with 5-fold cross-validation on F1, using all CPU cores\n",
    "grid_search_knn = GridSearchCV(\n",
    "    estimator=neigh,\n",
    "    param_grid=param_grid_knn,\n",
    "    scoring='f1',\n",
    "    cv=5,\n",
    "    n_jobs=-1,\n",
    ")\n",
    "grid_search_knn.fit(X, y)\n",
    "\n",
    "print(f\"Best parameters for KNN: {grid_search_knn.best_params_}\")\n",
    "print(f\"Best F1 Score for KNN: {grid_search_knn.best_score_}\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T07:02:54.568379900Z",
     "start_time": "2024-10-23T07:02:51.141055600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 310,
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mKeyboardInterrupt\u001B[0m                         Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[310], line 22\u001B[0m\n\u001B[0;32m     12\u001B[0m \u001B[38;5;66;03m#f1 = 0.7768 gini,'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100\u001B[39;00m\n\u001B[0;32m     13\u001B[0m \u001B[38;5;66;03m# param_grid_rfc = {\u001B[39;00m\n\u001B[0;32m     14\u001B[0m \u001B[38;5;66;03m#     'n_estimators': [50, 100, 150, 200],\u001B[39;00m\n\u001B[1;32m   (...)\u001B[0m\n\u001B[0;32m     18\u001B[0m \u001B[38;5;66;03m#     'min_samples_leaf': [1, 2, 4]\u001B[39;00m\n\u001B[0;32m     19\u001B[0m \u001B[38;5;66;03m# }\u001B[39;00m\n\u001B[0;32m     21\u001B[0m grid_search_rfc \u001B[38;5;241m=\u001B[39m GridSearchCV(estimator\u001B[38;5;241m=\u001B[39mrfc, param_grid\u001B[38;5;241m=\u001B[39mparam_grid_rfc, cv\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m5\u001B[39m, scoring\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mf1\u001B[39m\u001B[38;5;124m'\u001B[39m, n_jobs\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)\n\u001B[1;32m---> 22\u001B[0m \u001B[43mgrid_search_rfc\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit\u001B[49m\u001B[43m(\u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43my\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m     24\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mBest parameters for RandomForestClassifier: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mgrid_search_rfc\u001B[38;5;241m.\u001B[39mbest_params_\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m     25\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mBest F1 Score for RandomForestClassifier: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mgrid_search_rfc\u001B[38;5;241m.\u001B[39mbest_score_\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\base.py:1152\u001B[0m, in \u001B[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001B[1;34m(estimator, *args, **kwargs)\u001B[0m\n\u001B[0;32m   1145\u001B[0m     estimator\u001B[38;5;241m.\u001B[39m_validate_params()\n\u001B[0;32m   1147\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m config_context(\n\u001B[0;32m   1148\u001B[0m     skip_parameter_validation\u001B[38;5;241m=\u001B[39m(\n\u001B[0;32m   1149\u001B[0m         prefer_skip_nested_validation \u001B[38;5;129;01mor\u001B[39;00m global_skip_validation\n\u001B[0;32m   1150\u001B[0m     )\n\u001B[0;32m   1151\u001B[0m ):\n\u001B[1;32m-> 1152\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfit_method\u001B[49m\u001B[43m(\u001B[49m\u001B[43mestimator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\model_selection\\_search.py:898\u001B[0m, in \u001B[0;36mBaseSearchCV.fit\u001B[1;34m(self, X, y, groups, **fit_params)\u001B[0m\n\u001B[0;32m    892\u001B[0m     results \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_format_results(\n\u001B[0;32m    893\u001B[0m         all_candidate_params, n_splits, all_out, all_more_results\n\u001B[0;32m    894\u001B[0m     )\n\u001B[0;32m    896\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m results\n\u001B[1;32m--> 898\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_run_search\u001B[49m\u001B[43m(\u001B[49m\u001B[43mevaluate_candidates\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m    900\u001B[0m \u001B[38;5;66;03m# multimetric is determined here because in the case of a callable\u001B[39;00m\n\u001B[0;32m    901\u001B[0m \u001B[38;5;66;03m# self.scoring the return type is only known after calling\u001B[39;00m\n\u001B[0;32m    902\u001B[0m first_test_score \u001B[38;5;241m=\u001B[39m all_out[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtest_scores\u001B[39m\u001B[38;5;124m\"\u001B[39m]\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\model_selection\\_search.py:1422\u001B[0m, in \u001B[0;36mGridSearchCV._run_search\u001B[1;34m(self, evaluate_candidates)\u001B[0m\n\u001B[0;32m   1420\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m_run_search\u001B[39m(\u001B[38;5;28mself\u001B[39m, evaluate_candidates):\n\u001B[0;32m   1421\u001B[0m \u001B[38;5;250m    \u001B[39m\u001B[38;5;124;03m\"\"\"Search all candidates in param_grid\"\"\"\u001B[39;00m\n\u001B[1;32m-> 1422\u001B[0m     \u001B[43mevaluate_candidates\u001B[49m\u001B[43m(\u001B[49m\u001B[43mParameterGrid\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mparam_grid\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\model_selection\\_search.py:845\u001B[0m, in \u001B[0;36mBaseSearchCV.fit.<locals>.evaluate_candidates\u001B[1;34m(candidate_params, cv, more_results)\u001B[0m\n\u001B[0;32m    837\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mverbose \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[0;32m    838\u001B[0m     \u001B[38;5;28mprint\u001B[39m(\n\u001B[0;32m    839\u001B[0m         \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mFitting \u001B[39m\u001B[38;5;132;01m{0}\u001B[39;00m\u001B[38;5;124m folds for each of \u001B[39m\u001B[38;5;132;01m{1}\u001B[39;00m\u001B[38;5;124m candidates,\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m    840\u001B[0m         \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m totalling \u001B[39m\u001B[38;5;132;01m{2}\u001B[39;00m\u001B[38;5;124m fits\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mformat(\n\u001B[0;32m    841\u001B[0m             n_splits, n_candidates, n_candidates \u001B[38;5;241m*\u001B[39m n_splits\n\u001B[0;32m    842\u001B[0m         )\n\u001B[0;32m    843\u001B[0m     )\n\u001B[1;32m--> 845\u001B[0m out \u001B[38;5;241m=\u001B[39m \u001B[43mparallel\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m    846\u001B[0m \u001B[43m    \u001B[49m\u001B[43mdelayed\u001B[49m\u001B[43m(\u001B[49m\u001B[43m_fit_and_score\u001B[49m\u001B[43m)\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m    847\u001B[0m \u001B[43m        \u001B[49m\u001B[43mclone\u001B[49m\u001B[43m(\u001B[49m\u001B[43mbase_estimator\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    848\u001B[0m \u001B[43m        \u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    849\u001B[0m \u001B[43m        \u001B[49m\u001B[43my\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    850\u001B[0m \u001B[43m        
\u001B[49m\u001B[43mtrain\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    851\u001B[0m \u001B[43m        \u001B[49m\u001B[43mtest\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtest\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    852\u001B[0m \u001B[43m        \u001B[49m\u001B[43mparameters\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mparameters\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    853\u001B[0m \u001B[43m        \u001B[49m\u001B[43msplit_progress\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43msplit_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_splits\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    854\u001B[0m \u001B[43m        \u001B[49m\u001B[43mcandidate_progress\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcand_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_candidates\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    855\u001B[0m \u001B[43m        \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mfit_and_score_kwargs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m    856\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m    857\u001B[0m \u001B[43m    \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[43mcand_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mparameters\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[43msplit_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[43mtrain\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtest\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mproduct\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m    858\u001B[0m 
\u001B[43m        \u001B[49m\u001B[38;5;28;43menumerate\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcandidate_params\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43menumerate\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcv\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msplit\u001B[49m\u001B[43m(\u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43my\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgroups\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m    859\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m    860\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m    862\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(out) \u001B[38;5;241m<\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m    863\u001B[0m     \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m    864\u001B[0m         \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mNo fits were performed. \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m    865\u001B[0m         \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mWas the CV iterator empty? \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m    866\u001B[0m         \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mWere there no candidates?\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m    867\u001B[0m     )\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\utils\\parallel.py:65\u001B[0m, in \u001B[0;36mParallel.__call__\u001B[1;34m(self, iterable)\u001B[0m\n\u001B[0;32m     60\u001B[0m config \u001B[38;5;241m=\u001B[39m get_config()\n\u001B[0;32m     61\u001B[0m iterable_with_config \u001B[38;5;241m=\u001B[39m (\n\u001B[0;32m     62\u001B[0m     (_with_config(delayed_func, config), args, kwargs)\n\u001B[0;32m     63\u001B[0m     \u001B[38;5;28;01mfor\u001B[39;00m delayed_func, args, kwargs \u001B[38;5;129;01min\u001B[39;00m iterable\n\u001B[0;32m     64\u001B[0m )\n\u001B[1;32m---> 65\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__call__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43miterable_with_config\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\joblib\\parallel.py:2007\u001B[0m, in \u001B[0;36mParallel.__call__\u001B[1;34m(self, iterable)\u001B[0m\n\u001B[0;32m   2001\u001B[0m \u001B[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001B[39;00m\n\u001B[0;32m   2002\u001B[0m \u001B[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001B[39;00m\n\u001B[0;32m   2003\u001B[0m \u001B[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001B[39;00m\n\u001B[0;32m   2004\u001B[0m \u001B[38;5;66;03m# dispatch of the tasks to the workers.\u001B[39;00m\n\u001B[0;32m   2005\u001B[0m \u001B[38;5;28mnext\u001B[39m(output)\n\u001B[1;32m-> 2007\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m output \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mreturn_generator \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;28;43mlist\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43moutput\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\joblib\\parallel.py:1650\u001B[0m, in \u001B[0;36mParallel._get_outputs\u001B[1;34m(self, iterator, pre_dispatch)\u001B[0m\n\u001B[0;32m   1647\u001B[0m     \u001B[38;5;28;01myield\u001B[39;00m\n\u001B[0;32m   1649\u001B[0m     \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backend\u001B[38;5;241m.\u001B[39mretrieval_context():\n\u001B[1;32m-> 1650\u001B[0m         \u001B[38;5;28;01myield from\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_retrieve()\n\u001B[0;32m   1652\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mGeneratorExit\u001B[39;00m:\n\u001B[0;32m   1653\u001B[0m     \u001B[38;5;66;03m# The generator has been garbage collected before being fully\u001B[39;00m\n\u001B[0;32m   1654\u001B[0m     \u001B[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001B[39;00m\n\u001B[0;32m   1655\u001B[0m     \u001B[38;5;66;03m# the user if necessary.\u001B[39;00m\n\u001B[0;32m   1656\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_exception \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n",
      "File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\joblib\\parallel.py:1762\u001B[0m, in \u001B[0;36mParallel._retrieve\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m   1757\u001B[0m \u001B[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001B[39;00m\n\u001B[0;32m   1758\u001B[0m \u001B[38;5;66;03m# async callbacks to progress.\u001B[39;00m\n\u001B[0;32m   1759\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m ((\u001B[38;5;28mlen\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jobs) \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m0\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m\n\u001B[0;32m   1760\u001B[0m     (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jobs[\u001B[38;5;241m0\u001B[39m]\u001B[38;5;241m.\u001B[39mget_status(\n\u001B[0;32m   1761\u001B[0m         timeout\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtimeout) \u001B[38;5;241m==\u001B[39m TASK_PENDING)):\n\u001B[1;32m-> 1762\u001B[0m     \u001B[43mtime\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msleep\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m0.01\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[0;32m   1763\u001B[0m     \u001B[38;5;28;01mcontinue\u001B[39;00m\n\u001B[0;32m   1765\u001B[0m \u001B[38;5;66;03m# We need to be careful: the job list can be filling up as\u001B[39;00m\n\u001B[0;32m   1766\u001B[0m \u001B[38;5;66;03m# we empty it and Python list are not thread-safe by\u001B[39;00m\n\u001B[0;32m   1767\u001B[0m \u001B[38;5;66;03m# default hence the use of the lock\u001B[39;00m\n",
      "\u001B[1;31mKeyboardInterrupt\u001B[0m: "
     ]
    }
   ],
   "source": [
    "rfc = RandomForestClassifier(random_state=42)\n",
    "\n",
    "param_grid_rfc = {\n",
    "    'n_estimators': [100, 200, 300, 500],\n",
    "    'criterion': ['gini', 'entropy'],\n",
    "    'max_depth': [None, 30, 40, 50],\n",
    "    'min_samples_split': [2, 5, 10, 15],\n",
    "    'min_samples_leaf': [1, 2, 4, 10],\n",
    "    'max_features': ['sqrt']\n",
    "}\n",
    "\n",
    "#f1 = 0.7768 gini,'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100\n",
    "# param_grid_rfc = {\n",
    "#     'n_estimators': [50, 100, 150, 200],\n",
    "#     'criterion': ['gini', 'entropy'],\n",
    "#     'max_depth': [None, 10, 20, 30],\n",
    "#     'min_samples_split': [2, 5, 10],\n",
    "#     'min_samples_leaf': [1, 2, 4]\n",
    "# }\n",
    "\n",
    "grid_search_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, cv=5, scoring='f1', n_jobs=-1)\n",
    "grid_search_rfc.fit(X, y)\n",
    "\n",
    "print(f\"Best parameters for RandomForestClassifier: {grid_search_rfc.best_params_}\")\n",
    "print(f\"Best F1 Score for RandomForestClassifier: {grid_search_rfc.best_score_}\")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-23T10:41:26.414752Z",
     "start_time": "2024-10-23T10:37:06.347046Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Panda method"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1600 entries, 0 to 1599\n",
      "Columns: 103 entries, Num (Col 1) to Num (Col 103)\n",
      "dtypes: float64(103)\n",
      "memory usage: 1.3 MB\n",
      "None\n",
      "      Nom (Col 104)  Nom (Col 105)\n",
      "0               0.0            0.0\n",
      "1               0.0            1.0\n",
      "2               0.0            0.0\n",
      "3               0.0            1.0\n",
      "4               0.0            0.0\n",
      "...             ...            ...\n",
      "1595            0.0            0.0\n",
      "1596            0.0            0.0\n",
      "1597            0.0            0.0\n",
      "1598            NaN            0.0\n",
      "1599            NaN            0.0\n",
      "\n",
      "[1600 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "df= pd.read_csv('DM_Project_24.csv')\n",
    "features = df.iloc[:,:-1]\n",
    "label = df.iloc[:,-1]\n",
    "numberical = df.iloc[:,:103]\n",
    "nominal = df.iloc[:,103:-1]\n",
    "\n",
    "\n",
    "# print(df.info())\n",
    "# print(df.describe())\n",
    "# df.head(10)\n",
    "# df_impu_all = df.copy()\n",
    "# df_impu_all.iloc[:,:3] = df_impu_all.iloc[:,:3].fillna(df_impu_all.iloc[:,:3].mean())\n",
    "# print(df_impu_all.info())\n",
    "# print(\"------------------------\")\n",
    "# print(df.info())"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-10-21T08:09:42.579065200Z",
     "start_time": "2024-10-21T08:09:42.543904900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

这段代码主要实现了对一个数据集(从DM_Project_24.csv文件读取)的数据预处理、异常值检测、数据重构以及使用多种机器学习模型进行评估和调优的功能,具体如下:

  1. 数据读取与基本信息统计
    • 从DM_Project_24.csv文件读取数据,计算数据集中缺失值的总数、标签列缺失值数量、每个特征的缺失值数量,并统计了标签中0和1的数量。
    • 将数据集按特征类型进行了划分,包括所有特征、数值特征、名义特征和标签列。
  2. 数据预处理
    • 使用SimpleImputer对数值特征和名义特征的缺失值分别进行中位数填充和众数填充。
    • 对填充后的数值特征进行了归一化处理(MinMaxScaler)和标准化处理(StandardScaler)。
  3. 异常值检测
    • 使用DBSCAN算法分别对归一化和标准化后的数值特征进行异常值检测,根据给定的eps和min_samples参数确定异常值,并返回去除异常值后的数据集。
    • 使用IsolationForest算法对归一化和标准化后的数值特征进行异常值检测,根据设定的n_estimators和contamination参数确定异常值,得到去除异常值后的数据集。
    • 使用LocalOutlierFactor算法对归一化和标准化后的数值特征进行异常值检测,通过计算每个数据点的局部离群因子来确定异常值,返回删除异常值后的数据集。
  4. 数据重构
    • 将不同处理阶段的数据进行重新组合,包括原始数据填充后、归一化处理后、标准化处理后,以及经过DBSCAN、IsolationForest、LocalOutlierFactor算法处理后的数据与标签列重新合并。
  5. 模型评估与调优
    • 使用DecisionTreeClassifier、KNeighborsClassifier、RandomForestClassifier、GaussianNB等模型对处理后的数据(如if_std_data)进行交叉验证评估,计算F1分数。
    • 对DecisionTreeClassifier、KNeighborsClassifier和RandomForestClassifier模型进行了超参数调优,使用GridSearchCV寻找最佳参数组合,以提高模型性能,并输出最佳参数和最佳F1分数。

总体而言,代码涵盖了数据处理、异常值检测、模型评估和调优等机器学习项目中的常见操作流程,用于分析和处理给定数据集以构建合适的预测模型。


原文地址:https://blog.csdn.net/huanghm88/article/details/143371033

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!