This Python notebook implements data preprocessing, outlier detection, data reconstruction, and evaluation/tuning of multiple machine-learning models on a dataset read from the CSV file DM_Project_24.csv.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-10-23T03:53:51.550153600Z",
"start_time": "2024-10-23T03:53:51.542117400Z"
}
},
"outputs": [],
"source": [
"#1. imputation\n",
"#2. outlier detection\n",
"#3. Normalization\n",
"#4."
]
},
{
"cell_type": "code",
"execution_count": 315,
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"np.random.seed(42)\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import metrics"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:48.194406300Z",
"start_time": "2024-10-23T11:04:48.177847900Z"
}
}
},
{
"cell_type": "code",
"execution_count": 316,
"outputs": [],
"source": [
"data = np.genfromtxt('DM_Project_24.csv',delimiter=\",\",skip_header=1)\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:48.750530300Z",
"start_time": "2024-10-23T11:04:48.650819800Z"
}
}
},
{
"cell_type": "code",
"execution_count": 317,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total Missing Values is:9499\n",
"Total Missing Values on Label is :0\n",
"Number of missing values by feature:\n",
" [ 67 101 86 83 101 108 78 90 88 91 81 110 83 94 78 80 107 85\n",
" 87 100 96 90 84 96 83 90 90 78 82 95 107 98 80 82 97 83\n",
" 83 86 98 75 83 79 81 95 90 71 100 84 88 77 98 98 91 99\n",
" 104 73 85 103 78 104 84 87 81 85 89 89 94 81 83 88 95 75\n",
" 95 82 81 90 87 84 84 87 86 99 92 85 80 98 102 106 81 101\n",
" 88 101 95 101 95 89 95 98 89 82 109 79 81 151 143 0]\n",
"Numeber of label 0: 1406\n",
"Number of label 1: 194\n"
]
}
],
"source": [
"#calculate total number of missing values\n",
"total_Value = np.sum(np.isnan(data))\n",
"print(f'Total Missing Values is:{total_Value}')\n",
"\n",
"#calculate count of missing values on labels\n",
"missing_on_label = np.sum(np.isnan(data[:,-1]))\n",
"print(f'Total Missing Values on Label is :{missing_on_label}')\n",
"\n",
"#show number of missing values by each feature\n",
"missing_on_feature = np.sum(np.isnan(data),axis=0)\n",
"print(f'Number of missing values by feature:\\n {missing_on_feature}')\n",
"\n",
"#calculate number of labels 1 and 0\n",
"num_label_zero = np.sum(data[:,-1] == 0)\n",
"num_label_one = np.sum(data[:,-1] == 1)\n",
"print(f'Numeber of label 0: {num_label_zero}')\n",
"print(f'Number of label 1: {num_label_one}')\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:49.245104700Z",
"start_time": "2024-10-23T11:04:49.212110900Z"
}
}
},
{
"cell_type": "code",
"execution_count": 318,
"outputs": [],
"source": [
"col_features = data[:,:-1]\n",
"col_label = data[:,-1]\n",
"col_numerical = data[:,:103]\n",
"col_nominal = data[:,103:-1]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:49.774289600Z",
"start_time": "2024-10-23T11:04:49.742662500Z"
}
}
},
{
"cell_type": "code",
"execution_count": 319,
"outputs": [
{
"data": {
"text/plain": "(1600, 2)"
},
"execution_count": 319,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Missing value\n",
"from sklearn.impute import SimpleImputer\n",
"numerical_imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')\n",
"nominal_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')\n",
"impute_col_numerical = numerical_imputer.fit_transform(col_numerical)\n",
"impute_col_nominal = nominal_imputer.fit_transform(col_nominal)\n",
"impute_col_nominal.shape"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:50.262671100Z",
"start_time": "2024-10-23T11:04:50.230889900Z"
}
}
},
{
"cell_type": "code",
"execution_count": 320,
"outputs": [],
"source": [
"#Normalisation\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"min_max_scaler = MinMaxScaler()\n",
"standard_scaler = StandardScaler()\n",
"\n",
"minmax_col_numerical = min_max_scaler.fit_transform(impute_col_numerical)\n",
"std_col_numerical = standard_scaler.fit_transform(impute_col_numerical)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:50.965009600Z",
"start_time": "2024-10-23T11:04:50.949314900Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"# Z-score"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 320,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:52.112475400Z",
"start_time": "2024-10-23T11:04:52.082097100Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"# DBSCAN"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 321,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of samples in remove_minmax_numerical: 1555\n",
"Number of samples in remove_std_numerical: 1552\n"
]
}
],
"source": [
"from sklearn.cluster import DBSCAN\n",
"# MinMax\n",
"dbscan = DBSCAN(eps=0.2, min_samples=5)\n",
"db_minmax_outliers = dbscan.fit_predict(minmax_col_numerical)\n",
"db_minmax_inliers_mask = db_minmax_outliers != -1 # -1 marks noise points, i.e. outliers\n",
"remove_minmax_db_numerical = minmax_col_numerical[db_minmax_inliers_mask]\n",
"remove_minmax_db_nominal = impute_col_nominal[db_minmax_inliers_mask]\n",
"db_minmax_labels = col_label[db_minmax_inliers_mask]\n",
"print(f\"Number of samples in remove_minmax_numerical: {remove_minmax_db_numerical.shape[0]}\")\n",
"\n",
"#std\n",
"dbscan = DBSCAN(eps=4, min_samples=5)\n",
"db_std_outliers = dbscan.fit_predict(std_col_numerical)\n",
"db_std_inliers_mask = db_std_outliers != -1 # -1 marks noise points, i.e. outliers\n",
"remove_std_db_numerical = std_col_numerical[db_std_inliers_mask]\n",
"remove_std_db_nominal = impute_col_nominal[db_std_inliers_mask]\n",
"db_std_labels = col_label[db_std_inliers_mask]\n",
"print(f\"Number of samples in remove_std_numerical: {remove_std_db_numerical.shape[0]}\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:53.459760700Z",
"start_time": "2024-10-23T11:04:53.365329400Z"
}
}
},
{
"cell_type": "code",
"execution_count": 186,
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[186], line 19\u001B[0m\n\u001B[0;32m 16\u001B[0m dbscan \u001B[38;5;241m=\u001B[39m DBSCAN(eps\u001B[38;5;241m=\u001B[39meps, min_samples\u001B[38;5;241m=\u001B[39mmin_samples)\n\u001B[0;32m 18\u001B[0m \u001B[38;5;66;03m# 拟合模型并进行聚类\u001B[39;00m\n\u001B[1;32m---> 19\u001B[0m clusters \u001B[38;5;241m=\u001B[39m \u001B[43mdbscan\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit_predict\u001B[49m\u001B[43m(\u001B[49m\u001B[43mminmax_col_numerical\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 21\u001B[0m \u001B[38;5;66;03m# 如果形成的簇数量大于 1,则计算轮廓系数\u001B[39;00m\n\u001B[0;32m 22\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(\u001B[38;5;28mset\u001B[39m(clusters)) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\cluster\\_dbscan.py:454\u001B[0m, in \u001B[0;36mDBSCAN.fit_predict\u001B[1;34m(self, X, y, sample_weight)\u001B[0m\n\u001B[0;32m 429\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfit_predict\u001B[39m(\u001B[38;5;28mself\u001B[39m, X, y\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m, sample_weight\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m):\n\u001B[0;32m 430\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"Compute clusters from a data or distance matrix and predict labels.\u001B[39;00m\n\u001B[0;32m 431\u001B[0m \n\u001B[0;32m 432\u001B[0m \u001B[38;5;124;03m Parameters\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 452\u001B[0m \u001B[38;5;124;03m Cluster labels. Noisy samples are given the label -1.\u001B[39;00m\n\u001B[0;32m 453\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m--> 454\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit\u001B[49m\u001B[43m(\u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msample_weight\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msample_weight\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 455\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlabels_\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\base.py:1152\u001B[0m, in \u001B[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001B[1;34m(estimator, *args, **kwargs)\u001B[0m\n\u001B[0;32m 1145\u001B[0m estimator\u001B[38;5;241m.\u001B[39m_validate_params()\n\u001B[0;32m 1147\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m config_context(\n\u001B[0;32m 1148\u001B[0m skip_parameter_validation\u001B[38;5;241m=\u001B[39m(\n\u001B[0;32m 1149\u001B[0m prefer_skip_nested_validation \u001B[38;5;129;01mor\u001B[39;00m global_skip_validation\n\u001B[0;32m 1150\u001B[0m )\n\u001B[0;32m 1151\u001B[0m ):\n\u001B[1;32m-> 1152\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfit_method\u001B[49m\u001B[43m(\u001B[49m\u001B[43mestimator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\cluster\\_dbscan.py:418\u001B[0m, in \u001B[0;36mDBSCAN.fit\u001B[1;34m(self, X, y, sample_weight)\u001B[0m\n\u001B[0;32m 415\u001B[0m core_samples \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39masarray(n_neighbors \u001B[38;5;241m>\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmin_samples, dtype\u001B[38;5;241m=\u001B[39mnp\u001B[38;5;241m.\u001B[39muint8)\n\u001B[0;32m 416\u001B[0m dbscan_inner(core_samples, neighborhoods, labels)\n\u001B[1;32m--> 418\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcore_sample_indices_ \u001B[38;5;241m=\u001B[39m \u001B[43mnp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mwhere\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcore_samples\u001B[49m\u001B[43m)\u001B[49m[\u001B[38;5;241m0\u001B[39m]\n\u001B[0;32m 419\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlabels_ \u001B[38;5;241m=\u001B[39m labels\n\u001B[0;32m 421\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcore_sample_indices_):\n\u001B[0;32m 422\u001B[0m \u001B[38;5;66;03m# fix for scipy sparse indexing issue\u001B[39;00m\n",
"File \u001B[1;32m<__array_function__ internals>:200\u001B[0m, in \u001B[0;36mwhere\u001B[1;34m(*args, **kwargs)\u001B[0m\n",
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
]
}
],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T05:53:51.644205Z",
"start_time": "2024-10-23T05:53:43.489842700Z"
}
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Outlier detection using Isolation Forest"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 271,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of samples in remove_minmax_numerical: 1508\n",
"Number of samples in remove_std_numerical: 1508\n"
]
}
],
"source": [
"from sklearn.ensemble import IsolationForest\n",
"iso_forest = IsolationForest(n_estimators=118, contamination=0.0575, max_features=1)\n",
"if_minmax_outliers = iso_forest.fit_predict(minmax_col_numerical)\n",
"if_minmax_inliers_mask = if_minmax_outliers == 1\n",
"remove_minmax_if_numerical = minmax_col_numerical[if_minmax_inliers_mask]\n",
"remove_minmax_if_nominal = impute_col_nominal[if_minmax_inliers_mask]\n",
"if_minmax_labels = col_label[if_minmax_inliers_mask]\n",
"print(f\"Number of samples in remove_minmax_numerical: {remove_minmax_if_numerical.shape[0]}\")\n",
"\n",
"\n",
"if_std_outliers = iso_forest.fit_predict(std_col_numerical)\n",
"if_std_inliers_mask = if_std_outliers ==1\n",
"remove_std_if_numerical = std_col_numerical[if_std_inliers_mask]\n",
"remove_std_if_nominal = impute_col_nominal[if_std_inliers_mask]\n",
"if_std_lable = col_label[if_std_inliers_mask]\n",
"print(f\"Number of samples in remove_std_numerical: {remove_std_if_numerical.shape[0]}\")\n",
"\n",
"\n",
"\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T06:36:27.531422400Z",
"start_time": "2024-10-23T06:36:27.337119700Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Outlier detection using LOF"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 272,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MinMax删除异常值后的数据点数量: 1549\n",
"std删除异常值后的数据点数量: 1555\n"
]
}
],
"source": [
"from sklearn.neighbors import LocalOutlierFactor\n",
"from collections import Counter\n",
"\n",
"lofer = LocalOutlierFactor(n_neighbors=95)\n",
"\n",
"minmax_outliers = lofer.fit_predict(minmax_col_numerical)\n",
"minmax_outlier_count = Counter(minmax_outliers)\n",
"lof_minmax_inliers_mask = minmax_outliers == 1\n",
"remove_minmax_lof_numerical = minmax_col_numerical[lof_minmax_inliers_mask]\n",
"minmax_lof_labels = col_label[lof_minmax_inliers_mask]\n",
"remove_minmax_lof_nominal = impute_col_nominal[lof_minmax_inliers_mask]\n",
"print(f\"MinMax删除异常值后的数据点数量: {remove_minmax_lof_numerical.shape[0]}\")\n",
"\n",
"\n",
"std_outliers = lofer.fit_predict(std_col_numerical)\n",
"std_outlier_count = Counter(std_outliers)\n",
"lof_std_inliers_mask = std_outliers == 1\n",
"remove_std_numerical = std_col_numerical[lof_std_inliers_mask]\n",
"std_labels = col_label[lof_std_inliers_mask]\n",
"remove_std_lof_nominal = impute_col_nominal[lof_std_inliers_mask]\n",
"print(f\"std删除异常值后的数据点数量: {remove_std_numerical.shape[0]}\")\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T06:36:33.175575200Z",
"start_time": "2024-10-23T06:36:33.005551900Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"# Reconstruct Data"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 322,
"outputs": [],
"source": [
"#Missing value\n",
"data_impute_without_label = np.concatenate((impute_col_numerical,impute_col_nominal),axis=1)\n",
"impute_data = np.concatenate((data_impute_without_label,col_label.reshape(-1,1)),axis=1)\n",
"\n",
"#Missing value, Minmax\n",
"data_minmax_without_label = np.concatenate((minmax_col_numerical,impute_col_nominal),axis=1)\n",
"minmax_data = np.concatenate((data_minmax_without_label,col_label.reshape(-1, 1)),axis=1)\n",
"\n",
"#Missing value, standard\n",
"data_std_without_label = np.concatenate((std_col_numerical,impute_col_nominal),axis=1)\n",
"std_data = np.concatenate((data_std_without_label,col_label.reshape(-1, 1)),axis=1)\n",
"#impute_data.shape\n",
"\n",
"#Missing value, Min-Max, DBSCAN\n",
"data_db_without_lable = np.concatenate((remove_minmax_db_numerical,remove_minmax_db_nominal),axis=1)\n",
"db_minmax_data = np.concatenate((data_db_without_lable,db_minmax_labels.reshape(-1, 1)),axis=1)\n",
"#db_minmax_data.shape\n",
"\n",
"data_db_std_without_lable = np.concatenate((remove_std_db_numerical,remove_std_db_nominal),axis=1)\n",
"db_std_data = np.concatenate((data_db_std_without_lable,db_std_labels.reshape(-1, 1)),axis=1)\n",
"\n",
"\n",
"\n",
"\n",
"#Missing value, Min-Max, Isolation Forest\n",
"data_if_without_lable = np.concatenate((remove_minmax_if_numerical,remove_minmax_if_nominal),axis=1)\n",
"if_minmax_data = np.concatenate((data_if_without_lable,if_minmax_labels.reshape(-1, 1)),axis=1)\n",
"#if_minmax_data.shape\n",
"\n",
"#Missing value, Standard, Isolation Forest\n",
"data_if_std_without_lable = np.concatenate((remove_std_if_numerical,remove_std_if_nominal),axis=1)\n",
"if_std_data = np.concatenate((data_if_std_without_lable,if_std_lable.reshape(-1,1)),axis=1)\n",
"#if_std_data.shape\n",
"\n",
"#Missing value, Min-Max, LOF\n",
"data_without_lable = np.concatenate((remove_minmax_lof_numerical,remove_minmax_lof_nominal),axis=1)\n",
"#data_without_lable = np.concatenate((remove_minmax_numerical,impute_col_nominal),axis=1)\n",
"lof_minmax_data = np.concatenate((data_without_lable,minmax_lof_labels.reshape(-1, 1)),axis=1)\n",
"#lof_minmax_data.shape\n",
"\n",
"#Missing value, Standard, LOF\n",
"data_without_lable = np.concatenate((remove_std_numerical,remove_std_lof_nominal),axis=1)\n",
"lof_std_data = np.concatenate((data_without_lable,std_labels.reshape(-1, 1)),axis=1)\n",
"#lof_std_data.shape\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:05:00.838247800Z",
"start_time": "2024-10-23T11:05:00.806645100Z"
}
}
},
{
"cell_type": "code",
"execution_count": 322,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:05:01.567003100Z",
"start_time": "2024-10-23T11:05:01.549144800Z"
}
}
},
{
"cell_type": "code",
"execution_count": 325,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.672668604141718\n"
]
}
],
"source": [
"# Decision Tress Implementation\n",
"from sklearn import tree\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"dt = tree.DecisionTreeClassifier(random_state=42 )\n",
"X = if_std_data[:, :-1]\n",
"y = if_std_data[:,-1]\n",
"dt_score = cross_val_score(dt,X,y,scoring=\"f1\",cv = 5)\n",
"print(sum(dt_score)/5)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:05:42.246363800Z",
"start_time": "2024-10-23T11:05:40.384686100Z"
}
}
},
{
"cell_type": "code",
"execution_count": 297,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.07141067616800681\n"
]
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"neigh = KNeighborsClassifier()\n",
"knn_score = cross_val_score(neigh,X,y,scoring=\"f1\",cv = 5)\n",
"print(sum(knn_score)/5)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T06:44:26.602704300Z",
"start_time": "2024-10-23T06:44:26.478424600Z"
}
}
},
{
"cell_type": "code",
"execution_count": 298,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7311561426901243\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"rfc = RandomForestClassifier()\n",
"rfc_score = cross_val_score(rfc,X,y,scoring=\"f1\",cv = 5)\n",
"print(sum(rfc_score)/5)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T06:44:35.550518Z",
"start_time": "2024-10-23T06:44:26.602704300Z"
}
}
},
{
"cell_type": "code",
"execution_count": 299,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.21726542893236117\n"
]
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"gnb = GaussianNB()\n",
"gnb_score = cross_val_score(gnb,X,y,scoring=\"f1\",cv = 5)\n",
"print(sum(gnb_score)/5)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T06:44:35.603433700Z",
"start_time": "2024-10-23T06:44:35.552596900Z"
}
}
},
{
"cell_type": "code",
"execution_count": 330,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}\n",
"Best F1 Score: 0.773772741931235\n"
]
}
],
"source": [
"from sklearn.metrics import make_scorer, f1_score\n",
"dt = tree.DecisionTreeClassifier(random_state=42,ccp_alpha=0.01)\n",
"\n",
"\n",
"f1_scorer = make_scorer(f1_score, average='binary')\n",
"X = if_minmax_data[:, :-1]\n",
"y = if_minmax_data[:,-1]\n",
"\n",
"param_grid = {\n",
" 'criterion':['gini','entropy','log_loss'],\n",
" 'splitter' : ['best'],\n",
" 'max_depth': [3,5,7,10,15, 20, 30],\n",
" 'min_samples_split': [2,3,4, 5, 9,10,15],\n",
" 'min_samples_leaf': [1, 2,3, 4,6,10]\n",
"}\n",
"#criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best f1:0.7716\n",
"# param_grid = {\n",
"# 'criterion':['gini','entropy','log_loss'],\n",
"# 'splitter' : ['best'],\n",
"# 'max_depth': [3, 5, 10, 15, 20, 30],\n",
"# 'min_samples_split': [2,3,4, 5, 9,10,15],\n",
"# 'min_samples_leaf': [1, 2,3, 4,6,10]\n",
"# }\n",
"grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=10, scoring=f1_scorer, n_jobs=-1)\n",
"grid_search.fit(X, y)\n",
"print(f\"Best parameters: {grid_search.best_params_}\")\n",
"print(f\"Best F1 Score: {grid_search.best_score_}\")\n",
"# cv = [5,10,15,20,25,30]\n",
"# for c in cv:\n",
"# grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=c, scoring='f1', n_jobs=-1)\n",
"# grid_search.fit(X, y)\n",
"# print(f'cv:{c}')\n",
"# print(f\"Best parameters: {grid_search.best_params_}\")\n",
"# print(f\"Best F1 Score: {grid_search.best_score_}\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:41:29.914445Z",
"start_time": "2024-10-23T11:40:09.879004100Z"
}
}
},
{
"cell_type": "code",
"execution_count": 303,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters for KNN: {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}\n",
"Best F1 Score for KNN: 0.7780195374467731\n"
]
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"neigh = KNeighborsClassifier()\n",
"param_grid_knn = {\n",
" 'n_neighbors': [1,2,3,4, 5,6, 7,8, 9,10,11],\n",
" 'weights': ['uniform', 'distance'],\n",
" 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n",
" 'metric': ['minkowski', 'euclidean', 'manhattan'],\n",
" 'p': [1, 2] # 1 for Manhattan distance, 2 for Euclidean distance\n",
"}\n",
"\n",
"grid_search_knn = GridSearchCV(estimator=neigh, param_grid=param_grid_knn, cv=5, scoring='f1', n_jobs=-1)\n",
"grid_search_knn.fit(X, y)\n",
"\n",
"print(f\"Best parameters for KNN: {grid_search_knn.best_params_}\")\n",
"print(f\"Best F1 Score for KNN: {grid_search_knn.best_score_}\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T07:02:54.568379900Z",
"start_time": "2024-10-23T07:02:51.141055600Z"
}
}
},
{
"cell_type": "code",
"execution_count": 310,
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[310], line 22\u001B[0m\n\u001B[0;32m 12\u001B[0m \u001B[38;5;66;03m#f1 = 0.7768 gini,'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100\u001B[39;00m\n\u001B[0;32m 13\u001B[0m \u001B[38;5;66;03m# param_grid_rfc = {\u001B[39;00m\n\u001B[0;32m 14\u001B[0m \u001B[38;5;66;03m# 'n_estimators': [50, 100, 150, 200],\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 18\u001B[0m \u001B[38;5;66;03m# 'min_samples_leaf': [1, 2, 4]\u001B[39;00m\n\u001B[0;32m 19\u001B[0m \u001B[38;5;66;03m# }\u001B[39;00m\n\u001B[0;32m 21\u001B[0m grid_search_rfc \u001B[38;5;241m=\u001B[39m GridSearchCV(estimator\u001B[38;5;241m=\u001B[39mrfc, param_grid\u001B[38;5;241m=\u001B[39mparam_grid_rfc, cv\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m5\u001B[39m, scoring\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mf1\u001B[39m\u001B[38;5;124m'\u001B[39m, n_jobs\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)\n\u001B[1;32m---> 22\u001B[0m \u001B[43mgrid_search_rfc\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit\u001B[49m\u001B[43m(\u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43my\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 24\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mBest parameters for RandomForestClassifier: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mgrid_search_rfc\u001B[38;5;241m.\u001B[39mbest_params_\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 25\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mBest F1 Score for RandomForestClassifier: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mgrid_search_rfc\u001B[38;5;241m.\u001B[39mbest_score_\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\base.py:1152\u001B[0m, in \u001B[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001B[1;34m(estimator, *args, **kwargs)\u001B[0m\n\u001B[0;32m 1145\u001B[0m estimator\u001B[38;5;241m.\u001B[39m_validate_params()\n\u001B[0;32m 1147\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m config_context(\n\u001B[0;32m 1148\u001B[0m skip_parameter_validation\u001B[38;5;241m=\u001B[39m(\n\u001B[0;32m 1149\u001B[0m prefer_skip_nested_validation \u001B[38;5;129;01mor\u001B[39;00m global_skip_validation\n\u001B[0;32m 1150\u001B[0m )\n\u001B[0;32m 1151\u001B[0m ):\n\u001B[1;32m-> 1152\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfit_method\u001B[49m\u001B[43m(\u001B[49m\u001B[43mestimator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\model_selection\\_search.py:898\u001B[0m, in \u001B[0;36mBaseSearchCV.fit\u001B[1;34m(self, X, y, groups, **fit_params)\u001B[0m\n\u001B[0;32m 892\u001B[0m results \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_format_results(\n\u001B[0;32m 893\u001B[0m all_candidate_params, n_splits, all_out, all_more_results\n\u001B[0;32m 894\u001B[0m )\n\u001B[0;32m 896\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m results\n\u001B[1;32m--> 898\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_run_search\u001B[49m\u001B[43m(\u001B[49m\u001B[43mevaluate_candidates\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 900\u001B[0m \u001B[38;5;66;03m# multimetric is determined here because in the case of a callable\u001B[39;00m\n\u001B[0;32m 901\u001B[0m \u001B[38;5;66;03m# self.scoring the return type is only known after calling\u001B[39;00m\n\u001B[0;32m 902\u001B[0m first_test_score \u001B[38;5;241m=\u001B[39m all_out[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtest_scores\u001B[39m\u001B[38;5;124m\"\u001B[39m]\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\model_selection\\_search.py:1422\u001B[0m, in \u001B[0;36mGridSearchCV._run_search\u001B[1;34m(self, evaluate_candidates)\u001B[0m\n\u001B[0;32m 1420\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m_run_search\u001B[39m(\u001B[38;5;28mself\u001B[39m, evaluate_candidates):\n\u001B[0;32m 1421\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"Search all candidates in param_grid\"\"\"\u001B[39;00m\n\u001B[1;32m-> 1422\u001B[0m \u001B[43mevaluate_candidates\u001B[49m\u001B[43m(\u001B[49m\u001B[43mParameterGrid\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mparam_grid\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\model_selection\\_search.py:845\u001B[0m, in \u001B[0;36mBaseSearchCV.fit.<locals>.evaluate_candidates\u001B[1;34m(candidate_params, cv, more_results)\u001B[0m\n\u001B[0;32m 837\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mverbose \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n\u001B[0;32m 838\u001B[0m \u001B[38;5;28mprint\u001B[39m(\n\u001B[0;32m 839\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mFitting \u001B[39m\u001B[38;5;132;01m{0}\u001B[39;00m\u001B[38;5;124m folds for each of \u001B[39m\u001B[38;5;132;01m{1}\u001B[39;00m\u001B[38;5;124m candidates,\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 840\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m totalling \u001B[39m\u001B[38;5;132;01m{2}\u001B[39;00m\u001B[38;5;124m fits\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mformat(\n\u001B[0;32m 841\u001B[0m n_splits, n_candidates, n_candidates \u001B[38;5;241m*\u001B[39m n_splits\n\u001B[0;32m 842\u001B[0m )\n\u001B[0;32m 843\u001B[0m )\n\u001B[1;32m--> 845\u001B[0m out \u001B[38;5;241m=\u001B[39m \u001B[43mparallel\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 846\u001B[0m \u001B[43m \u001B[49m\u001B[43mdelayed\u001B[49m\u001B[43m(\u001B[49m\u001B[43m_fit_and_score\u001B[49m\u001B[43m)\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 847\u001B[0m \u001B[43m \u001B[49m\u001B[43mclone\u001B[49m\u001B[43m(\u001B[49m\u001B[43mbase_estimator\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 848\u001B[0m \u001B[43m \u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 849\u001B[0m \u001B[43m \u001B[49m\u001B[43my\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 850\u001B[0m \u001B[43m \u001B[49m\u001B[43mtrain\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 851\u001B[0m \u001B[43m 
\u001B[49m\u001B[43mtest\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtest\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 852\u001B[0m \u001B[43m \u001B[49m\u001B[43mparameters\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mparameters\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 853\u001B[0m \u001B[43m \u001B[49m\u001B[43msplit_progress\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43msplit_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_splits\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 854\u001B[0m \u001B[43m \u001B[49m\u001B[43mcandidate_progress\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcand_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_candidates\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 855\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mfit_and_score_kwargs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 856\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 857\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[43mcand_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mparameters\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[43msplit_idx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43m(\u001B[49m\u001B[43mtrain\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtest\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mproduct\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 858\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43menumerate\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcandidate_params\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m 
\u001B[49m\u001B[38;5;28;43menumerate\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcv\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msplit\u001B[49m\u001B[43m(\u001B[49m\u001B[43mX\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43my\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgroups\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 859\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 860\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 862\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(out) \u001B[38;5;241m<\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m 863\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m 864\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mNo fits were performed. \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 865\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mWas the CV iterator empty? \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 866\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mWere there no candidates?\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 867\u001B[0m )\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\sklearn\\utils\\parallel.py:65\u001B[0m, in \u001B[0;36mParallel.__call__\u001B[1;34m(self, iterable)\u001B[0m\n\u001B[0;32m 60\u001B[0m config \u001B[38;5;241m=\u001B[39m get_config()\n\u001B[0;32m 61\u001B[0m iterable_with_config \u001B[38;5;241m=\u001B[39m (\n\u001B[0;32m 62\u001B[0m (_with_config(delayed_func, config), args, kwargs)\n\u001B[0;32m 63\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m delayed_func, args, kwargs \u001B[38;5;129;01min\u001B[39;00m iterable\n\u001B[0;32m 64\u001B[0m )\n\u001B[1;32m---> 65\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__call__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43miterable_with_config\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\joblib\\parallel.py:2007\u001B[0m, in \u001B[0;36mParallel.__call__\u001B[1;34m(self, iterable)\u001B[0m\n\u001B[0;32m 2001\u001B[0m \u001B[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001B[39;00m\n\u001B[0;32m 2002\u001B[0m \u001B[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001B[39;00m\n\u001B[0;32m 2003\u001B[0m \u001B[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001B[39;00m\n\u001B[0;32m 2004\u001B[0m \u001B[38;5;66;03m# dispatch of the tasks to the workers.\u001B[39;00m\n\u001B[0;32m 2005\u001B[0m \u001B[38;5;28mnext\u001B[39m(output)\n\u001B[1;32m-> 2007\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m output \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mreturn_generator \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;28;43mlist\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43moutput\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\joblib\\parallel.py:1650\u001B[0m, in \u001B[0;36mParallel._get_outputs\u001B[1;34m(self, iterator, pre_dispatch)\u001B[0m\n\u001B[0;32m 1647\u001B[0m \u001B[38;5;28;01myield\u001B[39;00m\n\u001B[0;32m 1649\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backend\u001B[38;5;241m.\u001B[39mretrieval_context():\n\u001B[1;32m-> 1650\u001B[0m \u001B[38;5;28;01myield from\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_retrieve()\n\u001B[0;32m 1652\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mGeneratorExit\u001B[39;00m:\n\u001B[0;32m 1653\u001B[0m \u001B[38;5;66;03m# The generator has been garbage collected before being fully\u001B[39;00m\n\u001B[0;32m 1654\u001B[0m \u001B[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001B[39;00m\n\u001B[0;32m 1655\u001B[0m \u001B[38;5;66;03m# the user if necessary.\u001B[39;00m\n\u001B[0;32m 1656\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_exception \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n",
"File \u001B[1;32mD:\\ANACONDA\\envs\\infs7410\\lib\\site-packages\\joblib\\parallel.py:1762\u001B[0m, in \u001B[0;36mParallel._retrieve\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 1757\u001B[0m \u001B[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001B[39;00m\n\u001B[0;32m 1758\u001B[0m \u001B[38;5;66;03m# async callbacks to progress.\u001B[39;00m\n\u001B[0;32m 1759\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m ((\u001B[38;5;28mlen\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jobs) \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m0\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m\n\u001B[0;32m 1760\u001B[0m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jobs[\u001B[38;5;241m0\u001B[39m]\u001B[38;5;241m.\u001B[39mget_status(\n\u001B[0;32m 1761\u001B[0m timeout\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtimeout) \u001B[38;5;241m==\u001B[39m TASK_PENDING)):\n\u001B[1;32m-> 1762\u001B[0m \u001B[43mtime\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msleep\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m0.01\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1763\u001B[0m \u001B[38;5;28;01mcontinue\u001B[39;00m\n\u001B[0;32m 1765\u001B[0m \u001B[38;5;66;03m# We need to be careful: the job list can be filling up as\u001B[39;00m\n\u001B[0;32m 1766\u001B[0m \u001B[38;5;66;03m# we empty it and Python list are not thread-safe by\u001B[39;00m\n\u001B[0;32m 1767\u001B[0m \u001B[38;5;66;03m# default hence the use of the lock\u001B[39;00m\n",
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
]
}
],
"source": [
"rfc = RandomForestClassifier(random_state=42)\n",
"\n",
"param_grid_rfc = {\n",
" 'n_estimators': [100, 200, 300, 500],\n",
" 'criterion': ['gini', 'entropy'],\n",
" 'max_depth': [None, 30, 40, 50],\n",
" 'min_samples_split': [2, 5, 10, 15],\n",
" 'min_samples_leaf': [1, 2, 4, 10],\n",
" 'max_features': ['sqrt']\n",
"}\n",
"\n",
"#f1 = 0.7768 gini,'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100\n",
"# param_grid_rfc = {\n",
"# 'n_estimators': [50, 100, 150, 200],\n",
"# 'criterion': ['gini', 'entropy'],\n",
"# 'max_depth': [None, 10, 20, 30],\n",
"# 'min_samples_split': [2, 5, 10],\n",
"# 'min_samples_leaf': [1, 2, 4]\n",
"# }\n",
"\n",
"grid_search_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, cv=5, scoring='f1', n_jobs=-1)\n",
"grid_search_rfc.fit(X, y)\n",
"\n",
"print(f\"Best parameters for RandomForestClassifier: {grid_search_rfc.best_params_}\")\n",
"print(f\"Best F1 Score for RandomForestClassifier: {grid_search_rfc.best_score_}\")\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T10:41:26.414752Z",
"start_time": "2024-10-23T10:37:06.347046Z"
}
}
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Panda method"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1600 entries, 0 to 1599\n",
"Columns: 103 entries, Num (Col 1) to Num (Col 103)\n",
"dtypes: float64(103)\n",
"memory usage: 1.3 MB\n",
"None\n",
" Nom (Col 104) Nom (Col 105)\n",
"0 0.0 0.0\n",
"1 0.0 1.0\n",
"2 0.0 0.0\n",
"3 0.0 1.0\n",
"4 0.0 0.0\n",
"... ... ...\n",
"1595 0.0 0.0\n",
"1596 0.0 0.0\n",
"1597 0.0 0.0\n",
"1598 NaN 0.0\n",
"1599 NaN 0.0\n",
"\n",
"[1600 rows x 2 columns]\n"
]
}
],
"source": [
"df= pd.read_csv('DM_Project_24.csv')\n",
"features = df.iloc[:,:-1]\n",
"label = df.iloc[:,-1]\n",
"numberical = df.iloc[:,:103]\n",
"nominal = df.iloc[:,103:-1]\n",
"\n",
"\n",
"# print(df.info())\n",
"# print(df.describe())\n",
"# df.head(10)\n",
"# df_impu_all = df.copy()\n",
"# df_impu_all.iloc[:,:3] = df_impu_all.iloc[:,:3].fillna(df_impu_all.iloc[:,:3].mean())\n",
"# print(df_impu_all.info())\n",
"# print(\"------------------------\")\n",
"# print(df.info())"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-21T08:09:42.579065200Z",
"start_time": "2024-10-21T08:09:42.543904900Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
这段代码主要实现了对一个数据集(从DM_Project_24.csv
文件读取)的数据预处理、异常值检测、数据重构以及使用多种机器学习模型进行评估和调优的功能,具体如下:
- 数据读取与基本信息统计
  - 从 DM_Project_24.csv 文件读取数据,计算数据集中缺失值的总数、标签列缺失值数量、每个特征的缺失值数量,并统计了标签中 0 和 1 的数量。
  - 将数据集按特征类型进行了划分,包括所有特征、数值特征、名义特征和标签列。
- 数据预处理
  - 使用 SimpleImputer 对数值特征和名义特征的缺失值分别进行中位数填充和众数填充。
  - 对填充后的数值特征进行了归一化处理(MinMaxScaler)和标准化处理(StandardScaler)。
- 异常值检测
  - 使用 DBSCAN 算法分别对归一化和标准化后的数值特征进行异常值检测,根据给定的 eps 和 min_samples 参数确定异常值,并返回去除异常值后的数据集。
  - 使用 IsolationForest 算法对归一化和标准化后的数值特征进行异常值检测,根据设定的 n_estimators 和 contamination 参数确定异常值,得到去除异常值后的数据集。
  - 使用 LocalOutlierFactor 算法对归一化和标准化后的数值特征进行异常值检测,通过计算每个数据点的局部离群因子来确定异常值,返回删除异常值后的数据集。
- 数据重构
  - 将不同处理阶段的数据进行重新组合,包括原始数据填充后、归一化处理后、标准化处理后、经过 DBSCAN、IsolationForest、LocalOutlierFactor 算法处理后的数据与标签列重新合并。
- 模型评估与调优
  - 使用 DecisionTreeClassifier、KNeighborsClassifier、RandomForestClassifier、GaussianNB 等模型对处理后的数据(如 if_std_data)进行交叉验证评估,计算 F1 分数。
  - 对 DecisionTreeClassifier、KNeighborsClassifier、RandomForestClassifier 模型进行了超参数调优,使用 GridSearchCV 寻找最佳参数组合,以提高模型性能,并输出最佳参数和最佳 F1 分数。
总体而言,代码涵盖了数据处理、异常值检测、模型评估和调优等机器学习项目中的常见操作流程,用于分析和处理给定数据集以构建合适的预测模型。
原文地址:https://blog.csdn.net/huanghm88/article/details/143371033
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!