自学内容网

6 回归集成:xgb、lgb、cat

 这个代码拷贝自 Kaggle,从中主要可以学习以下几点:

  1. 如何使用三个树模型模块化训练;
  2. 文本特征如何做,如何挖掘;
  3. 时间特征的处理;
  4. 模型权重集成;
import pandas as pd 
import math
import numpy as np 
import joblib 
import optuna

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.model_selection import *

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

import datetime
import gc
from sklearn.base import clone

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")


# Load the sample submission, the test set and the training set.
d_s = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/solution_example.csv')

te_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')

tr_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')


# The row identifier is not used as a model feature.
tr_d.drop('id', axis=1, inplace=True)
te_d.drop('id', axis=1, inplace=True)

# Replace missing holiday names with the literal string 'None'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on a temporary under
# pandas Copy-on-Write and would leave the frame unchanged.
tr_d['holiday_name'] = tr_d['holiday_name'].fillna('None')
te_d['holiday_name'] = te_d['holiday_name'].fillna('None')

def Process_Date(Df, group_period=None):
    """Add calendar-derived feature columns to ``Df`` and return it.

    ``Df`` is modified in place (the same object is returned). It must
    contain the columns ``date``, ``holiday`` and ``shops_closed``.

    Columns added, in this order: ``year``, ``day``, ``month``,
    ``month_name``, ``day_of_week``, ``week``, ``year_sin``, ``year_cos``,
    ``month_sin``, ``month_cos``, ``day_sin``, ``day_cos``, ``group``,
    ``total_holidays_month``, ``total_shops_closed_week``, ``group_sin``,
    ``group_cos``.

    Args:
        Df: DataFrame to enrich; ``date`` is converted with
            ``pd.to_datetime``.
        group_period: Divisor used for the cyclic encoding of ``group``.
            ``None`` (the default) uses the largest ``group`` value found in
            ``Df`` itself, so two frames processed separately get different
            periods. Pass the same value for every frame to make
            ``group_sin`` / ``group_cos`` comparable across frames.

    Returns:
        The same ``Df`` object with the new columns attached.

    Raises:
        ValueError: If ``group_period`` is given and equals zero.
    """
    if group_period is not None and group_period == 0:
        raise ValueError("group_period must be non-zero")

    Df['date'] = pd.to_datetime(Df['date'])
    dates = Df['date'].dt

    # Columns are appended in a fixed order; the script later passes the test
    # frame to the fitted models without reordering, so keep this order stable.
    Df['year'] = dates.year
    Df['day'] = dates.day
    Df['month'] = dates.month
    Df['month_name'] = dates.month_name()
    Df['day_of_week'] = dates.day_name()
    Df['week'] = dates.isocalendar().week

    two_pi = 2 * np.pi

    # NOTE: ``year`` is an integer, so two_pi * year is a whole number of
    # turns: year_sin is ~0 and year_cos is ~1 for every row. The columns are
    # kept because downstream code selects them by name.
    Df['year_sin'] = np.sin(two_pi * Df['year'])
    Df['year_cos'] = np.cos(two_pi * Df['year'])
    Df['month_sin'] = np.sin(two_pi * Df['month'] / 12)
    Df['month_cos'] = np.cos(two_pi * Df['month'] / 12)
    Df['day_sin'] = np.sin(two_pi * Df['day'] / 31)
    Df['day_cos'] = np.cos(two_pi * Df['day'] / 31)

    # Roughly-weekly bucket index counted from 2020: 48 slots per year,
    # 4 per month, plus the 7-day chunk of the day of month.
    Df['group'] = (Df['year'] - 2020) * 48 + Df['month'] * 4 + Df['day'] // 7

    # Per-(year, month) and per-(year, week) totals broadcast back to each row.
    Df['total_holidays_month'] = Df.groupby(['year', 'month'])['holiday'].transform('sum')
    Df['total_shops_closed_week'] = Df.groupby(['year', 'week'])['shops_closed'].transform('sum')

    period = Df['group'].max() if group_period is None else group_period
    Df['group_sin'] = np.sin(two_pi * Df['group'] / period)
    Df['group_cos'] = np.cos(two_pi * Df['group'] / period)

    return Df

# Attach the calendar features to both frames.
tr_d = Process_Date(tr_d)
te_d = Process_Date(te_d)

# Keep only the listed training columns, with the target 'orders' last.
train_columns = [
    'warehouse', 'date', 'holiday_name', 'holiday', 'shops_closed',
    'winter_school_holidays', 'school_holidays',
    'year', 'day', 'month', 'month_name', 'day_of_week', 'week',
    'year_sin', 'year_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos',
    'group', 'total_holidays_month', 'total_shops_closed_week',
    'group_sin', 'group_cos',
    'orders',
]
tr_d = tr_d[train_columns]

le_month = LabelEncoder()
le_week = LabelEncoder()
le_war = LabelEncoder()

# Each encoder is fitted on the training column and then reused, unchanged,
# to map the same column of the test frame.
for column, encoder in (
    ('month_name', le_month),
    ('day_of_week', le_week),
    ('warehouse', le_war),
):
    tr_d[column] = encoder.fit_transform(tr_d[column])
    te_d[column] = encoder.transform(te_d[column])

def apply_tfidf_svd(df, text_column, max_features=1000, n_components=10,
                    transformers=None, return_transformers=False):
    """Append TF-IDF + truncated-SVD components of a text column to ``df``.

    The text in ``df[text_column]`` is vectorised with ``TfidfVectorizer``
    (English stop words removed), reduced with ``TruncatedSVD`` and the
    resulting components are concatenated to a copy of ``df`` whose index has
    been reset. New columns are named ``"<text_column>_tfidf_<i>"``.

    Args:
        df: Input DataFrame containing ``text_column``.
        text_column: Name of the column holding the text.
        max_features: ``max_features`` passed to ``TfidfVectorizer`` when a
            new vectorizer is fitted.
        n_components: Number of SVD components when a new SVD is fitted.
        transformers: Optional ``(vectorizer, svd)`` pair that has already
            been fitted. When given, both are only applied with
            ``transform`` and ``max_features`` / ``n_components`` are
            ignored. Use this for test data so it is projected onto the same
            components the training data was.
        return_transformers: When true, return ``(df, (vectorizer, svd))``
            instead of just ``df``.

    Returns:
        The augmented DataFrame, or a ``(DataFrame, (vectorizer, svd))``
        tuple when ``return_transformers`` is true.
    """
    if transformers is None:
        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
        vectors = vectorizer.fit_transform(df[text_column])

        svd = TruncatedSVD(n_components)
        x_sv = svd.fit_transform(vectors)
    else:
        # Reuse the fitted vocabulary and components; never refit here.
        vectorizer, svd = transformers
        x_sv = svd.transform(vectorizer.transform(df[text_column]))

    tfidf_df = pd.DataFrame(x_sv)
    tfidf_df.columns = [f"{text_column}_tfidf_{i}" for i in tfidf_df.columns.to_list()]

    # Reset the index so the positional concat lines rows up one-to-one.
    df = df.reset_index(drop=True)
    df = pd.concat([df, tfidf_df], axis="columns")

    if return_transformers:
        return df, (vectorizer, svd)
    return df


# Fit the vectorizer and SVD on the training text only, then apply the very
# same fitted pair to the test text. Fitting a second pair on the test frame
# would yield components in a different basis than the ones the models are
# trained on.
tr_d, holiday_tfidf = apply_tfidf_svd(tr_d, 'holiday_name', return_transformers=True)
te_d = apply_tfidf_svd(te_d, 'holiday_name', transformers=holiday_tfidf)

# The raw date and holiday text are dropped from both frames before modelling.
for frame in (tr_d, te_d):
    frame.drop(columns=['date', 'holiday_name'], inplace=True)

print(f"Shape Of Train Data is {tr_d.shape}")
print(f"Shape Of Test Data is {te_d.shape}")


%%time 

# Split the prepared training frame into target and feature matrix.
y = tr_d['orders']
X = tr_d.drop(columns='orders')

def cross_validate(model, n_splits=15):
    """Run grouped K-fold CV for ``model`` and return averaged test predictions.

    Reads the module-level ``X``, ``y`` and ``te_d``. Folds are built with
    ``GroupKFold`` using ``X['group']`` as the grouping key. For each fold a
    fresh clone of ``model`` is fitted on the training part (the validation
    part is passed as ``eval_set``), scored with MAPE on the validation part,
    and used to predict ``te_d``. The mean and standard deviation of the fold
    scores are printed.

    Args:
        model: Unfitted estimator; it is cloned per fold and its ``fit`` must
            accept an ``eval_set`` keyword.
        n_splits: Number of folds.

    Returns:
        Array of length ``len(te_d)``: the mean of the per-fold predictions
        on ``te_d``.
    """
    splitter = GroupKFold(n_splits=n_splits)
    fold_groups = X['group']

    fold_scores = []
    test_preds = np.zeros(len(te_d))

    for trn_idx, val_idx in splitter.split(X, y, groups=fold_groups):
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        fold_model = clone(model)
        fold_model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)])

        val_pred = fold_model.predict(X_val)
        fold_scores.append(mean_absolute_percentage_error(y_val, val_pred))

        # Each fold contributes an equal 1/n_splits share of the final average.
        test_preds += fold_model.predict(te_d) / n_splits

        gc.collect()

    fold_scores = np.array(fold_scores)
    print(f" MAPE mean: {fold_scores.mean():.7f} (+- {fold_scores.std():.7f})")

    return test_preds


%%time

# CatBoost run, seeded with 2375.
SEED = 2375
cat = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.01,
    random_state=SEED,
    verbose=0,
)
cat_test_preds = cross_validate(cat)

# XGBoost run; SEED is rebound to 1023 and stays at that value afterwards.
SEED = 1023
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=SEED,
    verbosity=0,
)
xgb_test_preds = cross_validate(xgb)


%%time

# LightGBM run with default hyper-parameters; SEED is whatever value was
# assigned last before this cell runs.
lgb = LGBMRegressor(random_state=SEED, verbose=-1)
lgb_test_preds = cross_validate(lgb)


%%time 

# Blend weights for the three models' averaged test predictions.
weights = {
    'cat_test_preds': 0.45,
    'lgb_test_preds': 0.45,
    'xgb_test_preds': 0.1,
}

cat_test_preds_weighted = cat_test_preds * weights['cat_test_preds']
lgb_test_preds_weighted = lgb_test_preds * weights['lgb_test_preds']
xgb_test_preds_weighted = xgb_test_preds * weights['xgb_test_preds']

# Weighted sum of the three prediction vectors.
ensemble_preds = cat_test_preds_weighted + lgb_test_preds_weighted + xgb_test_preds_weighted

# Write the blended predictions into the sample submission frame; its 'id'
# column is left exactly as loaded (the former self-assignment was a no-op).
d_s['orders'] = ensemble_preds

d_s.to_csv('Submission.csv', index=False)

print(d_s.head())


原文地址:https://blog.csdn.net/qq_28611929/article/details/140541838

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!