自学内容网

sklearn pipeline

示例代码

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import scipy.linalg
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
import scipy.linalg
from sklearn.linear_model import BayesianRidge
import pandas as pd
from sklearn.model_selection import LeaveOneOut, cross_val_score

class EmscScaler(object):
    """EMSC-style spectral correction against a mean reference spectrum.

    ``fit`` stores the column-wise mean of the reference spectra. ``transform``
    regresses every spectrum on that mean plus a polynomial baseline and
    returns ``residual / b[0] + mean``, where ``b[0]`` is the coefficient of
    the mean spectrum.

    Parameters
    ----------
    order : int, default 1
        Highest power of the baseline polynomial (``1, t, ..., t**order`` with
        ``t`` running from 0 to 1 across the spectrum). ``order <= 0`` fits the
        reference spectrum only, with no baseline terms.
    """

    def __init__(self, order=1):
        self.order = order
        self._mx = None

    def _design_matrix(self, x):
        """Return the regression design matrix ``[x, 1, t, ..., t**order]``."""
        reference = np.asarray(x, dtype=float).reshape(-1, 1)
        if self.order <= 0:
            # Keep the matrix 2-D so the pseudo-inverse is well defined.
            return reference
        # linspace also copes with a single-point spectrum, where a hand-built
        # step of 1 / (n - 1) would divide by zero.
        t = np.linspace(0.0, 1.0, reference.shape[0])
        # Columns t**0 .. t**order: one constant column and every power up to
        # and including ``order``.
        baseline = np.vander(t, self.order + 1, increasing=True)
        return np.hstack((reference, baseline))

    def mlr(self, x, y):
        """Least-squares fit of ``y`` on ``x`` plus the polynomial baseline.

        Parameters
        ----------
        x : array-like of length m
            Reference spectrum (first regressor).
        y : array-like of length m
            Spectrum to be explained.

        Returns
        -------
        b : fit coefficients; ``b[0]`` belongs to ``x``, the remaining entries
            to the baseline powers in increasing order.
        f : fitted values ``design @ b``.
        r : residual ``y - f``.
        """
        design = self._design_matrix(x)
        y = np.asarray(y, dtype=float)
        # pinv(design) equals pinv(design.T @ design) @ design.T without
        # forming the normal equations explicitly.
        b = np.dot(np.linalg.pinv(design), y)
        f = np.dot(design, b)
        r = y - f
        return b, f, r

    def fit(self, X, y=None):
        """Store the mean spectrum of ``X``; ``y`` is ignored.

        Returns ``self`` so calls can be chained.
        """
        self._mx = np.mean(np.asarray(X, dtype=float), axis=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Correct every row of ``X`` against the fitted mean spectrum.

        ``y`` and ``copy`` are accepted but not used.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        ValueError
            If ``X`` is not 2-D or its width differs from the fitted spectrum.
        """
        if self._mx is None:
            raise RuntimeError(
                "EMSC not fit yet. run .fit method on reference spectra"
            )
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self._mx.shape[0]:
            raise ValueError(
                "X must be 2-D with %d columns, got shape %r"
                % (self._mx.shape[0], X.shape)
            )
        # The design matrix depends only on the reference spectrum, so it is
        # built and inverted once and applied to all spectra together.
        design = self._design_matrix(self._mx)
        coefs = np.dot(np.linalg.pinv(design), X.T)   # (n_terms, n_samples)
        residual = X - np.dot(design, coefs).T        # (n_samples, n_points)
        return residual / coefs[0][:, np.newaxis] + self._mx

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the corrected ``X``."""
        return self.fit(X).transform(X)



from sklearn.base import BaseEstimator, TransformerMixin
class SpectraPreprocessor(BaseEstimator, TransformerMixin):
    """Apply an independent ``EmscScaler`` to fixed column segments of spectra.

    Parameters
    ----------
    emsc_order : int, default 3
        ``order`` passed to every per-segment ``EmscScaler``.
    X_ref : 2-D array or None, default None
        Reference spectra used to fit the scalers. When ``None`` the data
        passed to ``fit`` is used as the reference.
    """

    # Half-open (start, end) column ranges, one EmscScaler per range.
    # NOTE(review): columns 251-280 and everything from column 854 onward fall
    # outside every range and are therefore dropped from the output — confirm
    # this is intentional.
    _SEGMENT_RANGES = ((0, 251), (281, 482), (482, 683), (683, 854))

    def __init__(self, emsc_order=3, X_ref=None):
        self.emsc_order = emsc_order
        self.emsc_scalers = [
            EmscScaler(order=emsc_order) for _ in self._SEGMENT_RANGES
        ]
        self.X_ref = X_ref

    def _as_checked_array(self, X, label):
        """Return ``X`` as a 2-D array wide enough to cover every segment."""
        X = np.asarray(X)
        needed = max(end for _, end in self._SEGMENT_RANGES)
        if X.ndim != 2 or X.shape[1] < needed:
            # Without this check the slices below would silently come back
            # shorter than intended (or empty).
            raise ValueError(
                "%s must be 2-D with at least %d columns, got shape %r"
                % (label, needed, X.shape)
            )
        return X

    def fit(self, X, y=None):
        """Fit one scaler per segment on ``X_ref`` (or on ``X`` if it is None).

        ``y`` is ignored. Returns ``self``.
        """
        if self.X_ref is None:
            reference = self._as_checked_array(X, "X")
        else:
            reference = self._as_checked_array(self.X_ref, "X_ref")

        # Rebuild the scalers here so that a later set_params(emsc_order=...)
        # actually takes effect; the ones made in __init__ keep the old order.
        self.emsc_scalers = [
            EmscScaler(order=self.emsc_order) for _ in self._SEGMENT_RANGES
        ]
        for scaler, (start, end) in zip(self.emsc_scalers, self._SEGMENT_RANGES):
            scaler.fit(reference[:, start:end])
        return self

    def transform(self, X, y=None):
        """Correct each segment of ``X`` and concatenate the results column-wise.

        ``y`` is ignored.
        """
        X = self._as_checked_array(X, "X")
        corrected = [
            scaler.transform(X[:, start:end])
            for scaler, (start, end) in zip(self.emsc_scalers, self._SEGMENT_RANGES)
        ]
        return np.concatenate(corrected, axis=1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (or ``X_ref``) and return the transformed ``X``."""
        return self.fit(X, y).transform(X)
    


def bayesian_ridge_optuna_for_emsc_data(x_train, y_train, pipeline_,
                                        n_trials=500, cv=10):
    """Tune the BayesianRidge hyper-parameters of ``pipeline_`` with Optuna.

    Each trial samples ``alpha_1``, ``alpha_2``, ``lambda_1`` and ``lambda_2``
    log-uniformly from [0.001, 1] and scores the pipeline by its mean
    cross-validated R^2.

    Parameters
    ----------
    x_train, y_train
        Training data handed to ``cross_val_score``.
    pipeline_
        Pipeline containing a step named ``'bayesian_ridge'``. It is cloned
        for every trial and is not modified.
    n_trials : int, default 500
        Number of Optuna trials.
    cv : default 10
        ``cv`` argument forwarded to ``cross_val_score``.

    Returns
    -------
    dict
        ``study.best_params``, keyed by the four parameter names above.
    """
    # Local import: the surrounding file does not import ``clone``.
    from sklearn.base import clone

    param_names = ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')

    def objective(trial):
        params = {
            'bayesian_ridge__' + name: trial.suggest_float(name, 0.001, 1, log=True)
            for name in param_names
        }
        # Work on a clone so the caller's pipeline keeps its own settings.
        # set_params stays outside the try block so that a mis-named step
        # fails loudly instead of producing n_trials silent -inf scores.
        model = clone(pipeline_).set_params(**params)
        try:
            # cross_val_score fits its own clones per fold, so no separate
            # full-data fit is needed. error_score='raise' lets a failing fold
            # reach the except clause below.
            scores = cross_val_score(model, x_train, y_train, cv=cv,
                                     n_jobs=-1, scoring='r2',
                                     error_score='raise')
        except ValueError:
            # Treat a numerically invalid configuration as the worst score.
            return -np.inf
        return np.mean(scores)

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    # NOTE: the objective never reports intermediate values, so this pruner
    # has nothing to act on; it is kept to leave the study setup unchanged.
    pruner = optuna.pruners.MedianPruner()
    study = optuna.create_study(direction="maximize", pruner=pruner)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True, n_jobs=1)
    return study.best_params


def getdata(filenamex, filenamey):
    """Load two CSV files and join them side by side.

    ``filenamex`` is read without a header row (columns are numbered), while
    ``filenamey`` is read with pandas' default header handling. The two frames
    are concatenated column-wise and returned as one DataFrame.
    """
    frames = [
        pd.read_csv(filenamex, header=None),
        pd.read_csv(filenamey),
    ]
    return pd.concat(frames, axis=1)

    
name = 'test'

# Random stand-in data: 100 training spectra with 884 columns, 100 targets,
# and 30 reference spectra of the same width.
x = np.random.rand(100, 884)
y = np.random.rand(100)
x_ref = np.random.rand(30, 884)

# Preprocess -> standardise -> Bayesian ridge regression.
pipeline = Pipeline([
    ('preprocessor', SpectraPreprocessor(emsc_order=3, X_ref=None)),
    ('scaler', StandardScaler()),
    ('bayesian_ridge', BayesianRidge()),
])

# Hand the reference spectra to the preprocessor through the pipeline API.
pipeline.set_params(preprocessor__X_ref=x_ref)

# Hyper-parameter search.
best_params = bayesian_ridge_optuna_for_emsc_data(x, y, pipeline)

# Apply the best hyper-parameters, refit on all data and predict.
pipeline.set_params(**{
    'bayesian_ridge__' + key: best_params[key]
    for key in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
})
pipeline.fit(x, y)
y_pred = pipeline.predict(x)
print(y_pred)

原文地址:https://blog.csdn.net/qq_41685627/article/details/142882227

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!