Custom Transforms Function

2.2.1. Custom Transforms Function#

2.2.1.1. FunctionTransformer#

from sklearn.preprocessing import FunctionTransformer

# remove object-dtype columns with few unique values
def cust_transform(X, min_values=3):
    """Drop object-dtype columns that have fewer than ``min_values`` unique values.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. Only columns whose dtype is ``object`` are candidates
        for removal.
    min_values : int, default=3
        Object columns with strictly fewer distinct values than this are
        removed.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the low-cardinality object columns; every other column
        is kept, in its original order.
    """
    is_object = X.dtypes == 'object'
    few_unique = X.loc[:, is_object].nunique() < min_values
    # ``few_unique`` is indexed by the object columns only; expand it to all
    # columns so that non-object columns are never flagged for removal.
    drop_mask = few_unique.reindex(X.columns, fill_value=False).astype(bool)
    return X.loc[:, ~drop_mask]
    
# Wrap the function in a transformer; kw_args forwards min_values=5 to it
trans = FunctionTransformer(func=cust_transform, kw_args=dict(min_values=5))

# Run the transformer over the training split, then over the test split
X_train = trans.fit(X_train).transform(X_train)
X_test = trans.transform(X_test)

# or use decorator
from sklearn.preprocessing import FunctionTransformer

@FunctionTransformer
def cust_transform(X, min_values=3):
    """Drop object-dtype columns that have fewer than ``min_values`` unique values.

    Because of the decorator, the name ``cust_transform`` ends up bound to a
    ``FunctionTransformer`` wrapping this function rather than to the plain
    function.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. Only columns whose dtype is ``object`` are candidates
        for removal.
    min_values : int, default=3
        Object columns with strictly fewer distinct values than this are
        removed.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the low-cardinality object columns; every other column
        is kept, in its original order.
    """
    is_object = X.dtypes == 'object'
    few_unique = X.loc[:, is_object].nunique() < min_values
    # ``few_unique`` is indexed by the object columns only; expand it to all
    # columns so that non-object columns are never flagged for removal.
    drop_mask = few_unique.reindex(X.columns, fill_value=False).astype(bool)
    return X.loc[:, ~drop_mask]

# Apply the decorated transformer to the training and test splits
X_train = cust_transform.fit_transform(X_train)
X_test = cust_transform.transform(X_test)

# Use the transformer as one step of a ColumnTransformer inside a Pipeline
col_trans = ColumnTransformer(
    [
        # NOTE(review): unlike the entry below, this tuple has no third
        # element selecting columns -- confirm which columns cust_transform
        # is meant to receive.
        ("cust_transform", cust_transform, ),
        # Ordinal-encode the three listed columns
        ("label encoding", OrdinalEncoder(), ["country", "store", "product"]),
    ]
)

# Chain the column preprocessing with a linear regression model
pipe = Pipeline([("preprocessing", col_trans), ("regression", LinearRegression())])

2.2.1.2. Class transformation#

These are the four things you always need when creating a custom transformer:

Inherit from BaseEstimator and TransformerMixin to get the get_params and fit_transform methods. Personally, I always use fit_transform when I need a quick view of the output.
Define a self.variables attribute so that you can easily select which columns the transformer is applied to later.
Define the fit method. If the transformation doesn’t require fitting, just create a dummy fit method that returns self.
Define the transform method. This is used to turn the original dataset into the modified dataset according to your transformation logic.

# import packages
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

Example 1: Custom transformer without requiring fit method

# Example 1: Custom transformer without requiring fit method

class DropFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that removes the configured columns from a DataFrame.

    Parameters
    ----------
    variables : label or list of labels
        Column(s) passed to ``DataFrame.drop`` in ``transform``.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; return the instance itself."""
        return self

    def transform(self, X):
        """Return ``X`` without ``self.variables`` and remember the kept columns."""
        remaining = X.drop(columns=self.variables)
        # Keep the surviving column labels available on the instance.
        self.columns = remaining.columns
        return remaining

Example 2: Custom transformer requiring fit method

# Example 2: Custom transformer requiring fit method

class OneHotEncodercustom(BaseEstimator, TransformerMixin):
    """One-hot encode selected columns and return the result as a DataFrame.

    The columns listed in ``variables`` are removed and their one-hot encoded
    counterparts are appended after the remaining columns.

    Parameters
    ----------
    variables : list of str
        Names of the columns to encode.
    """

    def __init__(self, variables):
        self.variables = variables
        self.ohe = OneHotEncoder(drop='first', handle_unknown='ignore')

    def fit(self, X, y=None):
        """Fit the wrapped encoder on the selected columns of ``X``."""
        self.ohe.fit(X.loc[:, self.variables])
        return self

    def transform(self, X):
        """Return a new frame with the selected columns one-hot encoded.

        ``X`` itself is not modified, so ``transform`` can be called more
        than once on the same DataFrame.
        """
        encoded = self.ohe.transform(X.loc[:, self.variables]).toarray()
        feature_names = self.ohe.get_feature_names_out()

        # ``drop`` without ``inplace`` returns a new frame, leaving the
        # caller's DataFrame with its original columns.
        X_out = X.drop(self.variables, axis=1)

        # Assign the raw array so values are placed positionally, whatever
        # the index of ``X`` looks like.
        X_out[feature_names] = encoded
        return X_out

# create custom transformer

class SimpleImputerCustom(BaseEstimator, TransformerMixin):
    """Wrap ``sklearn.impute.SimpleImputer`` so that it returns a DataFrame
    instead of a NumPy array.

    The columns listed in ``variables`` are removed and their imputed
    versions are appended after the remaining columns.

    Parameters
    ----------
    variables : list of str
        Names of the columns to impute.
    strategy : str
        Imputation strategy forwarded to ``SimpleImputer``.
    """

    def __init__(self, variables, strategy):
        self.variables = variables
        self.strategy = strategy
        self.imp = SimpleImputer(missing_values=np.nan,
                                 strategy=self.strategy)

    def fit(self, X, y=None):
        """Fit the wrapped imputer on the selected columns of ``X``."""
        # ``strategy`` may have been changed through ``set_params`` after
        # construction; push the current value to the inner imputer.
        self.imp.set_params(strategy=self.strategy)
        self.imp.fit(X.loc[:, self.variables])
        return self

    def transform(self, X):
        """Return a new frame with the selected columns imputed.

        ``X`` itself is not modified, so ``transform`` can be called more
        than once on the same DataFrame.
        """
        imputed = self.imp.transform(X.loc[:, self.variables])

        # ``drop`` without ``inplace`` returns a new frame, leaving the
        # caller's DataFrame with its original columns.
        X_out = X.drop(self.variables, axis=1)

        # Assign the raw array so values are placed positionally, whatever
        # the index of ``X`` looks like.
        X_out[self.variables] = imputed
        return X_out


class DomainNumFE(BaseEstimator, TransformerMixin):
    """Feature engineering for numerical features based on domain knowledge.

    Adds three 0/1 indicator columns telling whether the corresponding
    area feature is present (greater than zero).

    Parameters
    ----------
    variables : optional
        Stored on the instance; not read by ``fit`` or ``transform``.
    """

    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; return the instance itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``Has*`` indicator columns added."""
        # source: https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition#Feature-Engineering
        X_ = X.copy()
        # A "Has..." flag must be 1 when the area is positive, not when it
        # is zero.
        X_['HasWoodDeck'] = (X_['WoodDeckSF'] > 0) * 1
        X_['HasOpenPorch'] = (X_['OpenPorchSF'] > 0) * 1
        X_['HasEnclosedPorch'] = (X_['EnclosedPorch'] > 0) * 1
        return X_

# custom columns selection by AUC score
from sklearn.metrics import roc_auc_score

class AucSelection(BaseEstimator, TransformerMixin):
    """Drop features whose single-feature AUC against ``y`` is too low.

    Parameters
    ----------
    remain_threshold : float, default=0.501
        Features with an AUC strictly below this value are dropped. Must be
        in the interval (0.5, 1].
    nan_strategy : str, default='remain'
        What to do with features whose AUC cannot be computed: ``'remain'``
        keeps them, any other value drops them.

    Attributes
    ----------
    auc : per-column AUC computed in ``fit`` (NaN where not computable).
    droped_index_ : boolean mask over the fitted columns, True = dropped.
    droped_feature_ : labels of the dropped columns.
    """

    def __init__(self, remain_threshold=0.501, nan_strategy='remain'):
        assert (remain_threshold > 0.5) and (remain_threshold <= 1)
        self.remain_threshold = remain_threshold
        self.nan_strategy = nan_strategy
        self.droped_feature_ = None
        self.droped_index_ = None

    def fit(self, X, y=None):
        """Compute the AUC of every column of ``X`` and mark the ones to drop."""
        self.auc = X.apply(lambda col: self.get_auc(y, col))
        # A missing AUC is replaced by the threshold itself (never below it,
        # so the column stays) or by 0.5 (always below it, so it is dropped).
        nan_fill = self.remain_threshold if self.nan_strategy == 'remain' else 0.5
        auc_filled = self.auc.fillna(nan_fill)
        self.droped_index_ = (auc_filled < self.remain_threshold)
        self.droped_feature_ = X.columns[self.droped_index_]
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns that were not dropped in ``fit``."""
        if self.droped_index_ is None:
            raise ValueError("AucSelection is not be fitted" )
        X_ = X.loc[:, ~self.droped_index_]
        return X_

    @staticmethod
    def get_auc(y, var, flexible_sign=True):
        """Return the AUC obtained when ``var`` is used as the score for ``y``.

        AUC expresses how well the variable predicts ``y``; a value close to
        0.5 (random guess) means ``var`` has no ability to explain ``y``.

        Declared as a static method so that ``self.get_auc(y, col)`` passes
        ``y`` and ``col`` to the intended parameters.

        Parameters
        ----------
        y : array-like
            Binary target, positionally aligned with ``var``.
        var : pandas.Series
            Candidate feature.
        flexible_sign : bool, default=True
            If True, an AUC below 0.5 is mirrored to ``1 - auc``.

        Returns
        -------
        float
            The AUC, 0.5 for a constant feature, or NaN when it cannot be
            computed (for example for categorical data).
        """
        try:  # numeric data
            nan_idx = np.isnan(np.array(var))  # filter NaN
            var_ = var[~nan_idx]
            # Index the target positionally so its own index labels do not matter.
            y_ = np.asarray(y)[~nan_idx]
            # A constant feature cannot rank anything: treat it as random.
            auc = roc_auc_score(y_score=var_, y_true=y_) if (var_.std() > 0) else 0.5
        except (TypeError, ValueError, IndexError):  # categorical / not computable
            return np.nan
        # for evaluation only
        if flexible_sign and auc < 0.5:
            auc = 1.0 - auc
        return auc
    
# Fit the selector on the training split, then reuse the fitted mask on the test split
auc_selector = AucSelection(remain_threshold=0.502)
X_train = auc_selector.fit(X_train, y_train).transform(X_train)
X_test = auc_selector.transform(X_test)

2.2.1.3. Custom make columns selector#

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

class custom_column_seletor(make_column_selector):
    """Column selector with an extra custom filter and always-included columns.

    Columns are first selected by dtype and name pattern, then narrowed by
    ``filter_columns``, and finally the labels in ``add_vars`` are appended.

    Parameters
    ----------
    pattern : str, default=None
        Regex that selected column names must contain.
    dtype_include, dtype_exclude : default=None
        Passed to ``DataFrame.select_dtypes``.
    add_vars : list, default=None
        Column labels that are always part of the result.
    """

    def __init__(self, pattern=None, dtype_include=None, dtype_exclude=None, add_vars=None):
        # The dtype arguments are keyword-only in make_column_selector.
        super().__init__(pattern, dtype_include=dtype_include, dtype_exclude=dtype_exclude)
        # Avoid sharing one mutable default list between instances.
        self.add_vars = [] if add_vars is None else list(add_vars)

    def __call__(self, df):
        """Return the list of selected column labels for ``df``."""
        if not hasattr(df, "iloc"):
            raise ValueError(
                "make_column_selector can only be applied to pandas dataframes"
            )
        df_row = df.iloc[:1]
        if self.dtype_include is not None or self.dtype_exclude is not None:
            df_row = df_row.select_dtypes(
                include=self.dtype_include, exclude=self.dtype_exclude
            )
        cols = df_row.columns
        if self.pattern is not None:
            cols = cols[cols.str.contains(self.pattern, regex=True)]

        df_ = df[cols.tolist()].copy()
        df_ = self.filter_columns(df_)
        # De-duplicate while keeping a stable order: filtered columns first,
        # then any additional variables not already present.
        return list(dict.fromkeys([*df_.columns.tolist(), *self.add_vars]))

    @staticmethod
    def filter_columns(df):
        """
        make custom process to get specific columns
        """
        # example: keep columns with fewer than 30 distinct values
        nuni = df.nunique() < 30
        return df.loc[:, nuni]
    

# ColumnTransformer takes explicit (name, transformer, columns) triples
col_trans = ColumnTransformer(
    [
        ("cust_transform", cust_transform, custom_column_seletor(dtype_include=float, add_vars=["store", "product"]) ),
    ]
)

# make_column_transformer names the steps itself, so it takes
# (transformer, columns) pairs rather than named triples.
# NOTE(review): cust_transform2 is not defined in this section -- presumably
# another transformer; confirm where it comes from.
col_trans = make_column_transformer(
        (cust_transform, custom_column_seletor(dtype_include=float, add_vars=["store", "product"]) ),
        (cust_transform2, custom_column_seletor(dtype_exclude=np.number, add_vars=[ "product"]) ),
)

Custom Transforms Function

Contents

2.2.1. Custom Transforms Function#

2.2.1.1. FunctionTransformer#

2.2.1.2. Class transformation#

2.2.1.3. Custom make columns selector#