2.2.1. Custom Transforms Function#

2.2.1.1. FunctionTransformer#

from sklearn.preprocessing import FunctionTransformer

# remove columns with few unique values
def cust_transform(X, min_values=3):
    """Remove object-dtype columns that have few unique values.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    min_values : int, default=3
        An object-dtype column is removed when its number of unique
        values is strictly below this threshold.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the low-cardinality object columns. Columns of any
        other dtype are always kept.
    """
    is_object = X.dtypes == 'object'
    # Computed over every column so both masks share the same index and
    # can be combined without relying on index alignment.
    few_unique = X.nunique() < min_values
    # Negate the mask: the matching columns are the ones to remove.
    return X.loc[:, ~(is_object & few_unique)]
    
# wrap the function in a transformer; kw_args supplies min_values=5 to cust_transform
trans = FunctionTransformer(func=cust_transform, kw_args=dict(min_values=5))

# apply the transform to the training split, then to the test split
X_train = trans.fit_transform(X_train)
X_test = trans.transform(X_test)
# or use decorator
from sklearn.preprocessing import FunctionTransformer

@FunctionTransformer
def cust_transform(X, min_values=3):
    """Remove object-dtype columns that have few unique values.

    The decorator rebinds ``cust_transform`` to a ``FunctionTransformer``
    wrapping this function.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    min_values : int, default=3
        An object-dtype column is removed when its number of unique
        values is strictly below this threshold.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the low-cardinality object columns. Columns of any
        other dtype are always kept.
    """
    is_object = X.dtypes == 'object'
    # Computed over every column so both masks share the same index and
    # can be combined without relying on index alignment.
    few_unique = X.nunique() < min_values
    # Negate the mask: the matching columns are the ones to remove.
    return X.loc[:, ~(is_object & few_unique)]

# apply the transform: fit_transform on the training split, transform on the test split
X_train = cust_transform.fit_transform(X_train)
X_test = cust_transform.transform(X_test)
# in pipeline
# NOTE(review): ColumnTransformer expects (name, transformer, columns) triples;
# the "cust_transform" entry below has no columns selector -- confirm which
# columns it is meant to receive.
col_trans = ColumnTransformer(
    [
        ("cust_transform", cust_transform, ),
        ("label encoding", OrdinalEncoder(), ["country", "store", "product"]),
    ]
)

# chain the column preprocessing and the regressor into one estimator
pipe = Pipeline([("preprocessing", col_trans), ("regression", LinearRegression())])

2.2.1.2. Class transformation#

These are the 4 things I always set up when creating a custom transformer:

  • Inherit from BaseEstimator & TransformerMixin to get the get_params and fit_transform methods. Personally, I always use fit_transform when I need a quick view of the output.

  • Define a self.variables attribute so that you can easily select which columns the transformer is applied to later.

  • Define the fit method. If the transformation doesn’t require fitting, just create a dummy fit method that returns self.

  • Define the transform method. It converts the original dataset into the modified dataset according to your transformation.

# import packages
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

Example 1: Custom transformer without requiring fit method

# Example 1: Custom transformer without requiring fit method

class DropFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that removes the columns listed in ``variables``.

    Nothing is learned from the data, so ``fit`` only returns ``self``.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Return the transformer unchanged; there is nothing to fit."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        The labels of the remaining columns are stored on ``self.columns``.
        """
        remaining = X.drop(columns=self.variables)
        self.columns = remaining.columns
        return remaining

Example 2: Custom transformer requiring fit method

# Example 2: Custom transformer requiring fit method

class OneHotEncodercustom(BaseEstimator, TransformerMixin):
    """One-hot encode the columns in ``variables`` and return a DataFrame.

    The encoded columns replace the original ones and are appended after
    the untouched columns. The input frame passed to ``transform`` is left
    unmodified, so the same frame can be transformed repeatedly.

    Parameters
    ----------
    variables : list of str
        Names of the columns to encode.
    """

    def __init__(self, variables):
        self.variables = variables
        self.ohe = OneHotEncoder(drop='first', handle_unknown = 'ignore')

    def fit(self, X, y=None):
        """Fit the wrapped encoder on the selected columns and return ``self``."""
        self.ohe.fit(X.loc[:, self.variables])
        return self

    def transform(self, X):
        """Return a new frame with ``variables`` replaced by their one-hot columns."""
        # Densify the encoder output so it can be assigned as columns.
        encoded = self.ohe.transform(X.loc[:, self.variables]).toarray()
        feature_names = list(self.ohe.get_feature_names_out())

        # drop() without inplace returns a new frame, so the caller's X is
        # never mutated.
        X_out = X.drop(columns=self.variables)
        X_out[feature_names] = encoded
        return X_out
# create custom transformer

class SimpleImputerCustom(BaseEstimator, TransformerMixin):
    """Wrap sklearn.impute.SimpleImputer so that it returns a DataFrame
    instead of an np.array.

    The imputed columns replace the original ones and are appended after
    the untouched columns. The input frame passed to ``transform`` is left
    unmodified.

    Parameters
    ----------
    variables : list of str
        Names of the columns to impute.
    strategy : str
        Passed to ``SimpleImputer`` as its ``strategy``.
    """

    def __init__(self, variables, strategy):
        self.variables = variables
        self.strategy = strategy
        # Kept so the ``imp`` attribute exists before fitting; fit() rebuilds it.
        self.imp = SimpleImputer(missing_values=np.nan,
                    strategy=self.strategy)

    def fit(self, X, y=None):
        """Fit a fresh imputer on the selected columns and return ``self``."""
        # Rebuild from the current attribute so a strategy changed after
        # construction (e.g. via set_params) is actually used.
        self.imp = SimpleImputer(missing_values=np.nan,
                    strategy=self.strategy)
        self.imp.fit(X.loc[:, self.variables])
        return self

    def transform(self, X):
        """Return a new frame with ``variables`` replaced by their imputed values."""
        imputed = self.imp.transform(X.loc[:, self.variables])
        # drop() without inplace returns a new frame, so the caller's X is
        # never mutated.
        X_out = X.drop(columns=self.variables)
        X_out[self.variables] = imputed
        return X_out


class DomainNumFE(BaseEstimator, TransformerMixin):
    """Feature engineering technique for numerical features based on domain knowledge.

    Adds three 0/1 indicator columns (``HasWoodDeck``, ``HasOpenPorch``,
    ``HasEnclosedPorch``) derived from the corresponding area columns.

    Parameters
    ----------
    variables : optional
        Stored on the instance; not used by ``transform``.
    """

    def __init__(self, variables = None):
        self.variables = variables

    def fit(self, X, y =None):
        """Return the transformer unchanged; there is nothing to fit."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the indicator columns added."""
        # source: https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition#Feature-Engineering
        X_ = X.copy()
        # A "Has*" flag is 1 when the area is positive; the previous `== 0`
        # test flagged the absence of the feature instead.
        X_['HasWoodDeck'] = (X_['WoodDeckSF'] > 0) * 1
        X_['HasOpenPorch'] = (X_['OpenPorchSF'] > 0) * 1
        X_['HasEnclosedPorch'] = (X_['EnclosedPorch'] > 0) * 1
        return X_
# custom columns selection by AUC score
from sklearn.metrics import roc_auc_score

class AucSelection(BaseEstimator, TransformerMixin):
    """Drop features whose single-variable AUC against the target is below a threshold.

    Parameters
    ----------
    remain_threshold : float, default=0.501
        A feature is kept when its AUC is at least this value. Must be in
        the interval (0.5, 1].
    nan_strategy : str, default='remain'
        How to treat features whose AUC could not be computed (NaN):
        ``'remain'`` keeps them, any other value drops them.

    Attributes
    ----------
    auc : pandas.Series
        AUC per column, set by ``fit``.
    droped_index_ : pandas.Series of bool
        True for each column that is dropped, set by ``fit``.
    droped_feature_ : pandas.Index
        Labels of the dropped columns, set by ``fit``.
    """

    def __init__(self, remain_threshold = 0.501, nan_strategy = 'remain'):
        assert (remain_threshold > 0.5) and (remain_threshold <= 1)
        self.remain_threshold = remain_threshold
        self.nan_strategy = nan_strategy
        self.droped_feature_ = None
        self.droped_index_ = None

    def fit(self, X, y = None):
        """Compute the AUC of every column against ``y`` and mark the columns to drop."""
        self.auc = X.apply(lambda c: self.get_auc(y, c))
        # NaN AUCs are filled with the threshold (kept) or with 0.5 (dropped).
        fill_value = self.remain_threshold if self.nan_strategy == 'remain' else 0.5
        auc_nan = self.auc.fillna(fill_value)
        self.droped_index_ = (auc_nan < self.remain_threshold)
        self.droped_feature_ = X.columns[self.droped_index_]
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns that were not marked for dropping.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if self.droped_index_ is None:
            raise ValueError("AucSelection is not be fitted" )
        return X.loc[:, ~self.droped_index_]

    @staticmethod
    def get_auc(y, var, flexible_sign=True):
        """Return the AUC obtained by using ``var`` as the score for target ``y``.

        AUC reflects how predictive the variable is for ``y``; a value close
        to 0.5 (random guess) means ``var`` cannot explain ``y``.

        Declared as a static method so that ``self.get_auc(y, c)`` receives
        ``y`` and ``c`` in the intended positions.

        Parameters
        ----------
        y : array-like
            Target labels, indexable with a boolean mask.
        var : pandas.Series
            Candidate feature.
        flexible_sign : bool, default=True
            When True, an AUC below 0.5 is reflected to ``1 - auc``.

        Returns
        -------
        float
            The AUC, 0.5 for a constant feature, or NaN when the AUC cannot
            be computed (e.g. categorical data).
        """
        try: # numeric data
            nan_idx = np.isnan(np.array(var)) # filter NaN
            var_ = var[~nan_idx]
            y_ = y[~nan_idx]
            # a feature with zero spread is scored as 0.5 without calling roc_auc_score
            auc = roc_auc_score(y_score=var_, y_true=y_) if (var_.std() > 0) else 0.5
            # for evaluation only
            if flexible_sign and auc < 0.5:
                auc = 1.0 - auc
            return auc
        except (TypeError, ValueError): # categorical / not scorable
            return np.nan
    
# fit the selector on the training split (it needs y_train for the AUC),
# then apply the same column selection to the test split
auc_selector = AucSelection(remain_threshold = 0.502)
X_train = auc_selector.fit_transform(X_train, y_train)
X_test = auc_selector.transform(X_test)

2.2.1.3. Custom make columns selector#

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

class custom_column_seletor(make_column_selector):
    """Column selector with an extra filtering step and always-included columns.

    Columns are first selected by dtype and name pattern, then passed
    through ``filter_columns``; the names in ``add_vars`` are appended to
    the result.

    Parameters
    ----------
    pattern : str, optional
        Regex that column names must contain.
    dtype_include, dtype_exclude : optional
        Passed to ``DataFrame.select_dtypes``.
    add_vars : list of str, optional
        Column names that are always part of the selection.
    """

    def __init__(self, pattern=None, dtype_include=None, dtype_exclude=None, add_vars=None):
        # The parent declares the dtype arguments as keyword-only, so they
        # must be forwarded by name.
        super().__init__(pattern, dtype_include=dtype_include, dtype_exclude=dtype_exclude)
        # None instead of a mutable default; each instance gets its own list.
        self.add_vars = [] if add_vars is None else list(add_vars)

    def __call__(self, df):
        """Return the list of selected column names for ``df``.

        Raises
        ------
        ValueError
            If ``df`` is not a pandas DataFrame (has no ``iloc``).
        """
        if not hasattr(df, "iloc"):
            raise ValueError(
                "make_column_selector can only be applied to pandas dataframes"
            )
        # One row is enough to resolve the dtypes.
        df_row = df.iloc[:1]
        if self.dtype_include is not None or self.dtype_exclude is not None:
            df_row = df_row.select_dtypes(
                include=self.dtype_include, exclude=self.dtype_exclude
            )
        cols = df_row.columns
        if self.pattern is not None:
            cols = cols[cols.str.contains(self.pattern, regex=True)]

        selected = self.filter_columns(df[cols.tolist()].copy()).columns.tolist()
        # dict.fromkeys removes duplicates while keeping a deterministic order:
        # filtered columns first, then the extra ones.
        return list(dict.fromkeys([*selected, *self.add_vars]))

    @staticmethod
    def filter_columns(df):
        """
        make custom process to get specific columns

        The example implementation keeps the columns with fewer than 30
        unique values.
        """
        # example
        nuni = df.nunique() < 30
        return df.loc[:, nuni]
    
# apply cust_transform to the float columns picked by the custom selector,
# plus the always-included "store" and "product" columns
col_trans = ColumnTransformer(
    transformers=[
        (
            "cust_transform",
            cust_transform,
            custom_column_seletor(add_vars=["store", "product"], dtype_include=float),
        ),
    ],
)
# make_column_transformer takes (transformer, columns) pairs and generates
# the step names itself, so no explicit names are passed here.
col_trans = make_column_transformer(
        (cust_transform, custom_column_seletor(dtype_include=float, add_vars=["store", "product"])),
        (cust_transform2, custom_column_seletor(dtype_exclude=np.number, add_vars=["product"])),
)