2.2.1. Custom Transform Functions#
2.2.1.1. FunctionTransformer#
from sklearn.preprocessing import FunctionTransformer

# keep only object (categorical) columns with fewer than `min_values` unique values
def cust_transform(X, min_values=3):
    X_obj = (X.dtypes == 'object')
    X_few_unique = X.loc[:, X_obj].nunique() < min_values
    return X.loc[:, (X_obj & X_few_unique)]

# define the transformer
trans = FunctionTransformer(cust_transform, kw_args={'min_values': 5})

# apply the transform
X_train = trans.fit_transform(X_train)
X_test = trans.transform(X_test)
# or use decorator
from sklearn.preprocessing import FunctionTransformer

@FunctionTransformer
def cust_transform(X, min_values=3):
    X_obj = (X.dtypes == 'object')
    X_few_unique = X.loc[:, X_obj].nunique() < min_values
    return X.loc[:, (X_obj & X_few_unique)]

# apply the transform
X_train = cust_transform.fit_transform(X_train)
X_test = cust_transform.transform(X_test)
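With the decorator form no keyword arguments are passed at construction time, so min_values stays at its default of 3. A small sketch (my own addition): since the decorated object is a regular FunctionTransformer, kw_args can still be updated afterwards via set_params.
# update the forwarded keyword arguments on the decorated transformer
cust_transform.set_params(kw_args={'min_values': 5})
X_train = cust_transform.fit_transform(X_train)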
# in a pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

col_trans = ColumnTransformer(
    [
        # the column selection here is illustrative: feed the object columns to cust_transform
        ("cust_transform", cust_transform, make_column_selector(dtype_include='object')),
        ("label encoding", OrdinalEncoder(), ["country", "store", "product"]),
    ]
)
pipe = Pipeline([("preprocessing", col_trans), ("regression", LinearRegression())])
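A quick sketch to inspect the preprocessing step on its own, assuming X_train and y_train already exist; note that the cust_transform branch still returns raw object columns, so in practice an encoder would be chained behind it before the regression.
# look at the preprocessed features before fitting the full pipeline
X_prep = col_trans.fit_transform(X_train, y_train)
print(X_prep[:5])  # hstack of the selected object columns and the ordinal-encoded columns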
2.2.1.2. Class transformation#
These are 4 configurations that are always used when creating a custom transformer (a minimal skeleton follows this list):
1. Inherit from BaseEstimator and TransformerMixin to get get_params and fit_transform for free. Personally, I always use fit_transform when I need a quick view of the output.
2. Define a self.variables attribute so that you can easily select which columns the transformer should be applied to later.
3. Define the fit method. Depending on the transformation, if it doesn't require fitting, just create a dummy fit that returns self.
4. Define the transform method. This is used to turn the original dataset into the modified dataset based on your transformation logic.
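Below is a minimal skeleton that ties these four points together; the TemplateTransformer name and the column-selection body are my own illustration, not from the original.
from sklearn.base import BaseEstimator, TransformerMixin

class TemplateTransformer(BaseEstimator, TransformerMixin):  # 1. inherit to get get_params / fit_transform
    def __init__(self, variables=None):
        self.variables = variables  # 2. columns the transformer should be applied to

    def fit(self, X, y=None):
        return self  # 3. dummy fit when nothing needs to be learned

    def transform(self, X):
        # 4. return the modified dataset (here: simply select the chosen columns)
        return X.loc[:, self.variables]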
# import packages
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
Example 1: Custom transformer without requiring fit method
# Example 1: Custom transformer without requiring fit method
class DropFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_dropped = X.drop(self.variables, axis=1)
        self.columns = X_dropped.columns
        return X_dropped
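A usage sketch with a hypothetical toy frame:
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
dropper = DropFeatureSelector(variables=['b'])
print(dropper.fit_transform(df))  # keeps columns 'a' and 'c'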
Example 2: Custom transformer requiring fit method
# Example 2: Custom transformer requiring fit method
class OneHotEncodercustom(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
        self.ohe = OneHotEncoder(drop='first', handle_unknown='ignore')

    def fit(self, X, y=None):
        X_ = X.loc[:, self.variables]
        self.ohe.fit(X_)
        return self

    def transform(self, X):
        X_ = X.loc[:, self.variables]
        # get the one-hot encoded features in dataframe format
        X_transformed = pd.DataFrame(self.ohe.transform(X_).toarray(),
                                     columns=self.ohe.get_feature_names_out())
        # remove the original columns that were one-hot encoded (note: modifies X in place)
        X.drop(self.variables, axis=1, inplace=True)
        # add the one-hot encoded features to the original dataframe
        X[self.ohe.get_feature_names_out()] = X_transformed[self.ohe.get_feature_names_out()].values
        return X
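A usage sketch, assuming a dataframe with a hypothetical categorical 'color' column; note that transform modifies the input frame in place:
df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'price': [1.0, 2.0, 3.0]})
ohe_custom = OneHotEncodercustom(variables=['color'])
df_encoded = ohe_custom.fit_transform(df)  # 'color' replaced by 'color_red' (drop='first' drops 'blue')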
# create custom transformer
class SimpleImputerCustom(BaseEstimator, TransformerMixin):
    """Wrap sklearn.impute.SimpleImputer so that it returns
    a dataframe instead of an np.array."""
    def __init__(self, variables, strategy):
        self.variables = variables
        self.strategy = strategy
        self.imp = SimpleImputer(missing_values=np.nan,
                                 strategy=self.strategy)

    def fit(self, X, y=None):
        X_ = X.loc[:, self.variables]
        self.imp.fit(X_)
        return self

    def transform(self, X):
        X_ = X.loc[:, self.variables]
        X_transformed = pd.DataFrame(self.imp.transform(X_), columns=self.variables)
        # replace the original columns with the imputed values (note: modifies X in place)
        X.drop(self.variables, axis=1, inplace=True)
        X[self.variables] = X_transformed[self.variables].values
        return X
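A usage sketch with hypothetical numeric columns containing NaN:
df = pd.DataFrame({'age': [20.0, np.nan, 40.0], 'income': [1.0, 2.0, np.nan]})
imputer = SimpleImputerCustom(variables=['age', 'income'], strategy='mean')
df_imputed = imputer.fit_transform(df)  # NaNs replaced with column means, still a dataframe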
class DomainNumFE(BaseEstimator, TransformerMixin):
    """Feature engineering technique for numerical features based on domain knowledge"""
    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # source: https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition#Feature-Engineering
        # note: following the source, each flag is 1 when the corresponding area equals 0
        X_ = X.copy()
        X_['HasWoodDeck'] = (X_['WoodDeckSF'] == 0) * 1
        X_['HasOpenPorch'] = (X_['OpenPorchSF'] == 0) * 1
        X_['HasEnclosedPorch'] = (X_['EnclosedPorch'] == 0) * 1
        return X_
# custom column selection by AUC score
from sklearn.metrics import roc_auc_score

class AucSelection(BaseEstimator, TransformerMixin):
    def __init__(self, remain_threshold=0.501, nan_strategy='remain'):
        assert (remain_threshold > 0.5) and (remain_threshold <= 1)
        self.remain_threshold = remain_threshold
        self.nan_strategy = nan_strategy
        self.dropped_feature_ = None
        self.dropped_index_ = None

    def fit(self, X, y=None):
        self.auc = X.apply(lambda c: self.get_auc(y, c))
        # features with NaN AUC (e.g. non-numeric) are kept or dropped depending on nan_strategy
        auc_nan = self.auc.fillna(self.remain_threshold) if \
            self.nan_strategy == 'remain' else self.auc.fillna(0.5)  # drop
        self.dropped_index_ = (auc_nan < self.remain_threshold)
        self.dropped_feature_ = X.columns[self.dropped_index_]
        return self

    def transform(self, X):
        if self.dropped_index_ is None:
            raise ValueError("AucSelection has not been fitted")
        X_ = X.loc[:, ~self.dropped_index_]
        return X_

    @staticmethod
    def get_auc(y, var, flexible_sign=True):
        """
        AUC measures how well a variable can predict the target y;
        when AUC(y, var) ~ 0.5 (random guess), the variable has no
        power to explain y.
        """
        try:  # numeric data
            nan_idx = np.isnan(np.array(var))  # filter NaN
            var_ = var[~nan_idx]
            y_ = y[~nan_idx]
            # constant variables give an undefined AUC, so treat them as a random guess
            auc = roc_auc_score(y_score=var_, y_true=y_) if (var_.std() > 0) else 0.5
            # for evaluation only: flip the sign if the variable is negatively related to y
            if (auc < 0.5) & flexible_sign:
                auc = 1.0 - auc
            return auc
        except Exception:  # categorical / non-numeric data
            return np.nan
auc_selector = AucSelection(remain_threshold = 0.502)
X_train = auc_selector.fit_transform(X_train, y_train)
X_test = auc_selector.transform(X_test)
2.2.1.3. Custom make_column_selector#
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

class custom_column_selector(make_column_selector):
    def __init__(self, pattern=None, dtype_include=None, dtype_exclude=None, add_vars=[]):
        # dtype_include / dtype_exclude are keyword-only arguments in make_column_selector
        super().__init__(pattern=pattern, dtype_include=dtype_include, dtype_exclude=dtype_exclude)
        self.add_vars = add_vars

    def __call__(self, df):
        if not hasattr(df, "iloc"):
            raise ValueError(
                "make_column_selector can only be applied to pandas dataframes"
            )
        df_row = df.iloc[:1]
        if self.dtype_include is not None or self.dtype_exclude is not None:
            df_row = df_row.select_dtypes(
                include=self.dtype_include, exclude=self.dtype_exclude
            )
        cols = df_row.columns
        if self.pattern is not None:
            cols = cols[cols.str.contains(self.pattern, regex=True)]
        df_ = df[cols.tolist()].copy()
        df_ = self.filter_columns(df_)
        return list(set().union(df_.columns.tolist(), self.add_vars))

    def filter_columns(self, df):
        """
        Custom processing to keep only specific columns.
        """
        # example: keep only columns with fewer than 30 unique values
        nuni = df.nunique() < 30
        return df.loc[:, nuni]
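A quick sketch of calling the selector directly on a hypothetical dataframe to see which columns it picks:
df = pd.DataFrame({'price': [1.0, 2.0], 'qty': [3.0, 4.0],
                   'store': ['a', 'b'], 'product': ['x', 'y']})
selector = custom_column_selector(dtype_include=float, add_vars=["store", "product"])
print(selector(df))  # float columns with fewer than 30 unique values, plus 'store' and 'product'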
col_trans = ColumnTransformer(
    [
        ("cust_transform", cust_transform, custom_column_selector(dtype_include=float, add_vars=["store", "product"])),
    ]
)

# make_column_transformer generates the names itself, so pass (transformer, columns) pairs
# (cust_transform2 is a second transformer assumed to be defined elsewhere)
col_trans = make_column_transformer(
    (cust_transform, custom_column_selector(dtype_include=float, add_vars=["store", "product"])),
    (cust_transform2, custom_column_selector(dtype_exclude=np.number, add_vars=["product"])),
)
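A usage sketch, assuming X_train contains the relevant columns and cust_transform2 has been defined; note that by default the column transformer drops any column not selected by a branch (remainder='drop'):
X_train_prep = col_trans.fit_transform(X_train)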