pipeline templates

[1]:
# data cleaning
from gators.data_cleaning import (
    ConvertColumnDatatype,
    DropHighNaNRatio,
    DropLowCardinality,
    DropHighCardinality,
    DropDatatypeColumns,
)
# imputers
from gators.imputers import (
    NumericsImputer,
    ObjectImputer,
)
# encoders
from gators.encoders import (
    WOEEncoder,
    TargetEncoder,
)
# binning
from gators.binning import (
    BinSingleTargetClassCategories,
    BinRareCategories,
    TreeBinning,
)
# feature generation
from gators.feature_generation import (
    PolynomialObjectFeatures,
)
# datetime feature generation
from gators.feature_generation_dt import (
    CyclicHourOfDay,
    CyclicDayOfMonth,
    CyclicDayOfWeek,
    CyclicMonthOfYear,
    OrdinalDayOfMonth,
    OrdinalMonthOfYear,
    OrdinalDayOfWeek,
    OrdinalHourOfDay,
)
# feature selection
from gators.feature_selection import (
    InformationValue,
    SelectFromModel,
)

data cleaning

[2]:
# Tunable settings for the data-cleaning stage, grouped by the step that uses them.
max_ratio = 0.9          # passed to DropHighNaNRatio
min_categories = 2       # passed to DropLowCardinality
max_categories = 256     # passed to DropHighCardinality
min_ratio = 0.1          # passed to BinRareCategories
strategy_num = 'mean'      # NumericsImputer strategy
strategy_obj = 'constant'  # ObjectImputer strategy

# Ordered (name, transformer) pairs; each label is the transformer's class name.
data_cleaning_steps = [
    (
        "DropHighNaNRatio",
        DropHighNaNRatio(max_ratio=max_ratio),
    ),
    (
        "DropLowCardinality",
        DropLowCardinality(min_categories=min_categories),
    ),
    (
        "DropHighCardinality",
        DropHighCardinality(max_categories=max_categories),
    ),
    (
        "BinRareCategories",
        BinRareCategories(min_ratio=min_ratio),
    ),
    (
        # NOTE(review): value=0 is supplied together with strategy='mean';
        # presumably it only matters for a 'constant' strategy - confirm.
        "NumericsImputer",
        NumericsImputer(strategy=strategy_num, value=0),
    ),
    (
        "ObjectImputer",
        ObjectImputer(strategy=strategy_obj, value="MISSING"),
    ),
]

datetime feature generation

[3]:
# Columns that every datetime transformer below is applied to.
datetime_columns = ['Date']

# First step: cast the datetime columns to 'datetime64[ns]'.
datetime_steps = [
    (
        'ConvertColumnDatatype',
        ConvertColumnDatatype(
            columns=datetime_columns,
            datatype='datetime64[ns]',
        ),
    ),
]

# Remaining steps: one (class name, instance) pair per generator, ordinal
# generators first, then the cyclic ones, all built with the same columns.
datetime_steps += [
    (generator.__name__, generator(columns=datetime_columns))
    for generator in (
        OrdinalHourOfDay,
        OrdinalDayOfWeek,
        OrdinalDayOfMonth,
        OrdinalMonthOfYear,
        CyclicHourOfDay,
        CyclicDayOfWeek,
        CyclicDayOfMonth,
        CyclicMonthOfYear,
    )
]

polynomial object features

[4]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Tree handed to TreeBinning. DecisionTreeRegressor is imported as the
# alternative for a regression target.
tree = DecisionTreeClassifier(random_state=0, min_samples_leaf=100)

# Placeholder names of the columns passed to PolynomialObjectFeatures.
# `list('columns')` would split the string into single characters
# (['c', 'o', 'l', 'u', 'm', 'n', 's']), which are not column names.
columns = ['column_A', 'column_B']  # TODO: replace with the real column names

# Ordered (name, transformer) pairs for the polynomial-object-features stage.
polynomial_object_features_steps = [
    ('BinSingleTargetClassCategories', BinSingleTargetClassCategories()),
    ('TreeBinning', TreeBinning(tree=tree)),
    ('PolynomialObjectFeatures',
         PolynomialObjectFeatures(columns=columns, degree=2)),
    ('CleanCategories', BinRareCategories(min_ratio=0.1)),  # min_ratio can be set to 0.
    # WOEEncoder() for a binary problem; use TargetEncoder() for a regression problem.
    ('Encoder', WOEEncoder()),
]

# Backward-compatible alias: the list was originally published under this
# misspelled name, so keep it bound for any code that still refers to it.
poynomial_object_features_steps = polynomial_object_features_steps

feature selection

univariate feature selection pipeline - binary classification

[5]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Value passed to InformationValue as `k`.
k = 50
# Tree model passed to TreeBinning.
tree = DecisionTreeClassifier(min_samples_leaf=100, random_state=0)

# Ordered (name, transformer) pairs for univariate feature selection.
univariate_feature_selection_steps = [
    (
        'BinSingleTargetClassCategories',
        BinSingleTargetClassCategories(),
    ),
    (
        'TreeBinning',
        TreeBinning(tree=tree),
    ),
    (
        'InformationValue',
        InformationValue(k=k),
    ),
]

wrapper-method feature selection pipeline - binary classification

[6]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

# Tree handed to TreeBinning: a classifier here; DecisionTreeRegressor is
# imported as the alternative for a regression target.
tree = DecisionTreeClassifier(random_state=0, min_samples_leaf=100)

# `random_state` is the seed parameter of the xgboost scikit-learn wrapper;
# `random_seed` is not one of its parameters, so it would not fix the seed.
model = XGBClassifier(random_state=0)

# Value passed to SelectFromModel as `k`; defined locally so this cell runs
# on its own without relying on a value left over from another cell.
k = 50

# Ordered (name, transformer) pairs for wrapper-method feature selection.
wrapper_feature_selection = [
    ('BinSingleTargetClassCategories', BinSingleTargetClassCategories()),
    ('TreeBinning', TreeBinning(tree=tree)),  # tree classifier or tree regressor
    # WOEEncoder() for a binary problem; use TargetEncoder() for a regression problem.
    ('Encoder', WOEEncoder()),
    ('SelectFromModel', SelectFromModel(model=model, k=k)),
]

# Alias following the `*_steps` naming used by the other step lists.
wrapper_feature_selection_steps = wrapper_feature_selection
/Users/cpoli/gators38/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index
[ ]:

[ ]: