pipeline templates¶
[1]:
# data cleaning
from gators.data_cleaning import (
ConvertColumnDatatype,
DropHighNaNRatio,
DropLowCardinality,
DropHighCardinality,
DropDatatypeColumns,
)
# imputers
from gators.imputers import (
NumericsImputer,
ObjectImputer,
)
# encoders
from gators.encoders import (
WOEEncoder,
TargetEncoder,
)
# binning
from gators.binning import (
BinSingleTargetClassCategories,
BinRareCategories,
TreeBinning,
)
# feature generation
from gators.feature_generation import (
PolynomialObjectFeatures,
)
# datetime feature generation
from gators.feature_generation_dt import (
CyclicHourOfDay,
CyclicDayOfMonth,
CyclicDayOfWeek,
CyclicMonthOfYear,
OrdinalDayOfMonth,
OrdinalMonthOfYear,
OrdinalDayOfWeek,
OrdinalHourOfDay,
)
# feature selection
from gators.feature_selection import (
InformationValue,
SelectFromModel,
)
data cleaning¶
[2]:
# Tunable settings for the data-cleaning stage; each one is forwarded to
# the transformer argument of the same name below.
max_ratio = 0.9
min_categories = 2
max_categories = 256
min_ratio = 0.1
strategy_num = 'mean'
strategy_obj = 'constant'

# Ordered (name, transformer) pairs for the data-cleaning stage.
data_cleaning_steps = [
    (
        "DropHighNaNRatio",
        DropHighNaNRatio(max_ratio=max_ratio),
    ),
    (
        "DropLowCardinality",
        DropLowCardinality(min_categories=min_categories),
    ),
    (
        "DropHighCardinality",
        DropHighCardinality(max_categories=max_categories),
    ),
    (
        "BinRareCategories",
        BinRareCategories(min_ratio=min_ratio),
    ),
    (
        "NumericsImputer",
        # NOTE(review): `value=0` is passed together with strategy 'mean';
        # presumably it only matters for the 'constant' strategy -- confirm.
        NumericsImputer(strategy=strategy_num, value=0),
    ),
    (
        "ObjectImputer",
        ObjectImputer(strategy=strategy_obj, value="MISSING"),
    ),
]
datetime feature generation¶
[3]:
# Columns that are cast to datetime and then fed to every feature generator.
datetime_columns = ['Date']

# Datetime feature generators in the order they are added to the steps.
# Each one is registered under its own class name and receives the same
# `columns` argument.
_datetime_feature_generators = (
    OrdinalHourOfDay,
    OrdinalDayOfWeek,
    OrdinalDayOfMonth,
    OrdinalMonthOfYear,
    CyclicHourOfDay,
    CyclicDayOfWeek,
    CyclicDayOfMonth,
    CyclicMonthOfYear,
)

datetime_steps = [
    # The datatype conversion comes first, ahead of all feature generators.
    (
        'ConvertColumnDatatype',
        ConvertColumnDatatype(
            columns=datetime_columns, datatype='datetime64[ns]'
        ),
    ),
    *(
        (generator.__name__, generator(columns=datetime_columns))
        for generator in _datetime_feature_generators
    ),
]
polynomial object features¶
[4]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Tree estimator handed to TreeBinning.
tree = DecisionTreeClassifier(random_state=0, min_samples_leaf=100)

# Placeholder: list('columns') expands to the single-character names
# ['c', 'o', 'l', 'u', 'm', 'n', 's']. Replace it with the real column
# names before using this template.
columns = list('columns')

# Ordered (name, transformer) pairs for the polynomial object-feature stage.
polynomial_object_features_steps = [
    ('BinSingleTargetClassCategories', BinSingleTargetClassCategories()),
    ('TreeBinning', TreeBinning(tree=tree)),
    ('PolynomialObjectFeatures',
     PolynomialObjectFeatures(columns=columns, degree=2)),
    ('CleanCategories', BinRareCategories(min_ratio=0.1)),  # min_ratio can be set to 0.
    # WOEEncoder() for a binary problem, TargetEncoder() for a regression problem.
    ('Encoder', WOEEncoder()),
]

# Backward-compatible alias: the list was originally published under this
# misspelled name, so keep it bound to the same object.
poynomial_object_features_steps = polynomial_object_features_steps
feature selection¶
univariate feature selection pipeline - binary classification¶
[5]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# `k` is forwarded to InformationValue here and is also read by
# SelectFromModel(k=k) in the wrapper-method cell.
k = 50

# Tree estimator handed to TreeBinning.
tree = DecisionTreeClassifier(min_samples_leaf=100, random_state=0)

# Ordered (name, transformer) pairs for univariate feature selection.
univariate_feature_selection_steps = [
    (
        'BinSingleTargetClassCategories',
        BinSingleTargetClassCategories(),
    ),
    (
        'TreeBinning',
        TreeBinning(tree=tree),
    ),
    (
        'InformationValue',
        InformationValue(k=k),
    ),
]
feature selection with the wrapper method pipeline¶
[6]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

# Tree estimator handed to TreeBinning.
tree = DecisionTreeClassifier(random_state=0, min_samples_leaf=100)

# `random_state` is the seed keyword of the XGBoost scikit-learn wrapper;
# the previous `random_seed` keyword is not one of its parameters.
model = XGBClassifier(random_state=0)

# Ordered (name, transformer) pairs for wrapper-method feature selection.
# NOTE: `k` is not defined in this cell; it comes from the univariate
# feature-selection cell (k = 50).
wrapper_feature_selection = [
    ('BinSingleTargetClassCategories', BinSingleTargetClassCategories()),
    ('TreeBinning', TreeBinning(tree=tree)),  # tree classifier or tree regressor
    # WOEEncoder() for a binary problem, TargetEncoder() for a regression problem.
    ('Encoder', WOEEncoder()),
    ('SelectFromModel', SelectFromModel(model=model, k=k)),
]
/Users/cpoli/gators38/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import MultiIndex, Int64Index
[ ]:
[ ]: