Source code for gators.scalers.standard_scaler

# License: Apache-2.0


from ._base_scaler import _BaseScaler
from ..util import util

from gators import DataFrame, Series


class StandardScaler(_BaseScaler):
    """Scale each column by setting the mean to 0 and the standard deviation to 1.

    Columns whose fitted standard deviation is exactly zero (constant
    columns) get a scale factor of 1 instead of an infinite one.

    Parameters
    ----------
    inplace : bool, default True.
        If True, perform the scaling in-place.
        If False, create new columns.

    Examples
    --------
    Imports and initialization:

    >>> from gators.scalers import StandardScaler
    >>> obj = StandardScaler()

    The `fit`, `transform`, and `fit_transform` methods accept:

    * `dask` dataframes:

    >>> import dask.dataframe as dd
    >>> import pandas as pd
    >>> X = dd.from_pandas(pd.DataFrame({'A': [1, 2, 3], 'B': [-0.1, 0.2, 0.3]}), npartitions=1)

    * `koalas` dataframes:

    >>> import databricks.koalas as ks
    >>> X = ks.DataFrame({'A': [1, 2, 3], 'B': [-0.1, 0.2, 0.3]})

    * and `pandas` dataframes:

    >>> import pandas as pd
    >>> X = pd.DataFrame({'A': [1, 2, 3], 'B': [-0.1, 0.2, 0.3]})

    The result is a transformed dataframe belonging to the same dataframe library.

    >>> obj.fit_transform(X)
         A         B
    0 -1.0 -1.120897
    1  0.0  0.320256
    2  1.0  0.800641

    >>> X = pd.DataFrame({'A': [1, 2, 3], 'B': [-0.1, 0.2, 0.3]})
    >>> _ = obj.fit(X)
    >>> obj.transform_numpy(X.to_numpy())
    array([[-1.        , -1.12089708],
           [ 0.        ,  0.32025631],
           [ 1.        ,  0.80064077]])
    """

    def __init__(self, inplace: bool = True):
        _BaseScaler.__init__(self, inplace=inplace)

    def fit(self, X: DataFrame, y: Series = None) -> "StandardScaler":
        """Fit the transformer on the dask/koalas/pandas dataframe X.

        Computes, for every numerical column of X, the mean (stored as the
        offset) and the inverse of the standard deviation (stored as the
        scale). Both are kept as a dict keyed by column name (`X_offset`,
        `X_scale`) and as a NumPy array (`X_offset_np`, `X_scale_np`).

        Parameters
        ----------
        X : DataFrame.
            Input dataframe.
        y : Series, default None.
            Target values. Not used by this method.

        Returns
        -------
        self : 'StandardScaler'
            Instance of itself.
        """
        self.check_dataframe(X)
        self.columns = util.get_numerical_columns(X)
        self.idx_columns = util.get_idx_columns(X, self.columns)
        self.column_names = self.get_column_names(self.inplace, self.columns, "scale")

        to_pandas = util.get_function(X).to_pandas
        numerical = X[self.columns]
        offset = to_pandas(numerical.mean()).astype(float)
        std = to_pandas(numerical.std()).astype(float)
        # A constant column has a standard deviation of 0; inverting it would
        # give an infinite scale factor. Use 1.0 for those columns instead.
        # NaN standard deviations do not compare equal to 0 and are left as is.
        std = std.mask(std == 0.0, 1.0)
        scale = (1.0 / std).astype(float)

        self.X_offset_np = util.get_function(offset).to_numpy(offset)
        self.X_scale_np = util.get_function(scale).to_numpy(scale)
        self.X_offset = offset.to_dict()
        self.X_scale = scale.to_dict()
        return self