# Licence Apache-2.0
from typing import List
import numpy as np
import feature_gen_dt
from ..transformers import Transformer
from ..util import util
from gators import DataFrame, Series
[docs]class DeltaTime(Transformer):
"""Create new columns based on the time difference in sec. between two columns.
Parameters
----------
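columns_a : List[str]
List of datetime columns the difference is taken from (`columns_a - columns_b`).
columns_b : List[str]
List of datetime columns subtracted pairwise from `columns_a`.
Must have the same length as `columns_a`.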
Examples
---------
Imports and initialization:
>>> from gators.feature_generation_dt import DeltaTime
>>> obj = DeltaTime(columns_a=['C'], columns_b=['A'])
The `fit`, `transform`, and `fit_transform` methods accept:
* `dask` dataframes:
>>> import dask.dataframe as dd
>>> import pandas as pd
>>> X = dd.from_pandas(
... pd.DataFrame({
... 'A': ['2020-01-01T23', '2020-01-02T00', None],
... 'B': [0, 1, 0],
... 'C': ['2020-01-15T23', '2020-01-03T05', None]}), npartitions=1)
>>> X[['A', 'C']] = X[['A', 'C']].astype('datetime64[ns]')
* `koalas` dataframes:
>>> import databricks.koalas as ks
>>> X = ks.DataFrame({
... 'A': ['2020-01-01T23', '2020-01-02T00', None],
... 'B': [0, 1, 0],
... 'C': ['2020-01-15T23', '2020-01-03T05', None]})
>>> X[['A', 'C']] = X[['A', 'C']].astype('datetime64[ns]')
* and `pandas` dataframes:
>>> import pandas as pd
>>> X = pd.DataFrame({
... 'A': ['2020-01-01T23', '2020-01-02T00', None],
... 'B': [0, 1, 0],
... 'C': ['2020-01-15T23', '2020-01-03T05', None]})
>>> X[['A', 'C']] = X[['A', 'C']].astype('datetime64[ns]')
The result is a transformed dataframe belonging to the same dataframe library.
>>> obj.fit_transform(X)
A B C C__A__Deltatime[s]
0 2020-01-01 23:00:00 0 2020-01-15 23:00:00 1209600.0
1 2020-01-02 00:00:00 1 2020-01-03 05:00:00 104400.0
2 NaT 0 NaT NaN
>>> X = pd.DataFrame({
... 'A': ['2020-01-01T23', '2020-01-02T00', None],
... 'B': [0, 1, 0],
... 'C': ['2020-01-15T23', '2020-01-03T05', None]})
>>> X[['A', 'C']] = X[['A', 'C']].astype('datetime64[ns]')
>>> _ = obj.fit(X)
>>> obj.transform_numpy(X.to_numpy())
array([[Timestamp('2020-01-01 23:00:00'), 0,
Timestamp('2020-01-15 23:00:00'), 1209600.0],
[Timestamp('2020-01-02 00:00:00'), 1,
Timestamp('2020-01-03 05:00:00'), 104400.0],
[NaT, 0, NaT, nan]], dtype=object)
"""
def __init__(self, columns_a: List[str], columns_b: List[str]):
    """Validate the column pairs and derive the output column names.

    Parameters
    ----------
    columns_a : List[str]
        First column of each pair. A list or a NumPy array is accepted.
    columns_b : List[str]
        Second column of each pair. A list or a NumPy array is accepted.
        Must have the same length as `columns_a`.

    Raises
    ------
    TypeError
        If `columns_a` or `columns_b` is neither a list nor a NumPy array.
    ValueError
        If `columns_a` or `columns_b` is empty, or if their lengths differ.
    """
    Transformer.__init__(self)
    if not isinstance(columns_a, (list, np.ndarray)):
        raise TypeError("`columns_a` should be a list.")
    # Use `len` instead of truthiness: `not array` raises for NumPy arrays
    # holding more than one element, although arrays are accepted above.
    if len(columns_a) == 0:
        raise ValueError("`columns_a` should not be empty.")
    if not isinstance(columns_b, (list, np.ndarray)):
        raise TypeError("`columns_b` should be a list.")
    if len(columns_b) == 0:
        raise ValueError("`columns_b` should not be empty.")
    if len(columns_b) != len(columns_a):
        raise ValueError("`columns_a` and `columns_b` should have the same length.")
    # The time unit is fixed to seconds; it is used both in the timedelta
    # dtype string and in the generated column names.
    self.unit = "s"
    self.columns_a = columns_a
    self.columns_b = columns_b
    self.deltatime_dtype = f"timedelta64[{self.unit}]"
    # One output column per (a, b) pair, e.g. "C__A__Deltatime[s]".
    self.column_names = [
        f"{c_a}__{c_b}__Deltatime[{self.unit}]"
        for c_a, c_b in zip(columns_a, columns_b)
    ]
def fit(self, X: DataFrame, y: Series = None) -> "DeltaTime":
    """Fit the transformer on the dataframe `X`.

    Checks that every column of `X` named in `columns_a` or `columns_b`
    has a datetime64 dtype, then stores the column index arrays
    `idx_columns`, `idx_columns_a` and `idx_columns_b`.

    Parameters
    ----------
    X : DataFrame
        Input dataframe.
    y : Series, default None.
        Target values. Not used by this method.

    Returns
    -------
    self : DeltaTime
        Instance of itself.

    Raises
    ------
    TypeError
        If one of the selected columns is not of subtype np.datetime64.
    """
    self.check_dataframe(X)
    # Convert to lists before concatenating: `__init__` also accepts NumPy
    # arrays, for which `+` is element-wise rather than a concatenation.
    selected = set(list(self.columns_a) + list(self.columns_b))
    # De-duplicated selection, kept in the order the columns appear in `X`.
    columns = [c for c in X.columns if c in selected]
    dtypes = X.dtypes
    for column in columns:
        if not np.issubdtype(dtypes[column], np.datetime64):
            # Message text kept verbatim.
            raise TypeError(
                """
Datetime columns should be of subtype np.datetime64.
Use `ConvertColumnDatatype` to convert the dtype.
"""
            )
    self.idx_columns = util.get_idx_columns(
        columns=X.columns,
        selected_columns=columns,
    )
    # NOTE(review): the two lookups below are made against `columns` (the
    # selected datetime subset), not against `X.columns` -- presumably so the
    # indices address an array already sliced with `idx_columns`; confirm
    # against the transform implementation.
    self.idx_columns_a = util.get_idx_columns(
        columns=columns,
        selected_columns=self.columns_a,
    )
    self.idx_columns_b = util.get_idx_columns(
        columns=columns,
        selected_columns=self.columns_b,
    )
    return self