-
-
Notifications
You must be signed in to change notification settings - Fork 331
New feature: Lag or windows features grouped by #727
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 9 commits
4d653a9
4e9d849
7f40391
b476748
02c59bd
0dd92cc
47de2d6
dd43c27
7459811
12aa825
c3bee66
67725dc
9cb01ea
72ce43c
9d999b0
b7b8bc9
ba375a4
90f08f4
66baa75
92f996d
152c037
5343e50
ef1eaa8
09db782
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
import datetime | ||
from typing import List, Union | ||
from typing import List, Union, Dict | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
@@ -475,7 +475,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None): | |
threshold_cat = self.threshold | ||
|
||
# Compute the PSI by looping over the features | ||
self.psi_values_ = {} | ||
self.psi_values_: Dict = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We resolved this in a different PR. Could we remove this change from here please? |
||
self.features_to_drop_ = [] | ||
|
||
# Compute PSI for numerical features | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,30 +5,21 @@ | |
from sklearn.utils.validation import check_is_fitted | ||
|
||
from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin | ||
from feature_engine._check_init_parameters.check_variables import ( | ||
_check_variables_input_value, | ||
) | ||
from feature_engine._check_init_parameters.check_variables import \ | ||
_check_variables_input_value | ||
Ezzaldin97 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
from feature_engine._docstrings.fit_attributes import ( | ||
_feature_names_in_docstring, | ||
_n_features_in_docstring, | ||
) | ||
_feature_names_in_docstring, _n_features_in_docstring) | ||
from feature_engine._docstrings.init_parameters.all_trasnformers import ( | ||
_drop_original_docstring, | ||
_missing_values_docstring, | ||
) | ||
_drop_original_docstring, _missing_values_docstring) | ||
from feature_engine._docstrings.methods import _fit_not_learn_docstring | ||
from feature_engine._docstrings.substitute import Substitution | ||
from feature_engine.dataframe_checks import ( | ||
_check_contains_inf, | ||
_check_contains_na, | ||
_check_X_matches_training_df, | ||
check_X, | ||
) | ||
from feature_engine.dataframe_checks import (_check_contains_inf, | ||
|
||
_check_contains_na, | ||
_check_X_matches_training_df, | ||
check_X) | ||
from feature_engine.tags import _return_tags | ||
from feature_engine.variable_handling import ( | ||
check_numerical_variables, | ||
find_numerical_variables, | ||
) | ||
from feature_engine.variable_handling import (check_numerical_variables, | ||
find_numerical_variables) | ||
|
||
|
||
@Substitution( | ||
|
@@ -51,6 +42,9 @@ class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOu | |
|
||
{drop_original} | ||
|
||
group_by_variables: str, list of str, default=None | ||
|
||
variable of list of variables to create lag features based on. | ||
|
||
Attributes | ||
---------- | ||
{feature_names_in_} | ||
|
@@ -64,6 +58,7 @@ def __init__( | |
variables: Union[None, int, str, List[Union[str, int]]] = None, | ||
missing_values: str = "raise", | ||
drop_original: bool = False, | ||
group_by_variables: Optional[Union[str, List[str]]] = None, | ||
) -> None: | ||
|
||
if missing_values not in ["raise", "ignore"]: | ||
|
@@ -78,9 +73,26 @@ def __init__( | |
f"Got {drop_original} instead." | ||
) | ||
|
||
# check validity if group by variables passed | ||
if group_by_variables: | ||
# check group by variables data-types | ||
if not ( | ||
isinstance(group_by_variables, str) | ||
or isinstance(group_by_variables, list) | ||
): | ||
raise ValueError( | ||
"group_by_variables must be an string or a list of strings. " | ||
f"Got {group_by_variables} instead." | ||
) | ||
# check if passed list has duplicates. | ||
if isinstance(group_by_variables, list): | ||
if len(set(group_by_variables)) != len(group_by_variables): | ||
raise ValueError("group_by_variables contains duplicate values") | ||
|
||
|
||
|
||
self.variables = _check_variables_input_value(variables) | ||
self.missing_values = missing_values | ||
self.drop_original = drop_original | ||
self.group_by_variables = group_by_variables | ||
|
||
def _check_index(self, X: pd.DataFrame): | ||
""" | ||
|
@@ -165,6 +177,18 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): | |
if self.missing_values == "raise": | ||
self._check_na_and_inf(X) | ||
|
||
if self.group_by_variables: | ||
|
||
# check if input group by variables is in input dataframe variables. | ||
if isinstance(self.group_by_variables, list): | ||
diff = set(self.group_by_variables).difference(X.columns.tolist()) | ||
if len(diff) != 0: | ||
raise ValueError(f"{list(diff)} not exist in dataframe") | ||
else: | ||
if self.group_by_variables not in X.columns.tolist(): | ||
raise ValueError( | ||
f"{self.group_by_variables} not exists in dataframe" | ||
) | ||
|
||
self._get_feature_names_in(X) | ||
|
||
return self | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,27 +3,20 @@ | |
|
||
from __future__ import annotations | ||
|
||
from typing import List | ||
from typing import List, Optional, Union | ||
|
||
import pandas as pd | ||
|
||
from feature_engine._docstrings.fit_attributes import ( | ||
_feature_names_in_docstring, | ||
_n_features_in_docstring, | ||
) | ||
_feature_names_in_docstring, _n_features_in_docstring) | ||
from feature_engine._docstrings.init_parameters.all_trasnformers import ( | ||
_drop_original_docstring, | ||
_missing_values_docstring, | ||
_variables_numerical_docstring, | ||
) | ||
from feature_engine._docstrings.methods import ( | ||
_fit_not_learn_docstring, | ||
_fit_transform_docstring, | ||
) | ||
_drop_original_docstring, _missing_values_docstring, | ||
_variables_numerical_docstring) | ||
from feature_engine._docstrings.methods import (_fit_not_learn_docstring, | ||
_fit_transform_docstring) | ||
from feature_engine._docstrings.substitute import Substitution | ||
from feature_engine.timeseries.forecasting.base_forecast_transformers import ( | ||
BaseForecastTransformer, | ||
) | ||
from feature_engine.timeseries.forecasting.base_forecast_transformers import \ | ||
BaseForecastTransformer | ||
|
||
|
||
@Substitution( | ||
|
@@ -139,6 +132,36 @@ class ExpandingWindowFeatures(BaseForecastTransformer): | |
2 2022-09-20 3 8 1.5 6.5 | ||
3 2022-09-21 4 9 2.0 7.0 | ||
4 2022-09-22 5 10 2.5 7.5 | ||
create expanding window features based on other variables. | ||
|
||
>>> import pandas as pd | ||
>>> from feature_engine.timeseries.forecasting import ExpandingWindowFeatures | ||
>>> X = pd.DataFrame(dict(date = ["2022-09-18", | ||
>>> "2022-09-19", | ||
>>> "2022-09-20", | ||
>>> "2022-09-21", | ||
>>> "2022-09-22", | ||
>>> "2022-09-18", | ||
>>> "2022-09-19", | ||
>>> "2022-09-20", | ||
>>> "2022-09-21", | ||
>>> "2022-09-22"], | ||
>>> x1 = [1,2,3,4,5, 3,5,6,8,11], | ||
>>> x2 = [6,7,8,9,10, 2,9,10,15,2], | ||
>>> x3=['a','a','a','a','a', 'b','b','b','b','b'] | ||
>>> )) | ||
>>> ewf = ExpandingWindowFeatures(group_by_variables='x3') | ||
>>> ewf.fit_transform(X) | ||
date x1 x2 x3 x1_expanding_mean x2_expanding_mean | ||
0 2022-09-18 1 6 a NaN NaN | ||
1 2022-09-19 2 7 a 1.000000 6.0 | ||
2 2022-09-20 3 8 a 1.500000 6.5 | ||
3 2022-09-21 4 9 a 2.000000 7.0 | ||
4 2022-09-22 5 10 a 2.500000 7.5 | ||
5 2022-09-18 3 2 b NaN NaN | ||
6 2022-09-19 5 9 b 3.000000 2.0 | ||
7 2022-09-20 6 10 b 4.000000 5.5 | ||
8 2022-09-21 8 15 b 4.666667 7.0 | ||
9 2022-09-22 11 2 b 5.500000 9.0 | ||
""" | ||
|
||
def __init__( | ||
|
@@ -151,6 +174,7 @@ def __init__( | |
sort_index: bool = True, | ||
missing_values: str = "raise", | ||
drop_original: bool = False, | ||
group_by_variables: Optional[Union[str, List[str]]] = None, | ||
) -> None: | ||
|
||
if not isinstance(functions, (str, list)) or not all( | ||
|
@@ -168,7 +192,7 @@ def __init__( | |
f"periods must be a non-negative integer. Got {periods} instead." | ||
) | ||
|
||
super().__init__(variables, missing_values, drop_original) | ||
super().__init__(variables, missing_values, drop_original, group_by_variables) | ||
|
||
self.min_periods = min_periods | ||
self.functions = functions | ||
|
@@ -193,12 +217,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: | |
# Common dataframe checks and setting up. | ||
X = self._check_transform_input_and_state(X) | ||
|
||
tmp = ( | ||
X[self.variables_] | ||
.expanding(min_periods=self.min_periods) | ||
.agg(self.functions) | ||
.shift(periods=self.periods, freq=self.freq) | ||
) | ||
if self.group_by_variables: | ||
tmp = self._agg_expanding_window_features( | ||
grouped_df=X.groupby(self.group_by_variables) | ||
) | ||
else: | ||
tmp = ( | ||
X[self.variables_] | ||
.expanding(min_periods=self.min_periods) | ||
.agg(self.functions) | ||
.shift(periods=self.periods, freq=self.freq) | ||
) | ||
|
||
tmp.columns = self._get_new_features_name() | ||
|
||
|
@@ -224,3 +253,30 @@ def _get_new_features_name(self) -> List: | |
] | ||
|
||
return feature_names | ||
|
||
def _agg_expanding_window_features( | ||
self, | ||
grouped_df: pd.core.groupby.generic.DataFrameGroupBy, | ||
) -> Union[pd.Series, pd.DataFrame]: | ||
"""generate expanding window features based on groups | ||
Parameters | ||
---------- | ||
grouped_df : pd.core.groupby.generic.DataFrameGroupBy | ||
dataframe of groups | ||
solegalli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Returns | ||
------- | ||
Union[pd.Series, pd.DataFrame] | ||
returned expanding window features | ||
""" | ||
tmp_data = [] | ||
for _, group in grouped_df: | ||
|
||
tmp = ( | ||
|
||
group[self.variables_] | ||
.expanding(min_periods=self.min_periods) | ||
.agg(self.functions) | ||
.shift(periods=self.periods, freq=self.freq) | ||
) | ||
tmp_data.append(tmp) | ||
tmp = pd.concat(tmp_data).sort_index() | ||
return tmp |
Uh oh!
There was an error while loading. Please reload this page.