Skip to content

Commit 2f22a2b

Browse files
authored
Adds drop_original option to CombineWithReferenceFeature, solves issue #331 (#350)
* adds option to drop original variables to the CombineWithReferenceFeature transformer * changes wording of docstring * makes an error message more informative * replaces pd.unique with set * adds drop_original option to the MathematicalCombination transformer
1 parent da3ec45 commit 2f22a2b

File tree

4 files changed

+90
-0
lines changed

4 files changed

+90
-0
lines changed

feature_engine/creation/combine_with_reference_feature.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ class CombineWithReferenceFeature(BaseEstimator, TransformerMixin):
6565
transformer will return an error if the training or the datasets to transform
6666
contain missing values.
6767
68+
drop_original: bool, default=False
69+
If True, the original variables will be dropped from the dataframe
70+
after their combination.
6871
6972
Attributes
7073
----------
@@ -98,6 +101,7 @@ def __init__(
98101
operations: List[str] = ["sub"],
99102
new_variables_names: Optional[List[str]] = None,
100103
missing_values: str = "ignore",
104+
drop_original: bool = False,
101105
) -> None:
102106

103107
# check input types
@@ -158,11 +162,18 @@ def __init__(
158162
if missing_values not in ["raise", "ignore"]:
159163
raise ValueError("missing_values takes only values 'raise' or 'ignore'")
160164

165+
if not isinstance(drop_original, bool):
166+
raise TypeError(
167+
"drop_original takes only boolean values True and False. "
168+
f"Got {drop_original} instead."
169+
)
170+
161171
self.reference_variables = reference_variables
162172
self.variables_to_combine = variables_to_combine
163173
self.new_variables_names = new_variables_names
164174
self.operations = operations
165175
self.missing_values = missing_values
176+
self.drop_original = drop_original
166177

167178
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
168179
"""
@@ -286,6 +297,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
286297
if self.new_variables_names:
287298
X.columns = original_col_names + self.new_variables_names
288299

300+
if self.drop_original:
301+
X.drop(
302+
columns=set(self.variables_to_combine + self.reference_variables),
303+
inplace=True,
304+
)
305+
289306
return X
290307

291308
def _more_tags(self):

feature_engine/creation/mathematical_combination.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def __init__(
103103
math_operations: Optional[List[str]] = None,
104104
new_variables_names: Optional[List[str]] = None,
105105
missing_values: str = "raise",
106+
drop_original: bool = False,
106107
) -> None:
107108

108109
# check input types
@@ -158,10 +159,17 @@ def __init__(
158159
"combine."
159160
)
160161

162+
if not isinstance(drop_original, bool):
163+
raise TypeError(
164+
"drop_original takes only boolean values True and False. "
165+
f"Got {drop_original} instead."
166+
)
167+
161168
self.variables_to_combine = variables_to_combine
162169
self.new_variables_names = new_variables_names
163170
self.math_operations = math_operations
164171
self.missing_values = missing_values
172+
self.drop_original = drop_original
165173

166174
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
167175
"""
@@ -251,6 +259,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
251259
for new_variable_name, operation in self.combination_dict_.items():
252260
X[new_variable_name] = X[self.variables_to_combine].agg(operation, axis=1)
253261

262+
if self.drop_original:
263+
X.drop(columns=self.variables_to_combine, inplace=True)
264+
254265
return X
255266

256267
def _more_tags(self):

tests/test_creation/test_combine_with_reference_feature.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,15 @@ def test_error_if_new_variable_names_of_wrong_type():
7070
)
7171

7272

73+
def test_error_when_drop_original_not_bool():
74+
with pytest.raises(TypeError):
75+
CombineWithReferenceFeature(
76+
variables_to_combine=["Age"],
77+
reference_variables=["Marks"],
78+
drop_original="not_a_bool"
79+
)
80+
81+
7382
def test_error_when_variables_to_combine_not_numeric(df_vartypes):
7483
transformer = CombineWithReferenceFeature(
7584
variables_to_combine=["Name", "Age", "Marks"],
@@ -191,3 +200,27 @@ def test_user_enters_output_variable_names(df_vartypes):
191200

192201
# transform params
193202
pd.testing.assert_frame_equal(X, ref)
203+
204+
205+
def test_drop_original_variables(df_vartypes):
206+
transformer = CombineWithReferenceFeature(
207+
variables_to_combine=["Age", "Marks"],
208+
reference_variables=["Age", "Marks"],
209+
drop_original=True
210+
)
211+
212+
X = transformer.fit_transform(df_vartypes)
213+
214+
ref = pd.DataFrame.from_dict(
215+
{
216+
"Name": ["tom", "nick", "krish", "jack"],
217+
"City": ["London", "Manchester", "Liverpool", "Bristol"],
218+
"dob": pd.date_range("2020-02-24", periods=4, freq="T"),
219+
"Age_sub_Age": [0, 0, 0, 0],
220+
"Marks_sub_Age": [-19.1, -20.2, -18.3, -17.4],
221+
"Age_sub_Marks": [19.1, 20.2, 18.3, 17.4],
222+
"Marks_sub_Marks": [0.0, 0.0, 0.0, 0.0],
223+
}
224+
)
225+
226+
pd.testing.assert_frame_equal(X, ref)

tests/test_creation/test_mathematical_combination.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,13 @@ def test_default_parameters(df_vartypes):
119119
pd.testing.assert_frame_equal(X, ref)
120120

121121

122+
def test_error_when_drop_original_not_bool():
123+
with pytest.raises(TypeError):
124+
MathematicalCombination(
125+
variables_to_combine=["Age", "Marks"], drop_original="not_a_bool"
126+
)
127+
128+
122129
def test_error_when_variables_to_combine_not_numeric(df_vartypes):
123130
transformer = MathematicalCombination(variables_to_combine=["Name", "Age", "Marks"])
124131
with pytest.raises(TypeError):
@@ -324,3 +331,25 @@ def test_no_error_when_null_values_in_variable(df_vartypes):
324331
)
325332
# transform params
326333
pd.testing.assert_frame_equal(X, ref)
334+
335+
336+
def test_drop_original_variables(df_vartypes):
337+
transformer = MathematicalCombination(
338+
variables_to_combine=["Age", "Marks"],
339+
math_operations=["sum", "mean"],
340+
drop_original=True,
341+
)
342+
343+
X = transformer.fit_transform(df_vartypes)
344+
345+
ref = pd.DataFrame.from_dict(
346+
{
347+
"Name": ["tom", "nick", "krish", "jack"],
348+
"City": ["London", "Manchester", "Liverpool", "Bristol"],
349+
"dob": pd.date_range("2020-02-24", periods=4, freq="T"),
350+
"sum(Age-Marks)": [20.9, 21.8, 19.7, 18.6],
351+
"mean(Age-Marks)": [10.45, 10.9, 9.85, 9.3],
352+
}
353+
)
354+
355+
pd.testing.assert_frame_equal(X, ref)

0 commit comments

Comments
 (0)