Commit ebaa2ce

Release 1.0.2 (#228)

* fix typo
* adds missing param to docstrings
* fix typos in notebook
* fix bug in select by target proxy
* fix bug target mean selection
* adds kaggle kernels to docs
* update version and changelog
* fixes docs bugs

1 parent 47a8b7a commit ebaa2ce

File tree

10 files changed
+138 -89 lines changed

docs/blogs.rst

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ Blogs
 - `Feature-engine: A new open-source Python package for feature engineering <https://www.trainindatablog.com/feature-engine-a-new-open-source-python-package-for-feature-engineering/>`_.
 - `Practical Code Implementations of Feature Engineering for Machine Learning with Python <https://www.trainindatablog.com/practical-code-implementations-of-feature-engineering-for-machine-learning-with-python/>`_.
 - `Streamlining Feature Engineering Pipelines with Feature-engine <https://towardsdatascience.com/streamlining-feature-engineering-pipelines-with-feature-engine-e781d551f470?gi=e0fa6e5c0c1a/>`_.
-- `Feature Engineering for Machine Learning: A comprehensive Overvoew <https://www.trainindatablog.com/feature-engineering-for-machine-learning-comprehensive-overview/>`_.
+- `Feature Engineering for Machine Learning: A comprehensive Overview <https://www.trainindatablog.com/feature-engineering-for-machine-learning-comprehensive-overview/>`_.
 - `Feature Selection for Machine Learning: A comprehensive Overview <https://www.trainindatablog.com/feature-selection-for-machine-learning-comprehensive-overview/>`_.
 
 

docs/tutorials.rst

Lines changed: 8 additions & 0 deletions
@@ -7,6 +7,14 @@ Code tutorials
 Coming Soon!
 
 
+Kaggle Kernels
+--------------
+
+- `Feature selection for bank customer satisfaction prediction <https://www.kaggle.com/solegalli/feature-selection-with-feature-engine>`_
+- `Feature engineering and selection for house price prediction <https://www.kaggle.com/solegalli/predict-house-price-with-feature-engine>`_
+- `Feature creation for wine quality prediction <https://www.kaggle.com/solegalli/create-new-features-with-feature-engine>`_
+
+
 Video tutorials
 ---------------
 

docs/whats_new/v1.rst

Lines changed: 29 additions & 0 deletions
@@ -1,3 +1,32 @@
+Version 1.0.2
+=============
+
+Deployed: 22nd January 2021
+
+Contributors
+------------
+- Nicolas Galli
+- Pradumna Suryawanshi
+- Elamraoui Sohayb
+- Soledad Galli
+
+New transformers
+----------------
+- **CombineWithReferenceFeature**: applies mathematical operations between a group of variables and reference variables (**by Nicolas Galli**)
+- **DropMissingData**: removes missing observations from a dataset (**by Pradumna Suryawanshi**)
+
+Bug fixes
+---------
+- Fix bugs in SelectByTargetMeanPerformance.
+- Fix documentation and jupyter notebook typos.
+
+Tutorials
+---------
+
+- **Creation**: updated "how to" examples on how to combine variables into new features (**by Elamraoui Sohayb and Nicolas Galli**)
+- **Kaggle Kernels**: include links to Kaggle kernels
+
+
 Version 1.0.1
 =============
 
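As a quick illustration of the new DropMissingData transformer listed in the changelog above, here is a minimal sketch; the import path is an assumption based on the 1.0 module layout, and the data is invented.

```python
# Hedged sketch of DropMissingData (added in 1.0.2); import path assumed.
import numpy as np
import pandas as pd

from feature_engine.imputation import DropMissingData  # assumed module

df = pd.DataFrame({
    "age": [25.0, np.nan, 40.0, 31.0],
    "income": [2800.0, 3500.0, np.nan, 4200.0],
})

# with default arguments the transformer drops every row that contains
# at least one missing value in the variables it inspects
dropper = DropMissingData()
print(dropper.fit_transform(df))  # only the fully observed rows remain
```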

examples/creation/CombineWithReferenceFeature.ipynb

Lines changed: 8 additions & 8 deletions
@@ -824,7 +824,7 @@
 "value_pipe = pipe([\n",
 " \n",
 " # Create new features\n",
-" ('substraction', CombineWithReferenceFeature(\n",
+" ('subtraction', CombineWithReferenceFeature(\n",
 " variables_to_combine=['total sulfur dioxide'],\n",
 " reference_variables=['free sulfur dioxide'],\n",
 " operations=['sub'],\n",
@@ -864,7 +864,7 @@
 {
 "data": {
 "text/plain": [
-"Pipeline(steps=[('substraction',\n",
+"Pipeline(steps=[('subtraction',\n",
 " CombineWithReferenceFeature(new_variables_names=['non_free_sulfur_dioxide'],\n",
 " reference_variables=['free sulfur '\n",
 " 'dioxide'],\n",
@@ -920,19 +920,19 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"LogisticRegression Model train accuracy score: 0.7477414871438499\n",
+"Logistic Regression Model train accuracy score: 0.7477414871438499\n",
 "\n",
-"LogisticRegression Model train accuracy score: 0.75\n"
+"Logistic Regression Model test accuracy score: 0.75\n"
 ]
 }
 ],
 "source": [
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model train accuracy score: {}'.format(\n",
 " accuracy_score(y_train, pred_train)))\n",
 "\n",
 "print()\n",
 "\n",
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model test accuracy score: {}'.format(\n",
 " accuracy_score(y_test, pred_test)))"
 ]
 },
@@ -945,7 +945,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"LogisticRegression Model test classification report: \n",
+"Logistic Regression Model test classification report: \n",
 "\n",
 " precision recall f1-score support\n",
 "\n",
@@ -960,7 +960,7 @@
 }
 ],
 "source": [
-"print('LogisticRegression Model test classification report: \\n\\n {}'.format(\n",
+"print('Logistic Regression Model test classification report: \\n\\n {}'.format(\n",
 " classification_report(y_test, pred_test)))"
 ]
 },
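For reference, the renamed 'subtraction' step can be reproduced outside the notebook with a short, self-contained sketch. Only the parameter names that appear in the diff above are used; the toy wine-style data is invented.

```python
# Self-contained sketch of the 'subtraction' pipeline step; data made up.
import pandas as pd
from sklearn.pipeline import Pipeline

from feature_engine.creation import CombineWithReferenceFeature

data = pd.DataFrame({
    "total sulfur dioxide": [34.0, 67.0, 54.0, 41.0],
    "free sulfur dioxide": [11.0, 25.0, 15.0, 9.0],
})

value_pipe = Pipeline([
    ("subtraction", CombineWithReferenceFeature(
        variables_to_combine=["total sulfur dioxide"],
        reference_variables=["free sulfur dioxide"],
        operations=["sub"],
        new_variables_names=["non_free_sulfur_dioxide"],
    )),
])

# the new column holds total minus free sulfur dioxide
print(value_pipe.fit_transform(data))
```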

examples/creation/MathematicalCombination.ipynb

Lines changed: 16 additions & 10 deletions
@@ -578,7 +578,7 @@
 ")\n",
 "\n",
 "\n",
-"# Fit the Mean Combinator on training data\n",
+"# Fit the Combinator to the training data\n",
 "multiple_combinator.fit(data)\n",
 "\n",
 "# Transform the data\n",
@@ -793,11 +793,12 @@
 "metadata": {},
 "source": [
 "We can put all these transformations into a single pipeline:\n",
+"\n",
 "1. Create new variables\n",
 "2. Scale features\n",
-"3. Apply simple LogisticRegression classifier to predict the wine quality range\n",
+"3. Train a Logistic Regression model to predict wine quality\n",
 "\n",
-"See more on how to use Piplines in these **[examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)**"
+"See more on how to use Feature-engine within Scikit-learn Pipelines in these **[examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)**"
 ]
 },
 {
@@ -838,12 +839,12 @@
 "outputs": [],
 "source": [
 "value_pipe = pipe([\n",
-" # Create two new features using the min and max combinators\n",
+"\n",
+" # Create the new features\n",
 " ('math_combinator_mean', MathematicalCombination(variables_to_combine=['fixed acidity', 'volatile acidity'],\n",
 " math_operations=['mean'],\n",
 " new_variables_names=['avg_acidity'])),\n",
 "\n",
-" # Create three new features using the mean and sum combinators\n",
 " ('math_combinator_sum', MathematicalCombination(variables_to_combine=['total sulfur dioxide', 'sulphates'],\n",
 " math_operations=['sum'],\n",
 " new_variables_names=['total_minerals'])),\n",
@@ -910,15 +911,15 @@
 "text": [
 "LogisticRegression Model train accuracy score: 0.744266851980542\n",
 "\n",
-"LogisticRegression Model train accuracy score: 0.75\n"
+"LogisticRegression Model test accuracy score: 0.75\n"
 ]
 }
 ],
 "source": [
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model train accuracy score: {}'.format(\n",
 " accuracy_score(y_train, pred_train)))\n",
 "print()\n",
-"print('LogisticRegression Model train accuracy score: {}'.format(\n",
+"print('Logistic Regression Model test accuracy score: {}'.format(\n",
 " accuracy_score(y_test, pred_test)))"
 ]
 },
@@ -946,7 +947,7 @@
 }
 ],
 "source": [
-"print('LogisticRegression Model test classification report: \\n\\n {}'.format(\n",
+"print('Logistic Regression Model test classification report: \\n\\n {}'.format(\n",
 " classification_report(y_test, pred_test)))"
 ]
 },
@@ -1042,7 +1043,12 @@
 "title_cell": "Table of Contents",
 "title_sidebar": "Contents",
 "toc_cell": false,
-"toc_position": {},
+"toc_position": {
+"height": "calc(100% - 180px)",
+"left": "10px",
+"top": "150px",
+"width": "197.6px"
+},
 "toc_section_display": true,
 "toc_window_display": true
 }
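The two MathematicalCombination steps edited above can likewise be sketched standalone. The parameters mirror the diff; the wine-style values are made up.

```python
# Standalone sketch of the two combination steps from the notebook pipeline.
import pandas as pd
from sklearn.pipeline import Pipeline

from feature_engine.creation import MathematicalCombination

data = pd.DataFrame({
    "fixed acidity": [7.4, 7.8, 6.9, 7.2],
    "volatile acidity": [0.70, 0.88, 0.50, 0.64],
    "total sulfur dioxide": [34.0, 67.0, 54.0, 41.0],
    "sulphates": [0.56, 0.68, 0.58, 0.61],
})

value_pipe = Pipeline([
    # average the two acidity columns into one new feature
    ("math_combinator_mean", MathematicalCombination(
        variables_to_combine=["fixed acidity", "volatile acidity"],
        math_operations=["mean"],
        new_variables_names=["avg_acidity"],
    )),
    # sum the two mineral-related columns into one new feature
    ("math_combinator_sum", MathematicalCombination(
        variables_to_combine=["total sulfur dioxide", "sulphates"],
        math_operations=["sum"],
        new_variables_names=["total_minerals"],
    )),
])

print(value_pipe.fit_transform(data))
```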

feature_engine/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.0.1
+1.0.2

feature_engine/creation/combine_with_reference_feature.py

Lines changed: 5 additions & 5 deletions
@@ -104,11 +104,11 @@ class CombineWithReferenceFeature(BaseEstimator, TransformerMixin):
     Methods
     -------
 
-    fit:
+    fit :
         This transformer does not learn parameters.
-    transform:
+    transform :
         Combine the variables with the mathematical operations.
-    fit_transform:
+    fit_transform :
         Fit to the data, then transform it.
 
     Notes
@@ -219,8 +219,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             If any of the reference variables contain null values and the
             mathematical operation is 'div'.
 
-        Returns:
-        --------
+        Returns
+        -------
         self
         """
 

feature_engine/selection/smart_correlation_selection.py

Lines changed: 16 additions & 0 deletions
@@ -58,6 +58,22 @@ class SmartCorrelatedSelection(BaseSelector):
         Takes values 'raise' and 'ignore'. Whether the missing values should be raised
         as error or ignored when determining correlation.
 
+    selection_method : str, default="missing_values"
+        Takes the values "missing_values", "cardinality", "variance" and
+        "model_performance".
+
+        "missing_values": keeps the feature from the correlated group with the least
+        missing observations.
+
+        "cardinality": keeps the feature from the correlated group with the highest
+        cardinality.
+
+        "variance": keeps the feature from the correlated group with the highest
+        variance.
+
+        "model_performance": trains a machine learning model using the correlated
+        feature group and retains the feature with the highest importance.
+
     estimator : object, default = None
         A Scikit-learn estimator for regression or classification.
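To see how the newly documented selection_method behaves, here is a hedged sketch with synthetic data; the import path is assumed from the file location, and the correlated columns are invented.

```python
# Sketch of the selection_method parameter documented in the diff above.
import numpy as np
import pandas as pd

from feature_engine.selection import SmartCorrelatedSelection  # assumed path

rng = np.random.default_rng(0)
base = rng.normal(size=200)
X = pd.DataFrame({
    "var_a": base,
    "var_b": 2 * base + rng.normal(scale=0.01, size=200),  # near-duplicate of var_a
    "var_c": rng.normal(size=200),                         # uncorrelated
})

# from each group of correlated features, keep the one with the highest variance
sel = SmartCorrelatedSelection(selection_method="variance")
X_t = sel.fit_transform(X)

print(X_t.columns.tolist())  # var_a is dropped; var_b carries more variance
```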

feature_engine/selection/target_mean_selection.py

Lines changed: 14 additions & 20 deletions
@@ -2,7 +2,7 @@
 
 import pandas as pd
 from sklearn.metrics import roc_auc_score, r2_score
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import KFold
 from sklearn.pipeline import Pipeline
 
 from feature_engine.dataframe_checks import (
@@ -78,15 +78,15 @@ class SelectByTargetMeanPerformance(BaseSelector):
         This indicates the metrics score to perform the feature selection.
         The current implementation supports 'roc_auc_score' and 'r2_score'.
 
-    threshold : float, default = 0.5
+    threshold : float, default = None
         The performance threshold above which a feature will be selected.
 
     bins : int, default = 5
         If the dataset contains numerical variables, the number of bins into which
         the values will be sorted.
 
     strategy : str, default = equal_width
-        whether to create the bins for discretisation of numerical variables of
+        whether to create the bins for discretization of numerical variables of
         equal width or equal frequency.
 
     cv : int, default=3
@@ -132,21 +132,9 @@ def __init__(
                 "'scoring'"
             )
 
-        if not isinstance(threshold, (int, float)):
+        if threshold and not isinstance(threshold, (int, float)):
             raise ValueError("threshold can only take integer or float")
 
-        if scoring == "roc_auc_score" and (threshold < 0.5 or threshold > 1):
-            raise ValueError(
-                "roc-auc score should vary between 0.5 and 1. Pick a "
-                "threshold within this interval."
-            )
-
-        if scoring == "r2_score" and (threshold < 0 or threshold > 1):
-            raise ValueError(
-                "r2 score should vary between 0 and 1. Pick a "
-                "threshold within this interval."
-            )
-
         if not isinstance(bins, int):
             raise TypeError("'bins' takes only integers")
 
@@ -195,6 +183,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         # check if df contains na
         _check_contains_na(X, self.variables)
 
+        self.input_shape_ = X.shape
+
         # limit df to variables to smooth code below
         X = X[self.variables].copy()
 
@@ -205,7 +195,7 @@
         )
 
         # obtain cross-validation indices
-        skf = StratifiedKFold(
+        skf = KFold(
             n_splits=self.cv, shuffle=True, random_state=self.random_state
         )
         skf.get_n_splits(X, y)
@@ -245,14 +235,18 @@
             axis=1
         ).to_dict()
 
+        # select features
+        if not self.threshold:
+            threshold = pd.Series(self.feature_performance_).mean()
+        else:
+            threshold = self.threshold
+
         self.features_to_drop_ = [
             f
             for f in self.variables
-            if self.feature_performance_[f] < self.threshold
+            if self.feature_performance_[f] < threshold
         ]
 
-        self.input_shape_ = X.shape
-
         return self
 
     def _make_numerical_pipeline(self):
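The net effect of these changes: threshold is now optional, and when left as None the transformer drops features scoring below the average cross-validated performance across all features; KFold replaces StratifiedKFold, presumably so that continuous targets scored with r2_score also work. A hedged sketch of the new default behaviour, with invented data:

```python
# Sketch of the new threshold=None default in SelectByTargetMeanPerformance:
# features scoring below the mean feature performance are dropped.
import numpy as np
import pandas as pd

from feature_engine.selection import SelectByTargetMeanPerformance

rng = np.random.default_rng(42)
y = pd.Series(rng.integers(0, 2, size=500))
X = pd.DataFrame({
    "informative": 2.0 * y + rng.normal(size=500),  # related to the target
    "noise": rng.normal(size=500),                  # unrelated to the target
})

sel = SelectByTargetMeanPerformance(
    scoring="roc_auc_score",  # supported per the docstring above
    threshold=None,           # new default: mean feature performance is used
    bins=5,
    cv=3,
    random_state=0,
)
sel.fit(X, y)

print(sel.feature_performance_)  # per-feature roc-auc from cross-validation
print(sel.features_to_drop_)     # here: the below-average 'noise' feature
```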
