|
| 1 | +MatchVariables |
| 2 | +============== |
| 3 | + |
| 4 | +API Reference |
| 5 | +------------- |
| 6 | + |
| 7 | +.. autoclass:: feature_engine.preprocessing.MatchVariables |
| 8 | + :members: |
| 9 | + |
| 10 | + |
| 11 | +Example |
| 12 | +------- |
| 13 | + |
| 14 | +MatchVariables() ensures that the columns in the test set are identical to those |
| 15 | +in the train set. |
| 16 | + |
| 17 | +If the test set contains additional columns, they are dropped. Alternatively, if the |
| 18 | +test set lacks columns that were present in the train set, they will be added with a |
| 19 | +value determined by the user, for example np.nan. |
| 20 | + |
| 21 | + |
| 22 | +.. code:: python |
| 23 | +
|
| 24 | + import numpy as np |
| 25 | + import pandas as pd |
| 26 | +
|
| 27 | + from feature_engine.preprocessing import MatchVariables |
| 28 | +
|
| 29 | +
|
| 30 | + # Load dataset |
| 31 | + def load_titanic(): |
| 32 | + data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl') |
| 33 | + data = data.replace('?', np.nan) |
| 34 | + data['cabin'] = data['cabin'].astype(str).str[0] |
| 35 | + data['pclass'] = data['pclass'].astype('O') |
| 36 | + data['age'] = data['age'].astype('float') |
| 37 | + data['fare'] = data['fare'].astype('float') |
| 38 | + data['embarked'].fillna('C', inplace=True) |
| 39 | + data.drop( |
| 40 | + labels=['name', 'ticket', 'boat', 'body', 'home.dest'], |
| 41 | + axis=1, inplace=True, |
| 42 | + ) |
| 43 | + return data |
| 44 | +
|
| 45 | + # load data as pandas dataframe |
| 46 | + data = load_titanic() |
| 47 | +
|
| 48 | + # Split test and train |
| 49 | + train = data.iloc[0:1000, :] |
| 50 | + test = data.iloc[1000:, :] |
| 51 | +
|
| 52 | + # set up the transformer |
| 53 | + match_cols = MatchVariables(missing_values="ignore") |
| 54 | +
|
| 55 | + # learn the variables in the train set |
| 56 | + match_cols.fit(train) |
| 57 | +
|
| 58 | + # the transformer stores the input variables |
| 59 | + match_cols.input_features_ |
| 60 | +
|
| 61 | +
|
| 62 | +.. code:: python |
| 63 | +
|
| 64 | + ['pclass', |
| 65 | + 'survived', |
| 66 | + 'sex', |
| 67 | + 'age', |
| 68 | + 'sibsp', |
| 69 | + 'parch', |
| 70 | + 'fare', |
| 71 | + 'cabin', |
| 72 | + 'embarked'] |
| 73 | +
|
| 74 | +
|
| 75 | +.. code:: python |
| 76 | +
|
| 77 | + # Let's drop some columns in the test set for the demo |
| 78 | + test_t = test.drop(["sex", "age"], axis=1) |
| 79 | +
|
| 80 | + test_t.head() |
| 81 | +
|
| 82 | +.. code:: python |
| 83 | +
|
| 84 | + pclass survived sibsp parch fare cabin embarked |
| 85 | + 1000 3 1 0 0 7.7500 n Q |
| 86 | + 1001 3 1 2 0 23.2500 n Q |
| 87 | + 1002 3 1 2 0 23.2500 n Q |
| 88 | + 1003 3 1 2 0 23.2500 n Q |
| 89 | + 1004 3 1 0 0 7.7875 n Q |
| 90 | +
|
| 91 | +
|
| 92 | +.. code:: python |
| 93 | +
|
| 94 | + # the transformer adds the columns back |
| 95 | + test_tt = match_cols.transform(test_t) |
| 96 | +
|
| 97 | + test_tt.head() |
| 98 | +
|
| 99 | +.. code:: python |
| 100 | +
|
| 101 | + The following variables are added to the DataFrame: ['sex', 'age'] |
| 102 | +
|
| 103 | + pclass survived sex age sibsp parch fare cabin embarked |
| 104 | + 1000 3 1 NaN NaN 0 0 7.7500 n Q |
| 105 | + 1001 3 1 NaN NaN 2 0 23.2500 n Q |
| 106 | + 1002 3 1 NaN NaN 2 0 23.2500 n Q |
| 107 | + 1003 3 1 NaN NaN 2 0 23.2500 n Q |
| 108 | + 1004 3 1 NaN NaN 0 0 7.7875 n Q |
| 109 | +
|
| 110 | +
|
| 111 | +
|
| 112 | +Note how the missing columns were added back to the transformed test set, with |
| 113 | +missing values, in the position (i.e., order) in which they were in the train set. |
| 114 | + |
| 115 | +Similarly, if the test set contained additional columns, those would be removed: |
| 116 | + |
| 117 | +.. code:: python |
| 118 | +
|
| 119 | + # let's add some columns for the demo |
| 120 | + test_t[['var_a', 'var_b']] = 0 |
| 121 | +
|
| 122 | + test_t.head() |
| 123 | +
|
| 124 | +.. code:: python |
| 125 | +
|
| 126 | + pclass survived sibsp parch fare cabin embarked var_a var_b |
| 127 | + 1000 3 1 0 0 7.7500 n Q 0 0 |
| 128 | + 1001 3 1 2 0 23.2500 n Q 0 0 |
| 129 | + 1002 3 1 2 0 23.2500 n Q 0 0 |
| 130 | + 1003 3 1 2 0 23.2500 n Q 0 0 |
| 131 | + 1004 3 1 0 0 7.7875 n Q 0 0 |
| 132 | +
|
| 133 | +
|
| 134 | +.. code:: python |
| 135 | +
|
| 136 | + test_tt = match_cols.transform(test_t) |
| 137 | +
|
| 138 | + test_tt.head() |
| 139 | +
|
| 140 | +.. code:: python |
| 141 | +
|
| 142 | + The following variables are added to the DataFrame: ['age', 'sex'] |
| 143 | + The following variables are dropped from the DataFrame: ['var_a', 'var_b'] |
| 144 | +
|
| 145 | + pclass survived sex age sibsp parch fare cabin embarked |
| 146 | + 1000 3 1 NaN NaN 0 0 7.7500 n Q |
| 147 | + 1001 3 1 NaN NaN 2 0 23.2500 n Q |
| 148 | + 1002 3 1 NaN NaN 2 0 23.2500 n Q |
| 149 | + 1003 3 1 NaN NaN 2 0 23.2500 n Q |
| 150 | + 1004 3 1 NaN NaN 0 0 7.7875 n Q |
| 151 | +
|
| 152 | +
|
| 153 | +Now, the transformer simultaneously added the missing columns with NA as values and |
| 154 | +removed the additional columns from the resulting dataset. |
| 155 | + |
| 156 | +This transformer is useful in "predict then optimize" type of problems. In such cases, |
| 157 | +a machine learning model is trained on a certain dataset, with certain input features. |
| 158 | +Then, test sets are "post-processed" according to the scenarios that we want to model. |
| 159 | +For example, "what would have happened if the customer had received an email campaign?", |
| 160 | +where the variable "receive_campaign" would be changed from 0 to 1. |
| 161 | + |
| 162 | +While creating these modelling datasets, a lot of metadata, e.g., "scenario number", |
| 163 | +"time scenario was generated", etc., could be added to the data. Then we need to pass |
| 164 | +these data over to the model to obtain the modelled prediction. |
| 165 | + |
| 166 | +MatchVariables() provides an easy and elegant way to remove the additional metadata, |
| 167 | +while returning datasets with the input features in the correct order, allowing the |
| 168 | +different scenarios to be modelled directly inside a machine learning pipeline. |
0 commit comments