Skip to content

Commit 1a0ccaa

Browse files
committed
Merge branch 'dev' of https://github.com/maks-sh/scikit-uplift into dev
2 parents 8d32fbf + 1811218 commit 1a0ccaa

File tree

4 files changed

+88
-182
lines changed

4 files changed

+88
-182
lines changed
File renamed without changes.

notebooks/pipeline_usage_EN.ipynb

Lines changed: 44 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -51,41 +51,15 @@
5151
"execution_count": 1,
5252
"metadata": {
5353
"ExecuteTime": {
54-
"end_time": "2020-05-30T22:38:40.696778Z",
55-
"start_time": "2020-05-30T22:38:40.692482Z"
54+
"end_time": "2021-02-07T01:01:39.897817Z",
55+
"start_time": "2021-02-07T01:01:39.890409Z"
5656
}
5757
},
5858
"outputs": [],
5959
"source": [
6060
"!pip install scikit-uplift xgboost==1.0.2 category_encoders==2.1.0 -U"
6161
]
6262
},
63-
{
64-
"cell_type": "markdown",
65-
"metadata": {},
66-
"source": [
67-
"Secondly, load the data:"
68-
]
69-
},
70-
{
71-
"cell_type": "code",
72-
"execution_count": 2,
73-
"metadata": {
74-
"ExecuteTime": {
75-
"end_time": "2020-05-30T22:38:40.705782Z",
76-
"start_time": "2020-05-30T22:38:40.701316Z"
77-
}
78-
},
79-
"outputs": [],
80-
"source": [
81-
"import urllib.request\n",
82-
"\n",
83-
"\n",
84-
"csv_path = '/content/Hilstorm.csv'\n",
85-
"url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n",
86-
"urllib.request.urlretrieve(url, csv_path)"
87-
]
88-
},
8963
{
9064
"cell_type": "markdown",
9165
"metadata": {},
@@ -99,20 +73,21 @@
9973
},
10074
{
10175
"cell_type": "code",
102-
"execution_count": 3,
76+
"execution_count": 2,
10377
"metadata": {
10478
"ExecuteTime": {
105-
"end_time": "2020-05-30T22:38:41.739525Z",
106-
"start_time": "2020-05-30T22:38:40.711390Z"
107-
}
79+
"end_time": "2021-02-07T01:01:42.438253Z",
80+
"start_time": "2021-02-07T01:01:39.901510Z"
81+
},
82+
"scrolled": true
10883
},
10984
"outputs": [
11085
{
11186
"name": "stdout",
11287
"output_type": "stream",
11388
"text": [
114-
"Shape of the dataset before processing: (64000, 12)\n",
115-
"Shape of the dataset after processing: (42693, 10)\n"
89+
"Shape of the dataset before processing: (64000, 8)\n",
90+
"Shape of the dataset after processing: (42693, 8)\n"
11691
]
11792
},
11893
{
@@ -144,8 +119,6 @@
144119
" <th>zip_code</th>\n",
145120
" <th>newbie</th>\n",
146121
" <th>channel</th>\n",
147-
" <th>visit</th>\n",
148-
" <th>treatment</th>\n",
149122
" </tr>\n",
150123
" </thead>\n",
151124
" <tbody>\n",
@@ -159,8 +132,6 @@
159132
" <td>Surburban</td>\n",
160133
" <td>0</td>\n",
161134
" <td>Phone</td>\n",
162-
" <td>0</td>\n",
163-
" <td>1</td>\n",
164135
" </tr>\n",
165136
" <tr>\n",
166137
" <th>1</th>\n",
@@ -172,8 +143,6 @@
172143
" <td>Rural</td>\n",
173144
" <td>1</td>\n",
174145
" <td>Web</td>\n",
175-
" <td>0</td>\n",
176-
" <td>0</td>\n",
177146
" </tr>\n",
178147
" <tr>\n",
179148
" <th>2</th>\n",
@@ -185,8 +154,6 @@
185154
" <td>Surburban</td>\n",
186155
" <td>1</td>\n",
187156
" <td>Web</td>\n",
188-
" <td>0</td>\n",
189-
" <td>1</td>\n",
190157
" </tr>\n",
191158
" <tr>\n",
192159
" <th>4</th>\n",
@@ -198,8 +165,6 @@
198165
" <td>Urban</td>\n",
199166
" <td>0</td>\n",
200167
" <td>Web</td>\n",
201-
" <td>0</td>\n",
202-
" <td>1</td>\n",
203168
" </tr>\n",
204169
" <tr>\n",
205170
" <th>5</th>\n",
@@ -211,49 +176,46 @@
211176
" <td>Surburban</td>\n",
212177
" <td>0</td>\n",
213178
" <td>Phone</td>\n",
214-
" <td>1</td>\n",
215-
" <td>1</td>\n",
216179
" </tr>\n",
217180
" </tbody>\n",
218181
"</table>\n",
219182
"</div>"
220183
],
221184
"text/plain": [
222-
" recency history_segment history mens womens zip_code newbie channel \\\n",
223-
"0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n",
224-
"1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n",
225-
"2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n",
226-
"4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n",
227-
"5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone \n",
228-
"\n",
229-
" visit treatment \n",
230-
"0 0 1 \n",
231-
"1 0 0 \n",
232-
"2 0 1 \n",
233-
"4 0 1 \n",
234-
"5 1 1 "
185+
" recency history_segment history mens womens zip_code newbie channel\n",
186+
"0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone\n",
187+
"1 6 3) $200 - $350 329.08 1 1 Rural 1 Web\n",
188+
"2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web\n",
189+
"4 2 1) $0 - $100 45.34 1 0 Urban 0 Web\n",
190+
"5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone"
235191
]
236192
},
237-
"execution_count": 3,
193+
"execution_count": 2,
238194
"metadata": {},
239195
"output_type": "execute_result"
240196
}
241197
],
242198
"source": [
243199
"import pandas as pd\n",
200+
"from sklift.datasets import fetch_hillstrom\n",
244201
"\n",
245202
"\n",
246203
"%matplotlib inline\n",
247204
"\n",
248-
"dataset = pd.read_csv(csv_path)\n",
205+
"bunch = fetch_hillstrom(target_col='visit')\n",
206+
"\n",
207+
"dataset, target, treatment = bunch['data'], bunch['target'], bunch['treatment']\n",
208+
"\n",
249209
"print(f'Shape of the dataset before processing: {dataset.shape}')\n",
250-
"dataset = dataset[dataset['segment']!='Mens E-Mail']\n",
251-
"dataset.loc[:, 'treatment'] = dataset['segment'].map({\n",
210+
"\n",
211+
"# Selecting two segments\n",
212+
"dataset = dataset[treatment!='Mens E-Mail']\n",
213+
"target = target[treatment!='Mens E-Mail']\n",
214+
"treatment = treatment[treatment!='Mens E-Mail'].map({\n",
252215
" 'Womens E-Mail': 1,\n",
253216
" 'No E-Mail': 0\n",
254217
"})\n",
255218
"\n",
256-
"dataset = dataset.drop(['segment', 'conversion', 'spend'], axis=1)\n",
257219
"print(f'Shape of the dataset after processing: {dataset.shape}')\n",
258220
"dataset.head()"
259221
]
@@ -267,27 +229,21 @@
267229
},
268230
{
269231
"cell_type": "code",
270-
"execution_count": 4,
232+
"execution_count": 3,
271233
"metadata": {
272234
"ExecuteTime": {
273-
"end_time": "2020-05-30T22:38:42.307545Z",
274-
"start_time": "2020-05-30T22:38:41.743319Z"
235+
"end_time": "2021-02-07T01:01:42.579775Z",
236+
"start_time": "2021-02-07T01:01:42.442595Z"
275237
}
276238
},
277239
"outputs": [],
278240
"source": [
279241
"from sklearn.model_selection import train_test_split\n",
280242
"\n",
281243
"\n",
282-
"Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n",
283-
"\n",
284-
"X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n",
285-
"y_tr = Xyt_tr['visit']\n",
286-
"treat_tr = Xyt_tr['treatment']\n",
287-
"\n",
288-
"X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n",
289-
"y_val = Xyt_val['visit']\n",
290-
"treat_val = Xyt_val['treatment']"
244+
"X_tr, X_val, y_tr, y_val, treat_tr, treat_val = train_test_split(\n",
245+
" dataset, target, treatment, test_size=0.5, random_state=42\n",
246+
")"
291247
]
292248
},
293249
{
@@ -299,11 +255,11 @@
299255
},
300256
{
301257
"cell_type": "code",
302-
"execution_count": 5,
258+
"execution_count": 4,
303259
"metadata": {
304260
"ExecuteTime": {
305-
"end_time": "2020-05-30T22:38:42.330862Z",
306-
"start_time": "2020-05-30T22:38:42.310277Z"
261+
"end_time": "2021-02-07T01:01:42.600915Z",
262+
"start_time": "2021-02-07T01:01:42.585066Z"
307263
}
308264
},
309265
"outputs": [
@@ -329,11 +285,11 @@
329285
},
330286
{
331287
"cell_type": "code",
332-
"execution_count": 6,
288+
"execution_count": 5,
333289
"metadata": {
334290
"ExecuteTime": {
335-
"end_time": "2020-05-30T22:38:42.430704Z",
336-
"start_time": "2020-05-30T22:38:42.333721Z"
291+
"end_time": "2021-02-07T01:01:42.703537Z",
292+
"start_time": "2021-02-07T01:01:42.603875Z"
337293
}
338294
},
339295
"outputs": [],
@@ -363,11 +319,11 @@
363319
},
364320
{
365321
"cell_type": "code",
366-
"execution_count": 7,
322+
"execution_count": 6,
367323
"metadata": {
368324
"ExecuteTime": {
369-
"end_time": "2020-05-30T22:38:43.630594Z",
370-
"start_time": "2020-05-30T22:38:42.433041Z"
325+
"end_time": "2021-02-07T01:01:44.020040Z",
326+
"start_time": "2021-02-07T01:01:42.707311Z"
371327
}
372328
},
373329
"outputs": [
@@ -402,11 +358,11 @@
402358
},
403359
{
404360
"cell_type": "code",
405-
"execution_count": 8,
361+
"execution_count": 7,
406362
"metadata": {
407363
"ExecuteTime": {
408-
"end_time": "2020-05-30T22:38:43.777122Z",
409-
"start_time": "2020-05-30T22:38:43.632881Z"
364+
"end_time": "2021-02-07T01:01:44.184968Z",
365+
"start_time": "2021-02-07T01:01:44.047865Z"
410366
}
411367
},
412368
"outputs": [

0 commit comments

Comments
 (0)