|
51 | 51 | "execution_count": 1, |
52 | 52 | "metadata": { |
53 | 53 | "ExecuteTime": { |
54 | | - "end_time": "2020-05-30T22:38:40.696778Z", |
55 | | - "start_time": "2020-05-30T22:38:40.692482Z" |
| 54 | + "end_time": "2021-02-07T01:01:39.897817Z", |
| 55 | + "start_time": "2021-02-07T01:01:39.890409Z" |
56 | 56 | } |
57 | 57 | }, |
58 | 58 | "outputs": [], |
59 | 59 | "source": [ |
60 | 60 | "!pip install scikit-uplift xgboost==1.0.2 category_encoders==2.1.0 -U" |
61 | 61 | ] |
62 | 62 | }, |
63 | | - { |
64 | | - "cell_type": "markdown", |
65 | | - "metadata": {}, |
66 | | - "source": [ |
67 | | - "Secondly, load the data:" |
68 | | - ] |
69 | | - }, |
70 | | - { |
71 | | - "cell_type": "code", |
72 | | - "execution_count": 2, |
73 | | - "metadata": { |
74 | | - "ExecuteTime": { |
75 | | - "end_time": "2020-05-30T22:38:40.705782Z", |
76 | | - "start_time": "2020-05-30T22:38:40.701316Z" |
77 | | - } |
78 | | - }, |
79 | | - "outputs": [], |
80 | | - "source": [ |
81 | | - "import urllib.request\n", |
82 | | - "\n", |
83 | | - "\n", |
84 | | - "csv_path = '/content/Hilstorm.csv'\n", |
85 | | - "url = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'\n", |
86 | | - "urllib.request.urlretrieve(url, csv_path)" |
87 | | - ] |
88 | | - }, |
89 | 63 | { |
90 | 64 | "cell_type": "markdown", |
91 | 65 | "metadata": {}, |
|
99 | 73 | }, |
100 | 74 | { |
101 | 75 | "cell_type": "code", |
102 | | - "execution_count": 3, |
| 76 | + "execution_count": 2, |
103 | 77 | "metadata": { |
104 | 78 | "ExecuteTime": { |
105 | | - "end_time": "2020-05-30T22:38:41.739525Z", |
106 | | - "start_time": "2020-05-30T22:38:40.711390Z" |
107 | | - } |
| 79 | + "end_time": "2021-02-07T01:01:42.438253Z", |
| 80 | + "start_time": "2021-02-07T01:01:39.901510Z" |
| 81 | + }, |
| 82 | + "scrolled": true |
108 | 83 | }, |
109 | 84 | "outputs": [ |
110 | 85 | { |
111 | 86 | "name": "stdout", |
112 | 87 | "output_type": "stream", |
113 | 88 | "text": [ |
114 | | - "Shape of the dataset before processing: (64000, 12)\n", |
115 | | - "Shape of the dataset after processing: (42693, 10)\n" |
| 89 | + "Shape of the dataset before processing: (64000, 8)\n", |
| 90 | + "Shape of the dataset after processing: (42693, 8)\n" |
116 | 91 | ] |
117 | 92 | }, |
118 | 93 | { |
|
144 | 119 | " <th>zip_code</th>\n", |
145 | 120 | " <th>newbie</th>\n", |
146 | 121 | " <th>channel</th>\n", |
147 | | - " <th>visit</th>\n", |
148 | | - " <th>treatment</th>\n", |
149 | 122 | " </tr>\n", |
150 | 123 | " </thead>\n", |
151 | 124 | " <tbody>\n", |
|
159 | 132 | " <td>Surburban</td>\n", |
160 | 133 | " <td>0</td>\n", |
161 | 134 | " <td>Phone</td>\n", |
162 | | - " <td>0</td>\n", |
163 | | - " <td>1</td>\n", |
164 | 135 | " </tr>\n", |
165 | 136 | " <tr>\n", |
166 | 137 | " <th>1</th>\n", |
|
172 | 143 | " <td>Rural</td>\n", |
173 | 144 | " <td>1</td>\n", |
174 | 145 | " <td>Web</td>\n", |
175 | | - " <td>0</td>\n", |
176 | | - " <td>0</td>\n", |
177 | 146 | " </tr>\n", |
178 | 147 | " <tr>\n", |
179 | 148 | " <th>2</th>\n", |
|
185 | 154 | " <td>Surburban</td>\n", |
186 | 155 | " <td>1</td>\n", |
187 | 156 | " <td>Web</td>\n", |
188 | | - " <td>0</td>\n", |
189 | | - " <td>1</td>\n", |
190 | 157 | " </tr>\n", |
191 | 158 | " <tr>\n", |
192 | 159 | " <th>4</th>\n", |
|
198 | 165 | " <td>Urban</td>\n", |
199 | 166 | " <td>0</td>\n", |
200 | 167 | " <td>Web</td>\n", |
201 | | - " <td>0</td>\n", |
202 | | - " <td>1</td>\n", |
203 | 168 | " </tr>\n", |
204 | 169 | " <tr>\n", |
205 | 170 | " <th>5</th>\n", |
|
211 | 176 | " <td>Surburban</td>\n", |
212 | 177 | " <td>0</td>\n", |
213 | 178 | " <td>Phone</td>\n", |
214 | | - " <td>1</td>\n", |
215 | | - " <td>1</td>\n", |
216 | 179 | " </tr>\n", |
217 | 180 | " </tbody>\n", |
218 | 181 | "</table>\n", |
219 | 182 | "</div>" |
220 | 183 | ], |
221 | 184 | "text/plain": [ |
222 | | - " recency history_segment history mens womens zip_code newbie channel \\\n", |
223 | | - "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone \n", |
224 | | - "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web \n", |
225 | | - "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web \n", |
226 | | - "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web \n", |
227 | | - "5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone \n", |
228 | | - "\n", |
229 | | - " visit treatment \n", |
230 | | - "0 0 1 \n", |
231 | | - "1 0 0 \n", |
232 | | - "2 0 1 \n", |
233 | | - "4 0 1 \n", |
234 | | - "5 1 1 " |
| 185 | + " recency history_segment history mens womens zip_code newbie channel\n", |
| 186 | + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone\n", |
| 187 | + "1 6 3) $200 - $350 329.08 1 1 Rural 1 Web\n", |
| 188 | + "2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web\n", |
| 189 | + "4 2 1) $0 - $100 45.34 1 0 Urban 0 Web\n", |
| 190 | + "5 6 2) $100 - $200 134.83 0 1 Surburban 0 Phone" |
235 | 191 | ] |
236 | 192 | }, |
237 | | - "execution_count": 3, |
| 193 | + "execution_count": 2, |
238 | 194 | "metadata": {}, |
239 | 195 | "output_type": "execute_result" |
240 | 196 | } |
241 | 197 | ], |
242 | 198 | "source": [ |
243 | 199 | "import pandas as pd\n", |
| 200 | + "from sklift.datasets import fetch_hillstrom\n", |
244 | 201 | "\n", |
245 | 202 | "\n", |
246 | 203 | "%matplotlib inline\n", |
247 | 204 | "\n", |
248 | | - "dataset = pd.read_csv(csv_path)\n", |
| 205 | + "bunch = fetch_hillstrom(target_col='visit')\n", |
| 206 | + "\n", |
| 207 | + "dataset, target, treatment = bunch['data'], bunch['target'], bunch['treatment']\n", |
| 208 | + "\n", |
249 | 209 | "print(f'Shape of the dataset before processing: {dataset.shape}')\n", |
250 | | - "dataset = dataset[dataset['segment']!='Mens E-Mail']\n", |
251 | | - "dataset.loc[:, 'treatment'] = dataset['segment'].map({\n", |
| 210 | + "\n", |
| 211 | + "# Selecting two segments\n", |
| 212 | + "dataset = dataset[treatment!='Mens E-Mail']\n", |
| 213 | + "target = target[treatment!='Mens E-Mail']\n", |
| 214 | + "treatment = treatment[treatment!='Mens E-Mail'].map({\n", |
252 | 215 | " 'Womens E-Mail': 1,\n", |
253 | 216 | " 'No E-Mail': 0\n", |
254 | 217 | "})\n", |
255 | 218 | "\n", |
256 | | - "dataset = dataset.drop(['segment', 'conversion', 'spend'], axis=1)\n", |
257 | 219 | "print(f'Shape of the dataset after processing: {dataset.shape}')\n", |
258 | 220 | "dataset.head()" |
259 | 221 | ] |
|
267 | 229 | }, |
268 | 230 | { |
269 | 231 | "cell_type": "code", |
270 | | - "execution_count": 4, |
| 232 | + "execution_count": 3, |
271 | 233 | "metadata": { |
272 | 234 | "ExecuteTime": { |
273 | | - "end_time": "2020-05-30T22:38:42.307545Z", |
274 | | - "start_time": "2020-05-30T22:38:41.743319Z" |
| 235 | + "end_time": "2021-02-07T01:01:42.579775Z", |
| 236 | + "start_time": "2021-02-07T01:01:42.442595Z" |
275 | 237 | } |
276 | 238 | }, |
277 | 239 | "outputs": [], |
278 | 240 | "source": [ |
279 | 241 | "from sklearn.model_selection import train_test_split\n", |
280 | 242 | "\n", |
281 | 243 | "\n", |
282 | | - "Xyt_tr, Xyt_val = train_test_split(dataset, test_size=0.5, random_state=42)\n", |
283 | | - "\n", |
284 | | - "X_tr = Xyt_tr.drop(['visit', 'treatment'], axis=1)\n", |
285 | | - "y_tr = Xyt_tr['visit']\n", |
286 | | - "treat_tr = Xyt_tr['treatment']\n", |
287 | | - "\n", |
288 | | - "X_val = Xyt_val.drop(['visit', 'treatment'], axis=1)\n", |
289 | | - "y_val = Xyt_val['visit']\n", |
290 | | - "treat_val = Xyt_val['treatment']" |
| 244 | + "X_tr, X_val, y_tr, y_val, treat_tr, treat_val = train_test_split(\n", |
| 245 | + " dataset, target, treatment, test_size=0.5, random_state=42\n", |
| 246 | + ")" |
291 | 247 | ] |
292 | 248 | }, |
293 | 249 | { |
|
299 | 255 | }, |
300 | 256 | { |
301 | 257 | "cell_type": "code", |
302 | | - "execution_count": 5, |
| 258 | + "execution_count": 4, |
303 | 259 | "metadata": { |
304 | 260 | "ExecuteTime": { |
305 | | - "end_time": "2020-05-30T22:38:42.330862Z", |
306 | | - "start_time": "2020-05-30T22:38:42.310277Z" |
| 261 | + "end_time": "2021-02-07T01:01:42.600915Z", |
| 262 | + "start_time": "2021-02-07T01:01:42.585066Z" |
307 | 263 | } |
308 | 264 | }, |
309 | 265 | "outputs": [ |
|
329 | 285 | }, |
330 | 286 | { |
331 | 287 | "cell_type": "code", |
332 | | - "execution_count": 6, |
| 288 | + "execution_count": 5, |
333 | 289 | "metadata": { |
334 | 290 | "ExecuteTime": { |
335 | | - "end_time": "2020-05-30T22:38:42.430704Z", |
336 | | - "start_time": "2020-05-30T22:38:42.333721Z" |
| 291 | + "end_time": "2021-02-07T01:01:42.703537Z", |
| 292 | + "start_time": "2021-02-07T01:01:42.603875Z" |
337 | 293 | } |
338 | 294 | }, |
339 | 295 | "outputs": [], |
|
363 | 319 | }, |
364 | 320 | { |
365 | 321 | "cell_type": "code", |
366 | | - "execution_count": 7, |
| 322 | + "execution_count": 6, |
367 | 323 | "metadata": { |
368 | 324 | "ExecuteTime": { |
369 | | - "end_time": "2020-05-30T22:38:43.630594Z", |
370 | | - "start_time": "2020-05-30T22:38:42.433041Z" |
| 325 | + "end_time": "2021-02-07T01:01:44.020040Z", |
| 326 | + "start_time": "2021-02-07T01:01:42.707311Z" |
371 | 327 | } |
372 | 328 | }, |
373 | 329 | "outputs": [ |
|
402 | 358 | }, |
403 | 359 | { |
404 | 360 | "cell_type": "code", |
405 | | - "execution_count": 8, |
| 361 | + "execution_count": 7, |
406 | 362 | "metadata": { |
407 | 363 | "ExecuteTime": { |
408 | | - "end_time": "2020-05-30T22:38:43.777122Z", |
409 | | - "start_time": "2020-05-30T22:38:43.632881Z" |
| 364 | + "end_time": "2021-02-07T01:01:44.184968Z", |
| 365 | + "start_time": "2021-02-07T01:01:44.047865Z" |
410 | 366 | } |
411 | 367 | }, |
412 | 368 | "outputs": [ |
|
0 commit comments