diff --git a/notebooks/EDA_Hillstrom_dataset_tutorial.ipynb b/notebooks/EDA_Hillstrom_dataset_tutorial.ipynb new file mode 100644 index 0000000..d6538b8 --- /dev/null +++ b/notebooks/EDA_Hillstrom_dataset_tutorial.ipynb @@ -0,0 +1,1938 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c553a794", + "metadata": {}, + "source": [ + "# 🎯 EDA Hillstrom `tutorial`\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " SCIKIT-UPLIFT REPO | \n", + " SCIKIT-UPLIFT DOCS | \n", + " USER GUIDE\n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "5e223c64", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# install uplift library scikit-uplift and other libraries \n", + "!{sys.executable} -m pip install scikit-uplift dill catboost" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "05decfa4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\anatoly\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->scikit-uplift) (2021.5.30)\n", + "Requirement already satisfied: chardet<5,>=3.0.2 in c:\\users\\anatoly\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->scikit-uplift) (4.0.0)\n" + ] + } + ], + "source": [ + "from matplotlib import pyplot as plt\n", + "from sklift.metrics import uplift_at_k\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "57c436ed", + "metadata": {}, + "source": [ + "# 📝 Load dataset" + ] + }, + { + "cell_type": "markdown", + "id": "02983e22", + "metadata": {}, + "source": [ + "We are going to use the Hillstrom dataset from MineThatData, [hosted](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html) in March 2008 by the company's president, Kevin Hillstrom.\n", + "\n", + "MineThatData is a consulting company that helps CEOs understand the complex relationship between Customers, Advertising, Products, Brands, and Channels." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e97e5c2e", + "metadata": {}, + "source": [ + "### Data description\n", + "\n", + "✏️ The dataset can be loaded from the `sklift.datasets` module using the `fetch_hillstrom` function.\n", + "\n", + "Read more about the dataset in the [API docs](https://www.uplift-modeling.com/en/latest/api/datasets/fetch_hillstrom.html).\n", + "\n", + "This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test.\n", + "\n", + "#### ✏️ Major columns:\n", + "\n", + "- `conversion` - (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.\n", + "- `visit` - (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks.\n", + "- `spend` - (float): target. Actual dollars spent in the following two weeks.\n", + "- `segment` - (str): treatment. The e-mail campaign the customer received.\n", + "\n", + "Read more in the [docs](https://www.uplift-modeling.com/en/latest/api/datasets/fetch_hillstrom.html#hillstrom).\n", + "\n", + "There are 3 possible targets (visit/conversion/spend); which one to use depends on the customer's task. 
We should pass the desired target as an argument when loading the dataset from the sklift library." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c9882f49", + "metadata": {}, + "outputs": [], + "source": [ + "from sklift.datasets import fetch_hillstrom\n", + "\n", + "# returns sklearn Bunch object\n", + "# with data, target, treatment keys\n", + "# data features (pd.DataFrame), target (pd.Series), treatment (pd.Series) values \n", + "dataset = fetch_hillstrom()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1bdea302", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset type: \n", + "\n", + "Dataset features shape: (64000, 8)\n", + "Dataset target shape: (64000,)\n", + "Dataset treatment shape: (64000,)\n" + ] + } + ], + "source": [ + "print(f\"Dataset type: {type(dataset)}\\n\")\n", + "print(f\"Dataset features shape: {dataset.data.shape}\")\n", + "print(f\"Dataset target shape: {dataset.target.shape}\")\n", + "print(f\"Dataset treatment shape: {dataset.treatment.shape}\")" + ] + }, + { + "cell_type": "markdown", + "id": "517ed54f", + "metadata": {}, + "source": [ + "# 📝 EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f468bf6b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
recencyhistory_segmenthistorymenswomenszip_codenewbiechannel
0102) $100 - $200142.4410Surburban0Phone
163) $200 - $350329.0811Rural1Web
272) $100 - $200180.6501Surburban1Web
395) $500 - $750675.8310Rural1Web
421) $0 - $10045.3410Urban0Web
63995102) $100 - $200105.5410Urban0Web
6399651) $0 - $10038.9101Urban1Phone
6399761) $0 - $10029.9910Urban1Phone
6399815) $500 - $750552.9410Surburban1Multichannel
6399914) $350 - $500472.8201Surburban0Web
\n", + "
" + ], + "text/plain": [ + " recency history_segment history mens womens zip_code newbie \\\n", + "0 10 2) $100 - $200 142.44 1 0 Surburban 0 \n", + "1 6 3) $200 - $350 329.08 1 1 Rural 1 \n", + "2 7 2) $100 - $200 180.65 0 1 Surburban 1 \n", + "3 9 5) $500 - $750 675.83 1 0 Rural 1 \n", + "4 2 1) $0 - $100 45.34 1 0 Urban 0 \n", + "63995 10 2) $100 - $200 105.54 1 0 Urban 0 \n", + "63996 5 1) $0 - $100 38.91 0 1 Urban 1 \n", + "63997 6 1) $0 - $100 29.99 1 0 Urban 1 \n", + "63998 1 5) $500 - $750 552.94 1 0 Surburban 1 \n", + "63999 1 4) $350 - $500 472.82 0 1 Surburban 0 \n", + "\n", + " channel \n", + "0 Phone \n", + "1 Web \n", + "2 Web \n", + "3 Web \n", + "4 Web \n", + "63995 Web \n", + "63996 Phone \n", + "63997 Phone \n", + "63998 Multichannel \n", + "63999 Web " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.head().append(dataset.data.tail())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e96da0a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 64000 entries, 0 to 63999\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 recency 64000 non-null int64 \n", + " 1 history_segment 64000 non-null object \n", + " 2 history 64000 non-null float64\n", + " 3 mens 64000 non-null int64 \n", + " 4 womens 64000 non-null int64 \n", + " 5 zip_code 64000 non-null object \n", + " 6 newbie 64000 non-null int64 \n", + " 7 channel 64000 non-null object \n", + "dtypes: float64(1), int64(4), object(3)\n", + "memory usage: 3.9+ MB\n" + ] + } + ], + "source": [ + "#info about types and null cells in dataset\n", + "dataset.data.info()" + ] + }, + { + "cell_type": "markdown", + "id": "d7e509f1", + "metadata": {}, + "source": [ + "# 📄 Categorical data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9dd7253f", + 
"metadata": {}, + "outputs": [], + "source": [ + "cat_features = ['channel', 'zip_code', 'history_segment', 'newbie']" + ] + }, + { + "cell_type": "markdown", + "id": "443621e7", + "metadata": {}, + "source": [ + "**Channel**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cbd0fcd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Phone', 'Web', 'Multichannel'], dtype=object)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.channel.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a638b2fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAEvCAYAAACnuq2HAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWCUlEQVR4nO3df7DddZ3f8eeLZGEZVgTE3qGENahpdwIoYhZwddpbnYWAOwW7rgOlEpSa7QBWdzPtZnc7gwtrx20XtmVH6caSCh3Lj110yEpcmjJcXWtBQCgBlCGDYUk2QjUIBHfU4Lt/fL9Xj+GGnNx78/3ee8/zMXPmnvM+3+/5vs/9Jud1v5/vj5OqQpI02g7quwFJUv8MA0mSYSBJMgwkSRgGkiQMA0kSsLjvBqbr6KOPrqVLl/bdxgHz4osvcthhh/XdhqbBdTe/LfT1d//993+nql67Z33ehsHSpUu57777+m7jgJmYmGB8fLzvNjQNrrv5baGvvyRPTlV3mEiSZBhIkgwDSRKGgSQJw0CShGEgScIwkCRhGEiSmMcnnXVt6drbO13empN2c1FHy9z6iXd3shxJc5dbBpIktwy08C3krTpwy06zwy0DSZJhIEkyDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkMUQYJDkuyV1JHk3ySJKPtPWPJdme5MH2dvbAPL+bZEuSx5KcOVBf2da2JFk7UD8+yT1t/eYkB8/2G5Uk7d0wWwa7gTVVtRw4Hbg0yfL2uT+pqpPb20aA9rnzgBOAlcCnkixKsgj4JHAWsBw4f+B1/qh9rTcCzwIXz9L7kyQNYZ9hUFU7qurr7f0XgG8Ax77CLOcAN1XVD6rqW8AW4NT2tqWqnqiqHwI3AeckCfBO4C/a+a8Hzp3m+5EkTcN+7TNIshR4C3BPW7osyUNJ1ic5sq0dCzw1MNu2tra3+muA71XV7j3qkqSOLB52wiS/ANwKfLSqnk9yLXAlUO3Pq4APHpAuf9rDamA1wNjYGBMTEwdycT9jzUm79z3RLBo7tLtldvl77MNCXnew8Ndf13bt2jWSv9OhwiDJz9EEwWer6nMAVfX0wPOfBr7QPtwOHDcw+5K2xl
7q3wWOSLK43ToYnP5nVNU6YB3AihUranx8fJj2Z8VFa2/vbFnQfJhctXnorJ6RrReMd7KcvizkdQcLf/11bWJigi4/W+aKYY4mCnAd8I2qunqgfszAZO8BHm7vbwDOS3JIkuOBZcDXgHuBZe2RQwfT7GTeUFUF3AW8t51/FXDbzN6WJGl/DPPny9uB9wObkzzY1n6P5migk2mGibYCvwlQVY8kuQV4lOZIpEur6iWAJJcBdwCLgPVV9Uj7er8D3JTkD4EHaMJHktSRfYZBVX0FyBRPbXyFeT4OfHyK+sap5quqJ2iONpIk9cAzkCVJhoEkyTCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIYIgySHJfkriSPJnkkyUfa+lFJNiV5vP15ZFtPkmuSbEnyUJJTBl5rVTv940lWDdTfmmRzO881SXIg3qwkaWrDbBnsBtZU1XLgdODSJMuBtcCdVbUMuLN9DHAWsKy9rQauhSY8gMuB04BTgcsnA6Sd5kMD862c+VuTJA1rn2FQVTuq6uvt/ReAbwDHAucA17eTXQ+c294/B7ihGncDRyQ5BjgT2FRVO6vqWWATsLJ97vCquruqCrhh4LUkSR3Yr30GSZYCbwHuAcaqakf71LeBsfb+scBTA7Nta2uvVN82RV2S1JHFw06Y5BeAW4GPVtXzg8P6VVVJ6gD0t2cPq2mGnhgbG2NiYuJAL/In1py0u7NlAYwd2t0yu/w99mEhrztY+Ouva7t27RrJ3+lQYZDk52iC4LNV9bm2/HSSY6pqRzvU80xb3w4cNzD7kra2HRjfoz7R1pdMMf3LVNU6YB3AihUranx8fKrJDoiL1t7e2bKg+TC5avPQWT0jWy8Y72Q5fVnI6w4W/vrr2sTEBF1+tswVwxxNFOA64BtVdfXAUxuAySOCVgG3DdQvbI8qOh14rh1OugM4I8mR7Y7jM4A72ueeT3J6u6wLB15LktSBYf58eTvwfmBzkgfb2u8BnwBuSXIx8CTwvva5jcDZwBbg+8AHAKpqZ5IrgXvb6a6oqp3t/UuAzwCHAl9sb5KkjuwzDKrqK8Dejvt/1xTTF3DpXl5rPbB+ivp9wIn76kWSdGB4BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kSQ4RBkvVJnkny8EDtY0m2J3mwvZ098NzvJtmS5LEkZw7UV7a1LUnWDtSPT3JPW785ycGz+QYlSfs2zJbBZ4CVU9T/pKpObm8bAZIsB84DTmjn+VSSRUkWAZ8EzgKWA+e30wL8UftabwSeBS6eyRuSJO2/fYZBVX0Z2Dnk650D3FRVP6iqbwFbgFPb25aqeqKqfgjcBJyTJMA7gb9o578eOHf/3oIkaaYWz2Dey5JcCNwHrKmqZ4FjgbsHptnW1gCe2qN+GvAa4HtVtXuK6V8myWpgNcDY2BgTExMzaH//rDlp974nmkVjh3a3zC5/j31YyOsOFv7669quXbtG8nc63TC4FrgSqPbnVcAHZ6upvamqdcA6gBUrVtT4+PiBXuRPXLT29s6WBc2HyVWbZ5LVw9t6wXgny+nLQl53sPDXX9cmJibo8rNlrpjWv9iqenryfpJPA19oH24HjhuYdElbYy/17wJHJFncbh0MTi9J6si0Di1NcszAw/cAk0cabQDOS3JIkuOBZcDXgHuBZe2RQwfT7GTeUFUF3AW8t51/FXDbdHqSJE3fPrcMktwIjANHJ9kGXA6MJzmZZphoK/CbAFX1SJJbgEeB3cClVfVS+zqXAXcAi4D1VfVIu4jfAW5K8ofAA8B1s/
XmJEnD2WcYVNX5U5T3+oFdVR8HPj5FfSOwcYr6EzRHG0mSeuIZyJIkw0CSZBhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkhgiDJKsT/JMkocHakcl2ZTk8fbnkW09Sa5JsiXJQ0lOGZhnVTv940lWDdTfmmRzO881STLbb1KS9MqG2TL4DLByj9pa4M6qWgbc2T4GOAtY1t5WA9dCEx7A5cBpwKnA5ZMB0k7zoYH59lyWJOkA22cYVNWXgZ17lM8Brm/vXw+cO1C/oRp3A0ckOQY4E9hUVTur6llgE7Cyfe7wqrq7qgq4YeC1JEkdme4+g7Gq2tHe/zYw1t4/FnhqYLptbe2V6tumqEuSOrR4pi9QVZWkZqOZfUmymmb4ibGxMSYmJrpYLABrTtrd2bIAxg7tbpld/h77sJDXHSz89de1Xbt2jeTvdLph8HSSY6pqRzvU80xb3w4cNzDdkra2HRjfoz7R1pdMMf2UqmodsA5gxYoVNT4+vrdJZ91Fa2/vbFnQfJhctXnGWT2UrReMd7KcvizkdQcLf/11bWJigi4/W+aK6Q4TbQAmjwhaBdw2UL+wParodOC5djjpDuCMJEe2O47PAO5on3s+yentUUQXDryWJKkj+/zzJcmNNH/VH51kG81RQZ8AbklyMfAk8L528o3A2cAW4PvABwCqameSK4F72+muqKrJndKX0ByxdCjwxfYmSerQPsOgqs7fy1PvmmLaAi7dy+usB9ZPUb8POHFffUiSDhzPQJYkGQaSJMNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJLELHyfgSQdSEt7uAR5l5c93/qJd3e2rFfiloEkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJDHDMEiyNcnmJA8mua+tHZVkU5LH259HtvUkuSbJliQPJTll4HVWtdM/nmTVzN6SJGl/zcaWwT+pqpOrakX7eC1wZ1UtA+5sHwOcBSxrb6uBa6EJD+By4DTgVODyyQCRJHXjQAwTnQNc396/Hjh3oH5DNe4GjkhyDHAmsKmqdlbVs8AmYOUB6EuStBczDYMC/meS+5OsbmtjVbWjvf9tYKy9fyzw1MC829ra3uqSpI4snuH876iq7Un+HrApyTcHn6yqSlIzXMZPtIGzGmBsbIyJiYnZeul9WnPS7s6WBTB2aHfL7PL32IeFvO7A9TfbRnX9zSgMqmp7+/OZJJ+nGfN/OskxVbWjHQZ6pp18O3DcwOxL2tp2YHyP+sRelrcOWAewYsWKGh8fn2qyA+Kitbd3tixo/jFetXmmWT2crReMd7KcvizkdQeuv9k2qutv2sNESQ5L8qrJ+8AZwMPABmDyiKBVwG3t/Q3Ahe1RRacDz7XDSXcAZyQ5st1xfEZbkyR1ZCbxNwZ8Psnk6/yPqvqrJPcCtyS5GHgSeF87/UbgbGAL8H3gAwBVtTPJlcC97XRXVNXOGfQlSdpP0w6DqnoCePMU9e8C75qiXsCle3mt9cD66fYiSZoZz0CWJBkGkiTDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJYg6FQZKVSR5LsiXJ2r77kaRRMifCIMki4JPAWcBy4Pwky/vtSpJGx5wIA+BUYEtVPVFVPwRuAs7puSdJGhmpqr57IMl7gZVV9S/bx+8HTquqy/aYbjWwun34D4HHOm20W0cD3+m7CU2L625+W+jr73VV9do9i4v76GS6qmodsK7vPrqQ5L6qWtF3H9p/rrv5bVTX31wZJt
oOHDfweElbkyR1YK6Ewb3AsiTHJzkYOA/Y0HNPkjQy5sQwUVXtTnIZcAewCFhfVY/03FbfRmI4bIFy3c1vI7n+5sQOZElSv+bKMJEkqUeGgSTJMJAkGQZzTpLDk7yq7z4kjRZ3IM8RSX4ZWA+8CgjwPeCDVXV/n31peEleByyrqv+V5FBgcVW90Hdf2rskv/1Kz1fV1V310rc5cWipALgOuKSq/hogyTuA/wa8qdeuNJQkH6K5VMpRwBtoTpz8L8C7+uxL++RWeMstgzkiyQNV9ZY9al+vqlP66knDS/IgzQUX75lcj0k2V9VJvTYmDcl9Bj1LckqSU4AvJfmzJONJ/nGSTwETPben4f2gveIuAEkWA/6lNU8k+QdJ7kzycPv4TUn+Xd99dcktg54luesVnq6qemdnzWjakvwHmv08FwIfBi4BHq2q3++zLw0nyZeAfwP82cCW3cNVdWK/nXXHMJBmQZKDgIuBM2gOALgD+K/lf7B5Icm9VfXLg8O1SR6sqpN7bq0z7kCeI5KMAf8e+PtVdVb7TW9vq6rrem5NQ6iqHwOfbm+af76T5A20Q3vtd6zs6LelbrllMEck+SLN0UO/X1VvbsecH3AH5PyQ5O3Ax4DX0fyRFZphvtf32ZeGk+T1NBeo+xXgWeBbwL+oqq199tUlw2COcDN1fkvyTeC3gPuBlybrVfXd3prSfktyGHDQKJ4f4jDR3PFiktfw083U04Hn+m1J++G5qvpi301oepIcAvw6sBRYnASAqrqix7Y6ZRj0LMlHga8C/xa4DXh9kv8NvBb4jR5b0/65K8l/BD4H/GCyWFVf768l7YfbaP74up+B9TdKHCbqWZI/phmn/CXgmzRf9/ll4MaqWshfyr2g7OUQYQ8NnidG7TDSqRgGc0T7dZ8raILhbe3te1W1vNfGpBGQZB3wp1W1ue9e+uIw0dxxKHA48Or29rfAyP7DnG+SvBq4HPhHbelLwBVV5X6f+eEdwEVJvkUzTDR5NNjIXBvMLYOetX+RnAC8ANwD3A3cXVXP9tqY9kuSW4GHgevb0vuBN1fVP+uvKw2rveLsy1TVk1330he3DPr3i8AhwOM0+wu20VzWQPPLG6rq1wce/0F78TrNA1X1ZJJFwBgj+rk4km96LqmqlWmOYzuBZn/BGuDEJDuB/1NVl/faoIb1d0neUVVfgZ+chPZ3PfekISX5MM0w39PAj9tyMUKXkHeYaA5JsgR4O00o/Brwmqo6otemNJQkbwZuoNnfE2AncFFV/d9eG9NQkmwBThvlkwQNg54l+dc0H/6/AvyI5pyDydvm9po3mieSHA5QVc/33YuG1x4a/KtVtbvvXvriMFH/lgJ/DvxWVY3UhbEWEs9gnfeeACaS3M7PnjTo116qG1X1it/Bqnlj5M9gnef+pr0d3N5GjsNE0izwDFbNd24ZSLPjq0lOGuUzWOezJK+luT7YCcDPT9ZH6XIihoE0A0k20xyCuBj4QJInGNEzWOe5zwI30xzF96+AVcD/67WjjjlMJM3A3s5cnTRKZ7DOZ0nur6q3JnloMsAnv2Ok79664paBNDNP0/wl+Uaaa0ldN8qHJ85jP2p/7kjybpprgx3VYz+dc8tAmoEkN9N8kPw1cBbwZFV9pN+utL+S/BrNOjwO+FOai0b+QVVt6LWxDhkG0gwk2Tz5PdXt91Z/rapO6bktab85TCTNzOTwAlW1e/JkM80v7dFEH6I9aXCyXlUf7KunrrllIM1AkpeAFycf0nwvxff56dFEh/fVm4aX5Ks0w0T3Ay9N1qvq1t6a6phhIGnkJXmwqk7uu48+HdR3A5I0B3whydl9N9EntwwkjawkL9CcNBjgMJoTBn/ECA7zGQaSJIeJJCnJe5K8euDxEUnO7bGlzrllIGnkTbUDOckDVfWWnlrqnFsGkjT1Z+FInYdlGEgS3Jfk6iRvaG9X05xzMDIMA0mCDwM/pLmM9c00RxVd2mtHHXOfgSRptMbEJGlQkv
9UVR9N8pc05xv8jKr6pz201QvDQNIo++/tzz/utYs5wDCQNLKqanIn8clV9Z8Hn0vyEeBL3XfVD3cgS1Lzncd7uqjrJvrkloGkkZXkfOCfA8cnGfxWs1cBO/vpqh+GgaRR9lVgB3A0cNVA/QXgoV466omHlkqS3DKQNLoGLmH9sqfwEtaSpFHjloGkkZfkF6eqV9XfdN1LX9wykDTykmweePjzwPHAY1V1Qk8tdc4tA0kjr6pOGnyc5BTgkp7a6YVbBpI0hSSb9wyJhcwtA0kjL8lvDzw8CDgF+Nue2umFYSBJzRnHk3YDtwO39tRLLxwmkiS5ZSBpdO1xPaKX8fsMJGk0vA14CrgRuIfmzOOR5DCRpJGVZBHwq8D5wJto9hXcWFWP9NpYD/w+A0kjq6peqqq/qqpVwOnAFmAiyWU9t9Y5h4kkjbQkhwDvptk6WApcA3y+z5764DCRpJGV5AbgRGAjcFNVPdxzS70xDCSNrCQ/Bl5sHw5+GHoJa0nS6HEHsiTJMJAkGQaSJAwDSRKGgSQJ+P8m/8QcQkF4rAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.channel.value_counts().plot(kind = 'bar', grid=True)" + ] + }, + { + "cell_type": "markdown", + "id": "37991502", + "metadata": {}, + "source": [ + "**Zip code**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "765cd3f1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Surburban', 'Rural', 'Urban'], dtype=object)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.zip_code.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b44a7f47", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAElCAYAAAAGIY7hAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWwUlEQVR4nO3df7DddZ3f8efLRJTqKiB6S4EauqbrxGUEvcW0u9Pe1RGCbjduqxZKJSi7cUaYddt0ZuO2U6jI1G0HmdJV2uyYGnaskdF1yEhcNstya+0U+aFIDCzlinFJFqESRCNdafDdP87nyiHecG9uku/33Hufj5kz+Z73+f54n/uB8zrfH+ecVBWSpKXtBX03IEnqn2EgSTIMJEmGgSQJw0CShGEgSWIOYZDkxUnuSPKNJLuS/NtWPyPJV5NMJflskuNa/UXt/lR7fMXQuj7U6g8kOW+ovqbVppJsPAbPU5L0POayZ/Bj4M1V9XrgLGBNktXA7wHXVtVrgCeAS9v8lwJPtPq1bT6SrAIuAF4HrAE+kWRZkmXAx4HzgVXAhW1eSVJHls82Qw0+lba/3X1huxXwZuCftvoW4ErgemBtmwb4HPD7SdLqW6vqx8C3k0wB57T5pqrqIYAkW9u89z1fXyeffHKtWLFi1ie4UP3oRz/iJS95Sd9taB4cu4VtsY/f3Xff/b2qeuXB9VnDAKC9e78beA2Dd/HfAr5fVQfaLHuAU9v0qcDDAFV1IMmTwCta/fah1Q4v8/BB9TfN1tOKFSu466675tL+gjQ5OcnExETfbWgeHLuFbbGPX5LvzFSfUxhU1TPAWUlOAL4AvPbotTZ3SdYD6wHGxsaYnJzso41O7N+/f1E/v8XMsVvYlur4zSkMplXV95PcBvxd4IQky9vewWnA3jbbXuB0YE+S5cDLgceH6tOGlzlU/eDtbwI2AYyPj9diTu/F/u5kMXPsFralOn5zuZrolW2PgCTHA28F7gduA97ZZlsH3NSmt7X7tMf/rJ132AZc0K42OgNYCdwB3AmsbFcnHcfgJPO2o/DcJElzNJc9g1OALe28wQuAG6vqi0nuA7Y
m+QjwdeCTbf5PAn/YThDvY/DiTlXtSnIjgxPDB4DL2uEnklwO3AIsAzZX1a6j9gwlSbOay9VE9wJnz1B/iGevBhqu/xXwrkOs62rg6hnq24Htc+hXknQM+AlkSZJhIEkyDCRJHOalpUvZio03d7q9DWce4JKOtrn7o2/vZDuSRpd7BpIkw0CSZBhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kSsLzvBqRjbcXGmzvd3oYzD3BJh9vc/dG3d7YtLV6z7hkkOT3JbUnuS7IryQdb/coke5Pc025vG1rmQ0mmkjyQ5Lyh+ppWm0qycah+RpKvtvpnkxx3tJ+oJOnQ5nKY6ACwoapWAauBy5Ksao9dW1Vntdt2gPbYBcDrgDXAJ5IsS7IM+DhwPrAKuHBoPb/X1vUa4Ang0qP0/CRJczBrGFTVI1X1tTb9Q+B+4NTnWWQtsLWqflxV3wamgHPabaqqHqqqp4GtwNokAd4MfK4tvwV4xzyfjyRpHg7rBHKSFcDZwFdb6fIk9ybZnOTEVjsVeHhosT2tdqj6K4DvV9WBg+qSpI7M+QRykpcCnwd+u6p+kOR64Cqg2r/XAO87Jl0+28N6YD3A2NgYk5OTx3Jzz7HhzAOzz3QUjR3f3Ta7/Dv2YTGPHSz+8eva/v37l+TfdE5hkOSFDILg01X1RwBV9ejQ438AfLHd3QucPrT4aa3GIeqPAyckWd72Dobnf46q2gRsAhgfH6+JiYm5tH9UdHl1CAxeTK7Z2c3FXrsvmuhkO31ZzGMHi3/8ujY5OUmXry2jYi5XEwX4JHB/VX1sqH7K0Gy/DnyzTW8DLkjyoiRnACuBO4A7gZXtyqHjGJxk3lZVBdwGvLMtvw646cieliTpcMzl7csvAe8Bdia5p9V+l8HVQGcxOEy0G3g/QFXtSnIjcB+DK5Euq6pnAJJcDtwCLAM2V9Wutr7fAbYm+QjwdQbhI0nqyKxhUFVfATLDQ9ufZ5mrgatnqG+fabmqeojB1UaSpB74dRSSJMNAkmQYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJKYQxgkOT3JbUnuS7IryQdb/aQkO5I82P49sdWT5LokU0nuTfKGoXWta/M/mGTdUP2NSXa2Za5LkmPxZCVJM5vLnsEBYENVrQJWA5clWQVsBG6tqpXAre0+wPnAynZbD1wPg/AArgDeBJwDXDEdIG2e3xxabs2RPzVJ0lzNGgZV9UhVfa1N/xC4HzgVWAtsabNtAd7RptcCN9TA7cAJSU4BzgN2VNW+qnoC2AGsaY+9rKpur6oCbhhalySpA4d1ziDJCuBs4KvAWFU90h76LjDWpk8FHh5abE+rPV99zwx1SVJHls91xiQvBT4P/HZV/WD4sH5VVZI6Bv0d3MN6BoeeGBsbY3Jy8lhv8qc2nHmgs20BjB3f3Ta7/Dv2YTGPHSz+8eva/v37l+TfdE5hkOSFDILg01X1R638aJJTquqRdqjnsVbfC5w+tPhprbYXmDioPtnqp80w/8+oqk3AJoDx8fGamJiYabZj4pKNN3e2LRi8mFyzc85ZfUR2XzTRyXb6spjHDhb/+HVtcnKSLl9bRsVcriYK8Eng/qr62NBD24DpK4LWATcN1S9uVxWtBp5sh5NuAc5NcmI7cXwucEt77AdJVrdtXTy0LklSB+by9uWXgPcAO5Pc02q/C3wUuDHJpcB3gHe3x7YDbwOmgKeA9wJU1b4kVwF3tvk+XFX72vQHgE8BxwNfajdJUkdmDYOq+gpwqOv+3zLD/AVcdoh1bQY2z1C/C/jF2XqRJB0bfgJZkmQYSJIMA0kShoEkCcNAkoRhIEnCMJA
kYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kScwiDJJuTPJbkm0O1K5PsTXJPu71t6LEPJZlK8kCS84bqa1ptKsnGofoZSb7a6p9NctzRfIKSpNnNZc/gU8CaGerXVtVZ7bYdIMkq4ALgdW2ZTyRZlmQZ8HHgfGAVcGGbF+D32rpeAzwBXHokT0iSdPhmDYOq+jKwb47rWwtsraofV9W3gSngnHabqqqHquppYCuwNkmANwOfa8tvAd5xeE9BknSkjuScweVJ7m2HkU5stVOBh4fm2dNqh6q/Avh+VR04qC5J6tDyeS53PXAVUO3fa4D3Ha2mDiXJemA9wNjYGJOTk8d6kz+14cwDs890FI0d3902u/w79mExjx0s/vHr2v79+5fk33ReYVBVj05PJ/kD4Ivt7l7g9KFZT2s1DlF/HDghyfK2dzA8/0zb3QRsAhgfH6+JiYn5tD8vl2y8ubNtweDF5Jqd883qw7P7oolOttOXxTx2sPjHr2uTk5N0+doyKuZ1mCjJKUN3fx2YvtJoG3BBkhclOQNYCdwB3AmsbFcOHcfgJPO2qirgNuCdbfl1wE3z6UmSNH+zvn1J8hlgAjg5yR7gCmAiyVkMDhPtBt4PUFW7ktwI3AccAC6rqmfaei4HbgGWAZuralfbxO8AW5N8BPg68Mmj9eQkSXMzaxhU1YUzlA/5gl1VVwNXz1DfDmyfof4Qg6uNJEk98RPIkiTDQJJkGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSmEMYJNmc5LEk3xyqnZRkR5IH278ntnqSXJdkKsm9Sd4wtMy6Nv+DSdYN1d+YZGdb5rokOdpPUpL0/OayZ/ApYM1BtY3ArVW1Eri13Qc4H1jZbuuB62EQHsAVwJuAc4ArpgOkzfObQ8sdvC1J0jE2axhU1ZeBfQeV1wJb2vQW4B1D9Rtq4HbghCSnAOcBO6pqX1U9AewA1rTHXlZVt1dVATcMrUuS1JH5njMYq6pH2vR3gbE2fSrw8NB8e1rt+ep7ZqhLkjq0/EhXUFWVpI5GM7NJsp7B4SfGxsaYnJzsYrMAbDjzQGfbAhg7vrttdvl37MNiHjtY/OPXtf379y/Jv+l8w+DRJKdU1SPtUM9jrb4XOH1ovtNabS8wcVB9stVPm2H+GVXVJmATwPj4eE1MTBxq1qPuko03d7YtGLyYXLPziLN6TnZfNNHJdvqymMcOFv/4dW1ycpIuX1tGxXwPE20Dpq8IWgfcNFS/uF1VtBp4sh1OugU4N8mJ7cTxucAt7bEfJFndriK6eGhdkqSOzPr2JclnGLyrPznJHgZXBX0UuDHJpcB3gHe32bcDbwOmgKeA9wJU1b4kVwF3tvk+XFXTJ6U/wOCKpeOBL7WbJAGwooc9uy73Jnd/9O2dbev5zBoGVXXhIR56ywzzFnDZIdazGdg8Q/0u4Bdn60OSdOz4CWRJkmEgSTIMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkcYRgk2Z1kZ5J7ktzVaicl2ZHkwfbvia2eJNclmUpyb5I3DK1nXZv/wSTrjuwpSZIO19HYM/iVqjqrqsbb/Y3ArVW1Eri13Qc4H1jZbuuB62EQHsAVwJuAc4ArpgNEktSNY3GYaC2wpU1vAd4xVL+hBm4HTkhyCnAesKOq9lXVE8AOYM0x6EuSdAhHGgYF/EmSu5Osb7WxqnqkTX8XGGvTpwIPDy27p9UOVZckdWT5ES7/y1W1N8mrgB1
J/nz4waqqJHWE2/ipFjjrAcbGxpicnDxaq57VhjMPdLYtgLHju9tml3/HPizmsQPH72hbquN3RGFQVXvbv48l+QKDY/6PJjmlqh5ph4Eea7PvBU4fWvy0VtsLTBxUnzzE9jYBmwDGx8drYmJiptmOiUs23tzZtmDwH+M1O480q+dm90UTnWynL4t57MDxO9qW6vjN+zBRkpck+bnpaeBc4JvANmD6iqB1wE1tehtwcbuqaDXwZDucdAtwbpIT24njc1tNktSRI4m/MeALSabX89+q6o+T3AncmORS4DvAu9v824G3AVPAU8B7AapqX5KrgDvbfB+uqn1H0Jck6TDNOwyq6iHg9TPUHwfeMkO9gMsOsa7NwOb59iJJOjJ+AlmSZBhIkgwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRIjFAZJ1iR5IMlUko199yNJS8lIhEGSZcDHgfOBVcCFSVb125UkLR0jEQbAOcBUVT1UVU8DW4G1PfckSUvGqITBqcDDQ/f3tJokqQOpqr57IMk7gTVV9Rvt/nuAN1XV5QfNtx5Y3+7+AvBAp41262Tge303oXlx7Ba2xT5+r66qVx5cXN5HJzPYC5w+dP+0VnuOqtoEbOqqqT4luauqxvvuQ4fPsVvYlur4jcphojuBlUnOSHIccAGwreeeJGnJGIk9g6o6kORy4BZgGbC5qnb13JYkLRkjEQYAVbUd2N53HyNkSRwOW6Qcu4VtSY7fSJxAliT1a1TOGUiSemQYSJIMA0nSCJ1AFiT5e8AKhsalqm7orSEdFsdPC5lhMCKS/CHw88A9wDOtXIAvJguA47cwJfkhg3H6mYeAqqqXddxSb7yaaEQkuR9YVQ7IguT4aaFzz2B0fBP468AjfTeieXH8FoEkrwJePH2/qv6ix3Y6ZRiMjpOB+5LcAfx4ulhVv9ZfSzoMjt8CluTXgGuAvwE8BrwauB94XZ99dckwGB1X9t2AjsiVfTegI3IVsBr406o6O8mvAP+s55465TkDSUve9DeVJvkGcHZV/STJN6rq9X331hU/ZzAikqxOcmeS/UmeTvJMkh/03ZfmxvFb8L6f5KXAl4FPJ/mPwI967qlThsHo+H3gQuBB4HjgNxj8LrQWBsdvYVsLPAX8c+CPgW8B/7DXjjpmGIyQqpoCllXVM1X1X4E1ffekuXP8FqYky4AvVtVPqupAVW2pquuq6vG+e+uSJ5BHx1Pth33uSfLvGVyiaFgvHI7fAlVVzyT5SZKXV9WTfffTF08gj4gkrwYeBY5jsKv6cuAT7d2mRpzjt7AluQk4G9jB0LmCqvqt3prqmGEwQto7y9cy+Hj8A1X1dM8t6TA4fgtXknUz1atqS9e99MUwGBFJ3g78ZwYnrgKcAby/qr7Ua2OaE8dPC51hMCKS/Dnwq9OHFZL8PHBzVb223840F47fwpbk28zwhXVV9bd6aKcXnkAeHT886PjyQ8AP+2pGh83xW9jGh6ZfDLwLOKmnXnrhnkHPkvyjNvlWBt+HciODdyjvAv6iqj7QV2+aneO3eCW5u6re2HcfXXHPoH/DH2x5FPgHbfr/MPTtiRpZjt8ikOQNQ3dfwGBPYUm9PrpnMALah15+q6qu7bsXHT7Hb+FLctvQ3QPAbuA/VNX/7qej7hkGIyLJHVV1Tt99aH4cv8WlBfwFVfXpvnvpimEwIpJcC7wQ+CzP/dDL13prSnPm+C1MSV4GXAacCtwE/Gm7vwG4t6rW9thepwyDEXHQbuq0qqo3d96MDtvQ+E3/DzX9G7qO3whrnzx+AvhfwFuAVzEYuw9W1T09ttY5w0A6Akn+xfRk+7cYnDz+SlV9u5+uNFdJdlbVmW16GYPvlPqbVfVX/XbWvSV1tnyUJfk3M9Wr6sNd96LD8nMz1F4N/Ks
kV1bV1q4b0mH5f9MT7Qvr9izFIAD3DEZGkg1Dd18M/Cpwf1W9r6eWdASSnMTgJxTfMOvM6k2SZ3j2HE8Y/BbFUzx7mO9lffXWNcNgRCV5EXBLVU303YvmJ8nXq+rsvvuQ5sLvWx9dfw04re8mND/tB9Wf6LsPaa48ZzAikuzk2StRlgGvBDxfMOIOGrdpJwF/CVzcfUfS/HiYaES0H0eZdgB4tKoO9NWP5uagcYNBMDxeVUvqx9S18BkGI6R9P8ovM3hB+UpVfb3nliQtEZ4zGBHt0tItwCuAk4FPJfnX/XYlaalwz2BEJHkAeP30Nc5Jjgfuqapf6LczSUuBewaj4y957lcevwjY21MvkpYYrybqWZL/xOAcwZPAriQ72v23Anf02ZukpcPDRD1Lsu75Hq+qLV31ImnpMgxGQPuCrBuq6qK+e5G0NHnOYARU1TPAq5Mc13cvkpYmzxmMjoeA/5lkG8/9cZSP9deSpKXCMBgd32q3FzDz1yJL0jHjOQNJknsGo6L9bOLPJLM/myipC4bB6PiXQ9MvBv4xgy+sk6RjzsNEIyzJHVV1Tt99SFr83DMYEe1nEqe9ABgHXt5TO5KWGMNgdNzNs+cMDgC7gUt760bSkmIY9CzJ3wEerqoz2v11DM4X7Abu67E1SUuIn0Du338BngZI8veBf8fgdw2eBDb12JekJcQ9g/4tq6p9bfqfAJuq6vPA55Pc019bkpYS9wz6tyzJdCi/BfizoccMa0md8MWmf58B/nuS7wH/F/gfAElew+BQkSQdc37OYAQkWQ2cAvxJVf2o1f428NKq+lqvzUlaEgwDSZLnDCRJhoEkCcNAkoRhIEnCMJAkAf8frqRE+/mA42sAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.zip_code.value_counts().plot(kind = 'bar', grid=True)" + ] + }, + { + "cell_type": "markdown", + "id": "6483e5e8", + "metadata": {}, + "source": [ + "**History segment**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cbbe9e89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2) $100 - $200', '3) $200 - $350', '5) $500 - $750',\n", + " '1) $0 - $100', '6) $750 - $1,000', '4) $350 - $500',\n", + " '7) $1,000 +'], dtype=object)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.history_segment.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2c53d180", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.history_segment.value_counts().plot(kind = 'bar', grid=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3c2aca87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD1CAYAAACyaJl6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAATRElEQVR4nO3dYYxd9Xnn8e8vNqRWslk7gb3y2u4aCe9WTqySZARedV/MEtUY+sJUSiMQCobSuLsxUiNZqzh9sbQQpOQFjYREkFzhxay6cVDaCAucei3KqIpWgKFxbQylzBKz2HJAjQ3UiZbssM++mL93b2ZnPNeemXt97e9HOppzn/M/5z5X9+Dfvef8Z0hVIUm6tH1o0A1IkgbPMJAkGQaSJMNAkoRhIEnCMJAkAYsH3cD5uuKKK2r16tWDbuOi8LOf/YyPfOQjg25Dmpbn5/x68cUX/6GqrpxaH9owWL16NS+88MKg27gojI2NMTo6Oug2pGl5fs6vJG9MV/cykSTJMJAkGQaSJAwDSRKGgSQJw0CSRA9hkORXkjyf5G+THEnyx63+aJIfJznYlmtaPUkeTDKe5FCSz3Qda3OS19qyuav+2SSH2z4PJskCvFZJ0gx6+T2D94Hrq+p0ksuAHyb5Qdv2H6rqe1PG3wisact1wMPAdUk+DtwDjAAFvJhkT1WdamO+BDwH7AU2Aj9AktQXs4ZBTf7fb063h5e15Wz/R5xNwGNtv2eTLE2yHBgF9lfVSYAk+4GNScaAj1XVs63+GHAzF0kYrN7+1KBbmNW2dRPcMQR9Hv3Gbw26Bemi1dNvICdZBLwIXA08VFXPJfn3wP1J/iPwNLC9qt4HVgBvdu1+rNXOVj82TX26PrYAWwA6nQ5jY2O9tD9Q29ZNDLqFWXWWDEefw/B+a/6dPn3a974PegqDqvoAuCbJUuD7ST4FfA34CXA5sAP4KnDvAvV5po8d7bkYGRmpYfgV9WH4xL1t3QQPHL7w/zLJ0dtGB92CBsA/R9Ef5zSbqKreAZ4BNlbViZr0PvCfgGvbsOPAqq7dVrba2eorp6lLkvqkl9lEV7ZvBCRZAvwm8HftPgBt5s/NwEttlz3A7W1W0Xrg3ao6AewDNiRZlmQZsAHY17a9l2R9O9btwBPz+SIlSWfXy7WB5cCudt/gQ8DjVfVkkr9KciUQ4CDw79r4vcBNwDjwc+BOgKo6meQ+4EAbd++Zm8nAl4FHgSVM3ji+KG4eS9Kw6GU20SHg09PUr59hfAFbZ9i2E9g5Tf0F4FOz9SJJWhj+BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRI9hEGSX0nyfJK/TXIkyR+3+lVJnksynuS7SS5v9Q+3x+Nt++quY32t1V9NckNXfWOrjSfZvgCvU5J0Fr18M3gfuL6qfh24BtiYZD3wTeBbVXU1cAq4q42/CzjV6t9q40iyFrgF+CSwEfh2k
kVJFgEPATcCa4Fb21hJUp/MGgY16XR7eFlbCrge+F6r7wJubuub2mPa9s8lSavvrqr3q+rHwDhwbVvGq+r1qvoFsLuNlST1SU/3DNon+IPA28B+4L8D71TVRBtyDFjR1lcAbwK07e8Cn+iuT9lnprokqU8W9zKoqj4ArkmyFPg+8GsL2dRMkmwBtgB0Oh3GxsYG0cY52bZuYvZBA9ZZMhx9DsP7rfl3+vRp3/s+6CkMzqiqd5I8A/xrYGmSxe3T/0rgeBt2HFgFHEuyGPinwE+76md07zNTferz7wB2AIyMjNTo6Oi5tD8Qd2x/atAtzGrbugkeOHxOp8JAHL1tdNAtaADGxsYYhv/Wh10vs4mubN8ISLIE+E3gFeAZ4PNt2Gbgiba+pz2mbf+rqqpWv6XNNroKWAM8DxwA1rTZSZczeZN5zzy8NklSj3r5OLgc2NVm/XwIeLyqnkzyMrA7ydeBHwGPtPGPAP85yThwksl/3KmqI0keB14GJoCt7fITSe4G9gGLgJ1VdWTeXqEkaVazhkFVHQI+PU39dSZnAk2t/0/gd2Y41v3A/dPU9wJ7e+hXkrQA/A1kSZJhIEk6x9lEki4eq4dgphtMznYbhll5R7/xW4NuYU78ZiBJMgwkSYaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiR6CIMkq5I8k+TlJEeS/EGr/1GS40kOtuWmrn2+lmQ8yatJbuiqb2y18STbu+pXJXmu1b+b5PL5fqGSpJn18s1gAthWVWuB9cDWJGvbtm9V1TVt2QvQtt0CfBLYCHw7yaIki4CHgBuBtcCtXcf5ZjvW1cAp4K55en2SpB7MGgZVdaKq/qat/yPwCrDiLLtsAnZX1ftV9WNgHLi2LeNV9XpV/QLYDWxKEuB64Htt/13Azef5eiRJ5+Gc7hkkWQ18Gniule5OcijJziTLWm0F8GbXbsdabab6J4B3qmpiSl2S1CeLex2Y5KPAnwNfqar3kjwM3AdU+/kA8LsL0uX/62ELsAWg0+kwNja2kE83L7atm5h90IB1lgxHn8Pwfg+TYXjPwfOzX3oKgySXMRkEf1ZVfwFQVW91bf9T4Mn28Diwqmv3la3GDPWfAkuTLG7fDrrH/5Kq2gHsABgZGanR0dFe2h+oO7Y/NegWZrVt3QQPHO75c8HAHL1tdNAtXFSG4dwEz89+6WU2UYBHgFeq6k+66su7hv028FJb3wPckuTDSa4C1gDPAweANW3m0OVM3mTeU1UFPAN8vu2/GXhibi9LknQueonb3wC+CBxOcrDV/pDJ2UDXMHmZ6Cjw+wBVdSTJ48DLTM5E2lpVHwAkuRvYBywCdlbVkXa8rwK7k3wd+BGT4SNJ6pNZw6Cqfghkmk17z7LP/cD909T3TrdfVb3O5GwjSdIA+BvIkiTDQJJkGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJHoIgySrkjyT5OUkR5L8Qat/PMn+JK+1n8taPUkeTDKe5FCSz3Qda3Mb/1qSzV31zyY53PZ5MEkW4sVKkqbXyzeDCWBbVa0F1gNbk6wFtgNPV9Ua4On2GOBGYE1btgAPw2R4APcA1wHXAvecCZA25ktd+22c+0uTJPVq1jCoqhNV9Tdt/R+BV4AVwCZgVxu2C7i5rW8CHqtJzwJLkywHbgD2V9XJqjoF7Ac2tm0fq6pnq6qAx7qOJUnqg3O6Z5BkNfBp4DmgU1Un2qafAJ22vgJ4s2u3Y612tvqxaeqSpD5Z3OvAJB8F/hz4SlW9131Zv6oqSS1Af1N72MLkpSc6nQ5jY2ML/ZRztm3dxKBbmFVnyXD0OQzv9zAZhvccPD/7pacwSHIZk0HwZ1X1F638VpLlVXWiXep5u9WPA6u6dl/ZaseB0Sn1sVZfO
c34/09V7QB2AIyMjNTo6Oh0wy4od2x/atAtzGrbugkeONzz54KBOXrb6KBbuKgMw7kJnp/90stsogCPAK9U1Z90bdoDnJkRtBl4oqt+e5tVtB54t11O2gdsSLKs3TjeAOxr295Lsr491+1dx5Ik9UEvcfsbwBeBw0kOttofAt8AHk9yF/AG8IW2bS9wEzAO/By4E6CqTia5DzjQxt1bVSfb+peBR4ElwA/aIknqk1nDoKp+CMw07/9z04wvYOsMx9oJ7Jym/gLwqdl6kSQtDH8DWZJkGEiSDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkiR7CIMnOJG8neamr9kdJjic52JaburZ9Lcl4kleT3NBV39hq40m2d9WvSvJcq383yeXz+QIlSbPr5ZvBo8DGaerfqqpr2rIXIMla4Bbgk22fbydZlGQR8BBwI7AWuLWNBfhmO9bVwCngrrm8IEnSuZs1DKrqr4GTPR5vE7C7qt6vqh8D48C1bRmvqter6hfAbmBTkgDXA99r++8Cbj63lyBJmqu53DO4O8mhdhlpWautAN7sGnOs1WaqfwJ4p6omptQlSX20+Dz3exi4D6j28wHgd+erqZkk2QJsAeh0OoyNjS30U87ZtnUTsw8asM6S4ehzGN7vYTIM7zl4fvbLeYVBVb11Zj3JnwJPtofHgVVdQ1e2GjPUfwosTbK4fTvoHj/d8+4AdgCMjIzU6Ojo+bTfV3dsf2rQLcxq27oJHjh8vp8L+ufobaODbuGiMgznJnh+9st5XSZKsrzr4W8DZ2Ya7QFuSfLhJFcBa4DngQPAmjZz6HImbzLvqaoCngE+3/bfDDxxPj1Jks7frHGb5DvAKHBFkmPAPcBokmuYvEx0FPh9gKo6kuRx4GVgAthaVR+049wN7AMWATur6kh7iq8Cu5N8HfgR8Mh8vThJUm9mDYOqunWa8oz/YFfV/cD909T3Anunqb/O5GwjSdKA+BvIkiTDQJJkGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJHoIgyQ7k7yd5KWu2seT7E/yWvu5rNWT5MEk40kOJflM1z6b2/jXkmzuqn82yeG2z4NJMt8vUpJ0dr18M3gU2Dilth14uqrWAE+3xwA3AmvasgV4GCbDA7gHuA64FrjnTIC0MV/q2m/qc0mSFtisYVBVfw2cnFLeBOxq67uAm7vqj9WkZ4GlSZYDNwD7q+pkVZ0C9gMb27aPVdWzVVXAY13HkiT1yfneM+hU1Ym2/hOg09ZXAG92jTvWamerH5umLknqo8VzPUBVVZKaj2Zmk2QLk5ef6HQ6jI2N9eNp52TbuolBtzCrzpLh6HMY3u9hMgzvOXh+9sv5hsFbSZZX1Yl2qeftVj8OrOoat7LVjgOjU+pjrb5ymvHTqqodwA6AkZGRGh0dnWnoBeOO7U8NuoVZbVs3wQOH5/y5YMEdvW100C1cVIbh3ATPz34538tEe4AzM4I2A0901W9vs4rWA++2y0n7gA1JlrUbxxuAfW3be0nWt1lEt3cdS5LUJ7PGbZLvMPmp/ookx5icFfQN4PEkdwFvAF9ow/cCNwHjwM+BOwGq6mSS+4ADbdy9VXXmpvSXmZyxtAT4QVskSX00axhU1a0zbPrcNGML2DrDcXYCO6epvwB8arY+JEkLx99AliQZBpIkw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJYo5hkORoksNJDiZ5odU+nmR/ktfaz2WtniQPJhlPcijJZ7qOs7mNfy3J5rm9JEnSuZqPbwb/tqquqaqR9ng78HRVrQGebo8BbgTWtGUL8DBMhgdwD
3AdcC1wz5kAkST1x0JcJtoE7Grru4Cbu+qP1aRngaVJlgM3APur6mRVnQL2AxsXoC9J0gzmGgYF/NckLybZ0mqdqjrR1n8CdNr6CuDNrn2PtdpMdUlSnyye4/7/pqqOJ/lnwP4kf9e9saoqSc3xOf6vFjhbADqdDmNjY/N16AWzbd3EoFuYVWfJcPQ5DO/3MBmG9xw8P/tlTmFQVcfbz7eTfJ/Ja/5vJVleVSfaZaC32/DjwKqu3Ve22nFgdEp9bIbn2wHsABgZGanR0dHphl1Q7tj+1KBbmNW2dRM8cHiunwsW3tHbRgfdwkVlGM5N8Pzsl/O+TJTkI0n+yZl1YAPwErAHODMjaDPwRFvfA9zeZhWtB95tl5P2ARuSLGs3jje0miSpT+YStx3g+0nOHOe/VNVfJjkAPJ7kLuAN4Att/F7gJmAc+DlwJ0BVnUxyH3Cgjbu3qk7OoS9J0jk67zCoqteBX5+m/lPgc9PUC9g6w7F2AjvPtxdJ0tz4G8iSJMNAkmQYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkLqAwSLIxyatJxpNsH3Q/knQpuSDCIMki4CHgRmAtcGuStYPtSpIuHRdEGADXAuNV9XpV/QLYDWwacE+SdMlIVQ26B5J8HthYVb/XHn8RuK6q7p4ybguwpT38V8CrfW304nUF8A+DbkKagefn/PoXVXXl1OLiQXRyvqpqB7Bj0H1cbJK8UFUjg+5Dmo7nZ39cKJeJjgOruh6vbDVJUh9cKGFwAFiT5KoklwO3AHsG3JMkXTIuiMtEVTWR5G5gH7AI2FlVRwbc1qXES2+6kHl+9sEFcQNZkjRYF8plIknSABkGkiTDQJJ0gdxAliSAJL/G5F8fWNFKx4E9VfXK4Lq6NPjNQL8kyZ2D7kGXpiRfZfJP0QR4vi0BvuMfr1x4zibSL0nyP6rqVwfdhy49Sf4e+GRV/a8p9cuBI1W1ZjCdXRq8THQJSnJopk1Ap5+9SF3+N/DPgTem1Je3bVpAhsGlqQPcAJyaUg/w3/rfjgTAV4Cnk7wGvNlqvwpcDdw9006aH4bBpelJ4KNVdXDqhiRjfe9GAqrqL5P8Syb/pH33DeQDVfXB4Dq7NHjPQJLkbCJJkmEgScIwkCRhGEiSMAwkScD/AWu0yNgEJWXSAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.womens.value_counts().plot(kind = 'bar', grid=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2e99e8a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD1CAYAAACyaJl6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAATU0lEQVR4nO3dYYxd9Znf8e9vbZy1kqY4gV65trdGituVE2udZASuti+mRGsM+8KslEYgFAxL460C0kZyqzj7hl0IUiKVRUJKkLzCxVTbOCi7ERY461oso1VUAYaNF2NYypSYYouANjawJCrp0Kcv7t/tzeyM59ozc6+v/f1IR3Puc/7n3Ofmnvh37zn/GVJVSJIubr8y7AYkScNnGEiSDANJkmEgScIwkCRhGEiSgKXDbuBcXXbZZbV27dpht3FB+NnPfsaHP/zhYbchzcjzc2E999xzf1dVl0+vj2wYrF27lmeffXbYbVwQJiYmGB8fH3Yb0ow8PxdWktdmqnuZSJJkGEiSDANJEoaBJAnDQJKEYSBJoo8wSPKrSZ5J8jdJjib5o1Z/KMmPkxxuy8ZWT5L7k0wmeT7JZ3qOtS3JK23Z1lP/bJIjbZ/7k2QRXqskaRb9/J7B+8DVVfVekkuAHyb5Qdv2H6rqe9PGXwusa8tVwAPAVUk+BtwJjAEFPJdkX1WdamO+BDwN7Ae2AD9AkjQQc4ZBdf/rN++1h5e05Uz/RZytwMNtv6eSXJpkJTAOHKyqkwBJDgJbkkwAH62qp1r9YeB6LpAwWLvz8WG3MKcdG6a4ZQT6PPaN3x52C9IFq697BkmWJDkMvEX3H/Sn26Z72qWg+5J8qNVWAa/37H681c5UPz5DXZI0IH39OYqq+gDYmORS4PtJPgV8DfgJsAzYBXwVuGuR+gQgyXZgO0Cn02FiYmIxn25B7NgwNewW5tRZPhp9jsL7rYX33nvv+d4PwFn9baKqejvJk8CWqvqPrfx+kv8E/Pv2+ASwpme31a12gu6lot76RKuvnmH8TM+/i27wMDY2VqPw90pG4fLLjg1T3Hvk/P8zVcduGh92CxoC/zbRYPQzm+jy9o2AJMuB3wL+tt0HoM38uR54oe2yD7i5zSraBLxTVW8AB4DNSVYkWQFsBg60be8m2dSOdTPw6EK+SEnSmfXzcXAlsCfJErrh8UhVPZbkL5NcDgQ4DPy7Nn4/cB0wCfwcuBWgqk4muRs41MbddfpmMvBl4CFgOd0bxxfEzWNJGhX9zCZ6Hvj0DPWrZxlfwO2zbNsN7J6h/izwqbl6kSQtDn8DWZJkGEiSDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkiT7CIMmvJnkmyd8kOZrkj1r9iiRPJ5lM8t0ky1r9Q+3xZNu+tudYX2v1l5Nc01Pf0mqTSXYuwuuUJJ1BP98M3geurqrfADYCW5JsAr4J3FdVnwBOAbe18bcBp1r9vjaOJOuBG4BPAluAby
dZkmQJ8C3gWmA9cGMbK0kakDnDoLreaw8vaUsBVwPfa/U9wPVtfWt7TNv+uSRp9b1V9X5V/RiYBK5sy2RVvVpVvwD2trGSpAHp655B+wR/GHgLOAj8D+DtqppqQ44Dq9r6KuB1gLb9HeDjvfVp+8xWlyQNyNJ+BlXVB8DGJJcC3wd+fTGbmk2S7cB2gE6nw8TExDDaOCs7NkzNPWjIOstHo89ReL+18N577z3f+wHoKwxOq6q3kzwJ/Evg0iRL26f/1cCJNuwEsAY4nmQp8I+Bn/bUT+vdZ7b69OffBewCGBsbq/Hx8bNpfyhu2fn4sFuY044NU9x75KxOhaE4dtP4sFvQEExMTDAK/18fdf3MJrq8fSMgyXLgt4CXgCeBz7dh24BH2/q+9pi2/S+rqlr9hjbb6ApgHfAMcAhY12YnLaN7k3nfArw2SVKf+vk4uBLY02b9/ArwSFU9luRFYG+SrwM/Ah5s4x8E/nOSSeAk3X/cqaqjSR4BXgSmgNvb5SeS3AEcAJYAu6vq6IK9QknSnOYMg6p6Hvj0DPVX6c4Eml7/X8C/meVY9wD3zFDfD+zvo19J0iLwN5AlSYaBJMkwkCRxllNLJV041o7AtGfoTn0ehSnax77x28NuYV78ZiBJMgwkSYaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiT6CIMka5I8meTFJEeT/H6r/2GSE0kOt+W6nn2+lmQyyctJrumpb2m1ySQ7e+pXJHm61b+bZNlCv1BJ0uz6+WYwBeyoqvXAJuD2JOvbtvuqamNb9gO0bTcAnwS2AN9OsiTJEuBbwLXAeuDGnuN8sx3rE8Ap4LYFen2SpD7MGQZV9UZV/XVb/3vgJWDVGXbZCuytqver6sfAJHBlWyar6tWq+gWwF9iaJMDVwPfa/nuA68/x9UiSzsFZ3TNIshb4NPB0K92R5Pkku5OsaLVVwOs9ux1vtdnqHwferqqpaXVJ0oAs7Xdgko8AfwZ8pareTfIAcDdQ7ee9wO8uSpf/v4ftwHaATqfDxMTEYj7dgtixYWruQUPWWT4afY7C+z1KRuE9B8/PQekrDJJcQjcI/rSq/hygqt7s2f4nwGPt4QlgTc/uq1uNWeo/BS5NsrR9O+gd/0uqahewC2BsbKzGx8f7aX+obtn5+LBbmNOODVPce6TvzwVDc+ym8WG3cEEZhXMTPD8HpZ/ZRAEeBF6qqj/uqa/sGfY7wAttfR9wQ5IPJbkCWAc8AxwC1rWZQ8vo3mTeV1UFPAl8vu2/DXh0fi9LknQ2+onb3wS+CBxJcrjV/oDubKCNdC8THQN+D6CqjiZ5BHiR7kyk26vqA4AkdwAHgCXA7qo62o73VWBvkq8DP6IbPpKkAZkzDKrqh0Bm2LT/DPvcA9wzQ33/TPtV1at0ZxtJkobA30CWJBkGkiTDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEmijzBIsibJk0leTHI0ye+3+seSHEzySvu5otWT5P4kk0meT/KZnmNta+NfSbKtp/7ZJEfaPvcnyWK8WEnSzPr5ZjAF7Kiq9cAm4PYk64GdwBNVtQ54oj0GuBZY15btwAPQDQ/gTuAq4ErgztMB0sZ8qWe/LfN/aZKkfs0ZBlX1RlX9dVv/e+AlYBWwFdjThu0Brm/rW4GHq+sp4NIkK4FrgINVdbKqTgEHgS1t20er6qmqKuDhnmNJkgbgrO4ZJFkLfBp4GuhU1Rtt00+ATltfBbzes9vxVjtT/fgMdUnSgCztd2CSjwB/Bnylqt7tvaxfVZWkFqG/6T1sp3vpiU6nw8TExGI/5bzt2DA17Bbm1Fk+Gn2Owvs9SkbhPQfPz0HpKwySXEI3CP60qv68ld9MsrKq3miXet5q9RPAmp7dV7faCWB8Wn2i1V
fPMP4fqKpdwC6AsbGxGh8fn2nYeeWWnY8Pu4U57dgwxb1H+v5cMDTHbhofdgsXlFE4N8Hzc1D6mU0U4EHgpar6455N+4DTM4K2AY/21G9us4o2Ae+0y0kHgM1JVrQbx5uBA23bu0k2tee6uedYkqQB6CdufxP4InAkyeFW+wPgG8AjSW4DXgO+0LbtB64DJoGfA7cCVNXJJHcDh9q4u6rqZFv/MvAQsBz4QVskSQMyZxhU1Q+B2eb9f26G8QXcPsuxdgO7Z6g/C3xqrl4kSYvD30CWJBkGkiTDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAk0UcYJNmd5K0kL/TU/jDJiSSH23Jdz7avJZlM8nKSa3rqW1ptMsnOnvoVSZ5u9e8mWbaQL1CSNLd+vhk8BGyZoX5fVW1sy36AJOuBG4BPtn2+nWRJkiXAt4BrgfXAjW0swDfbsT4BnAJum88LkiSdvTnDoKr+CjjZ5/G2Anur6v2q+jEwCVzZlsmqerWqfgHsBbYmCXA18L22/x7g+rN7CZKk+Vo6j33vSHIz8Cywo6pOAauAp3rGHG81gNen1a8CPg68XVVTM4z/B5JsB7YDdDodJiYm5tH+YOzYMDX3oCHrLB+NPkfh/R4lo/Ceg+fnoJxrGDwA3A1U+3kv8LsL1dRsqmoXsAtgbGysxsfHF/sp5+2WnY8Pu4U57dgwxb1H5vO5YDCO3TQ+7BYuKKNwboLn56Cc0//CVfXm6fUkfwI81h6eANb0DF3dasxS/ylwaZKl7dtB73hJ0oCc09TSJCt7Hv4OcHqm0T7ghiQfSnIFsA54BjgErGszh5bRvcm8r6oKeBL4fNt/G/DoufQkSTp3c34zSPIdYBy4LMlx4E5gPMlGupeJjgG/B1BVR5M8ArwITAG3V9UH7Th3AAeAJcDuqjranuKrwN4kXwd+BDy4UC9OktSfOcOgqm6coTzrP9hVdQ9wzwz1/cD+Geqv0p1tJEkaEn8DWZJkGEiSDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkiT7CIMnuJG8leaGn9rEkB5O80n6uaPUkuT/JZJLnk3ymZ59tbfwrSbb11D+b5Ejb5/4kWegXKUk6s36+GTwEbJlW2wk8UVXrgCfaY4BrgXVt2Q48AN3wAO4ErgKuBO48HSBtzJd69pv+XJKkRTZnGFTVXwEnp5W3Anva+h7g+p76w9X1FHBpkpXANcDBqjpZVaeAg8CWtu2jVfVUVRXwcM+xJEkDcq73DDpV9UZb/wnQaeurgNd7xh1vtTPVj89QlyQN0NL5HqCqKkktRDNzSbKd7uUnOp0OExMTg3jaedmxYWrYLcyps3w0+hyF93uUjMJ7Dp6fg3KuYfBmkpVV9Ua71PNWq58A1vSMW91qJ4DxafWJVl89w/gZVdUuYBfA2NhYjY+Pzzb0vHHLzseH3cKcdmyY4t4j8/5csOiO3TQ+7BYuKKNwboLn56Cc62WifcDpGUHbgEd76je3WUWbgHfa5aQDwOYkK9qN483Agbbt3SSb2iyim3uOJUkakDnjNsl36H6qvyzJcbqzgr4BPJLkNuA14Att+H7gOmAS+DlwK0BVnUxyN3Cojburqk7flP4y3RlLy4EftEWSNEBzhkFV3TjLps/NMLaA22c5zm5g9wz1Z4FPzdWHJGnx+BvIkiTDQJJkGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJzDMMkhxLciTJ4STPttrHkhxM8kr7uaLVk+T+JJNJnk/ymZ7jbGvjX0mybX4vSZJ0thbim8G/rqqNVTXWHu8EnqiqdcAT7THAtcC6tm
wHHoBueAB3AlcBVwJ3ng4QSdJgLMZloq3Anra+B7i+p/5wdT0FXJpkJXANcLCqTlbVKeAgsGUR+pIkzWK+YVDAf03yXJLtrdapqjfa+k+ATltfBbzes+/xVputLkkakKXz3P9fVdWJJP8EOJjkb3s3VlUlqXk+x//TAmc7QKfTYWJiYqEOvWh2bJgadgtz6iwfjT5H4f0eJaPwnoPn56DMKwyq6kT7+VaS79O95v9mkpVV9Ua7DPRWG34CWNOz++pWOwGMT6tPzPJ8u4BdAGNjYzU+Pj7TsPPKLTsfH3YLc9qxYYp7j8z3c8HiO3bT+LBbuKCMwrkJnp+Dcs6XiZJ8OMk/Or0ObAZeAPYBp2cEbQMebev7gJvbrKJNwDvtctIBYHOSFe3G8eZWkyQNyHzitgN8P8np4/yXqvqLJIeAR5LcBrwGfKGN3w9cB0wCPwduBaiqk0nuBg61cXdV1cl59CVJOkvnHAZV9SrwGzPUfwp8boZ6AbfPcqzdwO5z7UWSND/+BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSSJ8ygMkmxJ8nKSySQ7h92PJF1MzoswSLIE+BZwLbAeuDHJ+uF2JUkXj/MiDIArgcmqerWqfgHsBbYOuSdJumikqobdA0k+D2ypqn/bHn8RuKqq7pg2bjuwvT38F8DLA230wnUZ8HfDbkKahefnwvpnVXX59OLSYXRyrqpqF7Br2H1caJI8W1Vjw+5Dmonn52CcL5eJTgBreh6vbjVJ0gCcL2FwCFiX5Ioky4AbgH1D7kmSLhrnxWWiqppKcgdwAFgC7K6qo0Nu62LipTedzzw/B+C8uIEsSRqu8+UykSRpiAwDSZJhIEk6T24gSxJAkl+n+9cHVrXSCWBfVb00vK4uDn4z0C9Jcuuwe9DFKclX6f4pmgDPtCXAd/zjlYvP2UT6JUn+Z1X92rD70MUnyX8HPllV/3tafRlwtKrWDaezi4OXiS5CSZ6fbRPQGWQvUo//A/xT4LVp9ZVtmxaRYXBx6gDXAKem1QP8t8G3IwHwFeCJJK8Ar7farwGfAO6YbSctDMPg4vQY8JGqOjx9Q5KJgXcjAVX1F0n+Od0/ad97A/lQVX0wvM4uDt4zkCQ5m0iSZBhIkjAMJEkYBpIkDANJEvB/Adp0wfuC1VkMAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.mens.value_counts().plot(kind = 'bar', grid=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "82ca1614", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 35182\n", + "0 28818\n", + "Name: womens, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.womens.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "c66258e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "womens\n", + "0 45.028125\n", + "1 54.971875\n", + "dtype: float64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.groupby('womens').size()/dataset.data['womens'].count()*100" + ] + }, + { + "cell_type": "markdown", + "id": "5aed1905", + "metadata": {}, + "source": [ + "- `55%` - womens purchases\n", + "- `44%` - mens purchases" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e829d37b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 35266\n", + "0 28734\n", + "Name: mens, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.mens.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "eaa81820", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mens\n", + "0 44.896875\n", + "1 55.103125\n", + "dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.groupby('mens').size()/dataset.data['mens'].count()*100" + ] + }, + { + "cell_type": "markdown", + "id": "2349e910", + "metadata": {}, + "source": [ + "- `55%` - mens purchases\n", + "- `44%` - womens 
purchases" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "bb32fa92", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize = (14,8))\n", + "\n", + "sns.set()\n", + "sns.heatmap(dataset.data.corr().round(3), annot=True, square = True, linewidths=.75, cmap='RdPu', fmt = '.2f',annot_kws = {\"size\": 10} )\n", + "\n", + "plt.title('Correlation matrix')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "aa0b2435", + "metadata": {}, + "source": [ + "- Womens and mens are in inverse correlation. I propose to make 1 column \"gender\" and merge." + ] + }, + { + "cell_type": "markdown", + "id": "9ca9101c", + "metadata": {}, + "source": [ + "- As we can see, there is high correlation between \"history_segment\" and \"history\". Could we merge it also and transform columns to numeric data?" + ] + }, + { + "cell_type": "markdown", + "id": "83160e2c", + "metadata": {}, + "source": [ + "# 📄 Numeric data" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "6ce14108", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAe4AAAD7CAYAAACse3fKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAV2ElEQVR4nO3dcUzU9/3H8dfBHeA8Gge9Q0KNyzoXYnSYjdixNhBrKiheTA9NFBa2zMbVOeuIIzLEMreyakulaTrcltj+QVw6tApKrudMaF0pZip/zFk1aRYlnRg41A5BoNxxvz+aHw2icljgex/6fPwlX77wfX8/Ak/ujvueLRwOhwUAAIwQY/UAAAAgcoQbAACDEG4AAAxCuAEAMAjhBgDAIIQbAACDEG4AAAxit3qASN261afh4Zn9lPPkZKdu3Oi1eoyoxhpFhnWKDOsUGdZpfJO9RjExNn3zm7Pv+T5jwj08HJ7x4Zb0tTjHr4o1igzrFBnWKTKs0/ima424qxwAAIMQbgAADEK4AQAwCOEGAMAghBsAAIMQbgAADEK4AQAwiDHP455siY/MUkK8tac/MBjU7Z5+S2cAAJjlaxvuhHi7PNsbLZ3h+GtrdNvSCQAApuGucgAADEK4AQAwCOEGAMAghBsAAIMQbgAADEK4AQAwCOEGAMAghBsAAIMQbgAADEK4AQAwCOEGAMAghBsAAIMQbgAADEK4AQAwCOEGAMAghBsAAIMQbgAADBJRuBsbG5Wfn6/8/Hzt3btXknTp0iUVFBQoNzdXO3fuVDAYlCR1dHSoqKhIeXl52rx5s/r6+iRJPT092rRpk1auXKmioiIFAoEpOiUAAGauccPd39+vqqoq1dXVqbGxUefOnVNra6tKS0u1a9cunThxQuFwWPX19ZKk3bt3q7CwUH6/X4sWLVJtba0k6fXXX1dmZqbee+89rVu3TlVVVVN7ZgAAzEDjhjsUCml4eFj9/f0KBoMKBoOy2+0aGBjQkiVLJEler1d+v19DQ0M6e/ascnNzR22XpA8++EAej0eStHr1av3jH//Q0NDQFJ0WAAAzk328HZxOp7Zt26aVK1cqISFBS5culcPhkMvlGtnH5XKps7NTt27dktPplN1uH7Vdkrq6ukY+xm63y+l06ubNm0pJSYlo0ORk54RPzgQuV+ID38ZYrFFkWKfIsE6RYZ3GN11rNG64L1++rHfffVfvv/++EhMT9etf/1offfTRmP1sNpvC4fA9t99PTEzkfxt340avhofHfv6HFS1fhIHA7ZF/u1yJo97GWKxRZFinyLBOkWGdxjfZaxQTY7vvDdZxy9nS0qKsrCwlJycrLi5OXq9X//znP9Xd3T2yTyAQkNvtVlJSknp7exUKhUZtlyS32z3yMcFgUL29vZozZ85XPTcAAL5Wxg13enq6WltbdefOHYXDYTU3N2vp0qWKj49XW1ubJKmhoUHZ2dlyOBzKzMyUz+cbtV2ScnJy1NDQIEny+XzKzMyUw+GYotMCAGBmGveu8qeeekoXL16U1+uVw+HQ4sWLtWnTJj3zzDOqqKhQX1+fFi5cqOLiYklSZWWlysrKtH//fqWmpmrfvn2SpG3btqmsrEz5+flKTExUdXX11J4ZAAAzkC18rwemo9BUPMbt2d44aZ/vYRx/bQ2PcU8QaxQZ1ikyrFNkWKfxRdVj3AAAIHoQbgAADEK4AQAwCOEGAMAg4/5VOQAguiQ+MksJ8dP74/teF60aGAzqdk//tM4Bwm2pz4dCUXHJU775ALMkxNstf1aM9MUzY/hb8+lHuC0U54jlmw8AMCE8xg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYxG71AABgisRHZikhnh+bsBZfgQAQoYR
4uzzbG60eQ8dfW2P1CLAQd5UDAGAQwg0AgEEINwAABiHcAAAYhHADAGCQiMLd3Nwsr9ervLw8vfTSS5Kk1tZWeTwerVixQjU1NSP7Xrp0SQUFBcrNzdXOnTsVDAYlSR0dHSoqKlJeXp42b96svr6+KTgdAABmtnHD/emnn6qyslK1tbU6fvy4Ll68qFOnTqm8vFy1tbXy+Xy6cOGCTp06JUkqLS3Vrl27dOLECYXDYdXX10uSdu/ercLCQvn9fi1atEi1tbVTe2YAAMxA44b75MmTWrVqlebOnSuHw6GamhrNmjVL8+fP17x582S32+XxeOT3+3Xt2jUNDAxoyZIlkiSv1yu/36+hoSGdPXtWubm5o7YDAICJGfcCLO3t7XI4HNq4caMCgYCWLVumBQsWyOVyjezjdrvV2dmprq6uUdtdLpc6Ozt169YtOZ1O2e32UdsBAMDEjBvuUCikc+fOqa6uTt/4xjf0i1/8QrNmzRqzn81mUzgcntD2iUhOdk5of0yMy5Vo9QgRM2lWK7FOkWGdvhrW70vTtRbjhvvRRx9VVlaWkpKSJEnLly+X3+9XbGzsyD5dXV1yu91KSUlRd3f3yPZAICC3262kpCT19vYqFAopNjZ2ZPtE3LjRq+Hhsb8APCy+2EYLBG5bPUJEXK5EY2a1EusUmYmuEz83xuLr7AuT/T0XE2O77w3WccO9bNky7dixQz09PZo9e7Y+/PBD5eXl6S9/+Yva29v12GOPqampSQUFBUpLS1N8fLza2tr0gx/8QA0NDcrOzpbD4VBmZqZ8Pp88Hs/IdkSHz4dCUfEDaWAwqNs9/VaPAQBRbdxwZ2Rk6LnnnlNhYaGGhob05JNPasOGDfr2t7+trVu3anBwUDk5OcrLy5MkVVdXq6KiQn19fVq4cKGKi4slSZWVlSorK9P+/fuVmpqqffv2Te2ZIWJxjtioeeEEfncHgAeL6NXB1q5dq7Vr147alpWVpWPHjo3ZNz09XYcPHx6zPS0tTXV1dQ85JgAAkLhyGgAARiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEEINwAABiHcAAAYhHADAGAQwg0AgEHsVg8A/L/Ph0JyuRLH3S+Sfb6KgcGgbvf0T+kxAOBhEW5EjThHrDzbG60eQ8dfW6PbVg8BAPfBXeUAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEK6cBUSrxkVlKiH/4b9HJujQsl4AFogvhBqJUQrydS8ACGINwA3eJ9MVOAMAKhBu4SzS92AkA3I0/TgMAwCCEGwAAgxBuAAAMQrgBADAI4QYAwCCEGwAAg0Qc7r1796qsrEySdOnSJRUUFCg3N1c7d+5UMBiUJHV0dKioqEh5eXnavHmz+vr6JEk9PT3atGmTVq5cqaKiIgUCgSk4FQAAZr6Iwn369GkdPXp05O3S0lLt2rVLJ06cUDgcVn19vSRp9+7dKiwslN/v16JFi1RbWytJev3115WZman33ntP69atU1VV1RScCgAAM9+4F2D57LPPVFNTo+eff16XL1/WtWvXNDAwoCVLlkiSvF6v3njjDa1bt05nz57VH//4x5HtP/7xj1VaWqoPPvhABw8elCStXr1av/vd7zQ0NCSHwzF1ZwZgUkTDleS4XjrwpXHD/eKLL6qkpETXr1+XJHV1dcnlco283+VyqbOzU7du3ZLT6ZTdbh+1/e6PsdvtcjqdunnzplJSUib9hABMrmi4khzXSwe+9MBwHzp0SKmpqcrKytKRI0ckSeFweMx+NpvtvtvvJyZmYn8Xl5zsnND+AGaWqbrVb/W9CaZj/b40XWvxwHD7fD4FAgGtWbNG//vf/3Tnzh3ZbDZ1d3eP7BMIBOR2u5WUlKTe3l6FQiHFxsaObJckt9ut7u5uzZ07V8FgUL29vZozZ86EBr1xo1fDw2N
/OXhYfLEBZgkEJv82t8uVOKHPy8+Nsabi/8VEE/1aGk9MjO2+N1gfeLP37bffVlNTkxobG/XCCy/o6aef1ssvv6z4+Hi1tbVJkhoaGpSdnS2Hw6HMzEz5fL5R2yUpJydHDQ0Nkr74ZSAzM5PHtwEAeAgP9epg1dXVqqioUF9fnxYuXKji4mJJUmVlpcrKyrR//36lpqZq3759kqRt27aprKxM+fn5SkxMVHV19eSdAQAAXyMRh9vr9crr9UqS0tPTdfjw4TH7pKWlqa6ubsz2OXPm6E9/+tNXGBMAAEhcOQ0AAKMQbgAADEK4AQAwCOEGAMAghBsAAIMQbgAADPJQz+MGgOk0lS90wtXQHl40vACN9PV7ERrCDSDqRcMLnUhfvNgJvhRN/y9fpwuvclc5AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGsVs9AAAAX8XnQyG5XImWzzBdCDcAwGhxjlh5tjdaOsPx19ZM27G4qxwAAIMQbgAADBJRuN98803l5+crPz9fr7zyiiSptbVVHo9HK1asUE1Nzci+ly5dUkFBgXJzc7Vz504Fg0FJUkdHh4qKipSXl6fNmzerr69vCk4HAICZbdxwt7a2qqWlRUePHlVDQ4M+/vhjNTU1qby8XLW1tfL5fLpw4YJOnTolSSotLdWuXbt04sQJhcNh1dfXS5J2796twsJC+f1+LVq0SLW1tVN7ZgAAzEDjhtvlcqmsrExxcXFyOBx6/PHHdfXqVc2fP1/z5s2T3W6Xx+OR3+/XtWvXNDAwoCVLlkiSvF6v/H6/hoaGdPbsWeXm5o7aDgAAJmbccC9YsGAkxFevXpXP55PNZpPL5RrZx+12q7OzU11dXaO2u1wudXZ26tatW3I6nbLb7aO2AwCAiYn46WCffPKJfv7zn2vHjh2y2+26cuXKqPfbbDaFw+ExH/eg7RORnOyc0P4AAEyn6XoueUThbmtr0wsvvKDy8nLl5+frzJkz6u7uHnl/V1eX3G63UlJSRm0PBAJyu91KSkpSb2+vQqGQYmNjR7ZPxI0bvRoeHvsLwMOy+sn6AICZJRC4PWmfKybGdt8brOPeVX79+nVt2bJF1dXVys/PlyRlZGToypUram9vVygUUlNTk7Kzs5WWlqb4+Hi1tbVJkhoaGpSdnS2Hw6HMzEz5fL5R2wEAwMSMe4v7wIEDGhwc1J49e0a2rV+/Xnv27NHWrVs1ODionJwc5eXlSZKqq6tVUVGhvr4+LVy4UMXFxZKkyspKlZWVaf/+/UpNTdW+ffum6JQAAJi5xg13RUWFKioq7vm+Y8eOjdmWnp6uw4cPj9melpamurq6hxgRAAD8P66cBgCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGmdZwHz9+XKtWrdIzzzyjgwcPTuehAQCYEezTdaDOzk7V1NToyJEjiouL0/r16/XEE0/oO9/5znSNAACA8aYt3K2trfrhD3+oOXPmSJJyc3Pl9/v1y1/+MqKPj4mxTfpM7m/OmvTPaeIMEnPcjTlGi4Y5omEGiTnuxhxfmsxOPehz2cLhcHjSjvQAf/7zn3Xnzh2VlJRIkg4dOqTz58/r97///XQcHgCAGWHaHuO+1+8HNtvk34oGAGAmm7Z
wp6SkqLu7e+Ttrq4uud3u6To8AAAzwrSF+0c/+pFOnz6tmzdvqr+/X3//+9+VnZ09XYcHAGBGmLY/TktJSVFJSYmKi4s1NDSktWvX6nvf+950HR4AgBlh2v44DQAAfHVcOQ0AAIMQbgAADEK4AQAwCOEGAMAghDsKvPnmm8rPz1d+fr5eeeUVq8eJanv37lVZWZnVY0St5uZmeb1e5eXl6aWXXrJ6nKjV2Ng48j23d+9eq8eJOr29vVq9erX++9//SvriktUej0crVqxQTU2NxdNFh7vX6G9/+5tWr14tj8ej3/zmN/r888+n7NiE22Ktra1qaWnR0aNH1dDQoI8//lgnT560eqyodPr0aR09etTqMaLWp59+qsrKStXW1ur48eO6ePGiTp06ZfVYUae/v19VVVWqq6tTY2Ojzp07p9bWVqvHihr/+te/tGHDBl29elWSNDAwoPLyctXW1srn8+nChQtf+6+ru9foypUrOnDggN555x0dO3ZMw8PD+utf/zplxyfcFnO5XCorK1NcXJwcDocef/xxdXR0WD1W1Pnss89UU1Oj559/3upRotbJkye1atUqzZ07Vw6HQzU1NcrIyLB6rKgTCoU0PDys/v5+BYNBBYNBxcfHWz1W1Kivr1dlZeXIlS3Pnz+v+fPna968ebLb7fJ4PPL7/RZPaa271yguLk6//e1v5XQ6ZbPZ9N3vfndKf45P2wVYcG8LFiwY+ffVq1fl8/n0zjvvWDhRdHrxxRdVUlKi69evWz1K1Gpvb5fD4dDGjRsVCAS0bNky/epXv7J6rKjjdDq1bds2rVy5UgkJCVq6dKm+//3vWz1W1Kiqqhr1dldXl1wu18jbbrdbnZ2d0z1WVLl7jdLS0pSWliZJunnzpg4ePKiXX355yo7PLe4o8cknn+hnP/uZduzYoW9961tWjxNVDh06pNTUVGVlZVk9SlQLhUI6ffq0Xn31VdXX1+vf//43Dy3cw+XLl/Xuu+/q/fffV0tLi2JiYnTgwAGrx4pavEBU5Do7O/WTn/xEBQUFeuKJJ6bsOIQ7CrS1temnP/2ptm/frmeffdbqcaKOz+fTRx99pDVr1uiNN95Qc3Oz/vCHP1g9VtR59NFHlZWVpaSkJCUkJGj58uU6f/681WNFnZaWFmVlZSk5OVlxcXHyer06c+aM1WNFLV4gKjL/+c9/tGHDBj377LPasmXLlB6Lu8otdv36dW3ZskU1NTXcoryPt99+e+TfR44c0ZkzZ1ReXm7hRNFp2bJl2rFjh3p6ejR79mx9+OGHWr58udVjRZ309HS9+uqrunPnjmbNmqXm5mYtXrzY6rGiVkZGhq5cuaL29nY99thjampqUkFBgdVjRZXe3l5t3LhRJSUlWrNmzZQfj3Bb7MCBAxocHNSePXtGtq1fv14bNmywcCqYKCMjQ88995wKCws1NDSkJ598kh+w9/DUU0/p4sWL8nq9cjgcWrx4sTZt2mT1WFErPj5ee/bs0datWzU4OKicnBzl5eVZPVZUOXz4sLq7u/XWW2/prbfekiQ9/fTT2rZt25QcjxcZAQDAIDzGDQCAQQg3AAAGIdwAABiEcAMAYBDCDQCAQQg3AAAGIdwAABiEcAMAYJD/A1hla7NXMQ+ZAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.loc[:, 'recency'].hist(figsize=(8, 4), bins=12, grid=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "cb95b5c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "29.99 7947\n", + "81.20 9\n", + "53.79 9\n", + "60.51 8\n", + "88.09 8\n", + " ... \n", + "200.53 1\n", + "234.76 1\n", + "239.91 1\n", + "99.85 1\n", + "738.50 1\n", + "Name: history, Length: 34833, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.history.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7739b8d2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.loc[:, 'history'].hist(figsize=(8, 4), bins=20, grid=True);" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "0009afc4", + "metadata": {}, + "outputs": [], + "source": [ + "dataset.data['history_log'] = np.log(dataset.data['history'] + 1.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e11fa9ca", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset.data.loc[:, 'history_log'].hist(figsize=(8, 4), bins=20, grid=True);" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "f99160f9", + "metadata": {}, + "outputs": [], + "source": [ + "numeric_features = ['history_log', 'recency']" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "8ec394d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['recency',\n", + " 'history_segment',\n", + " 'history',\n", + " 'mens',\n", + " 'womens',\n", + " 'zip_code',\n", + " 'newbie',\n", + " 'channel',\n", + " 'history_log']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.data.columns.to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6951a5ee", + "metadata": {}, + "outputs": [], + "source": [ + "final_features = ['recency', 'history_segment', 'mens', 'womens', 'zip_code', 'newbie', 'channel', 'history_log']" + ] + }, + { + "cell_type": "markdown", + "id": "a6d14da2", + "metadata": {}, + "source": [ + "# 🎯 Target data" + ] + }, + { + "cell_type": "markdown", + "id": "a17b48db", + "metadata": {}, + "source": [ + "Let's segment dataset data" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "2a6cc7c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Womens E-Mail\n", + "1 No E-Mail\n", + "2 Womens E-Mail\n", + "3 Mens E-Mail\n", + "4 Womens E-Mail\n", + "Name: segment, dtype: object" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.treatment.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "ef7aa5d4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Womens E-Mail', 'No E-Mail', 'Mens E-Mail'], dtype=object)" + ] + }, + "execution_count": 31, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "dataset.treatment.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "229b6409", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset.treatment.value_counts().plot(kind = 'bar', grid = 'True')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "78f3f7e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + "Name: visit, dtype: int64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.target.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "777b3584", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD3CAYAAADyvkg2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUw0lEQVR4nO3df2zU9eH48efVqx3afoaQu4rI2LLtH4aK8bbJkrXZktHWcqIgiWtj4z8iZFPHTLOOHxK2EZyrwJat/PXRZU631ahlkOMwMQGyFDckmYQFk02BTarttdSV62i5lvv+YXx9Px1Ci7Yepc9HYuK97n3n68W97fPu9W5LJJ/P55EkCSgq9AQkSZcPoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKYgWegIfV29vP+fO+aMW42HmzFJ6erKFnob0oTw/x0dRUYTrrrv2gvdP+iicO5c3CuPIP0tdzjw/J57bR5KkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQom/c8pTAZl/zONT5VMjj/qWKys0FMY1cDgEKf7zhR6GtIVaXJ8pZrkPlUSJfnojkJP44qx88klnC70JKQrlNtHkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKRjTL8RraGigp6eHaPT9w3/0ox/xz3/+k+3bt5PL5bj//vupr68HoL29nc2bNzM4OEhNTQ2rV68G4OjRo6xbt45sNksikWDjxo1Eo1E6OjpobGykp6eHz33uczQ3N3PttddO0HIlSRcz6ieFfD7PW2+9xY4dO8I/119/PVu3buW5555jx44d/OEPf+Af//gHAwMDrFmzhpaWFlKpFEeOHGHfvn0ANDY2sn79evbs2UM+n6e1tRWAjRs3UldXRzqdZv78+bS0tEzsiiVJFzRqFN566y0ikQgPPPAAd955J7/97W9pb2/n9ttvZ/r06VxzzTVUVVWRTqc5fPgwc+fOZc6cOUSjUZLJJOl0mpMnTzIwMMCCBQsAWLp0Kel0mlwux8GDB6mqqhoxLkkqjFGj0NfXx8KFC/nVr37Fr3/9a37/+9/T0dFBLBYLx8TjcTo7O+nq6hrTeCwWo7Ozk97eXkpLS8O21AfjkqTCGPWawq233sqtt94KwDXXXMM999zD5s2bWbly5YjjIpEI+Xz+vMd/lPFLM
XNm6SUdryvDZPgb4jT+fN0n3qhReO2118jlcixcuBB4/xrD7Nmz6e7uDsd0dXURj8cpLy8f03gmkyEejzNjxgyy2SzDw8NcddVVYfxS9PRkOXfu/LhcTjyRx18m49+9NtXEYmW+7uOgqChy0TfTo24fnT59mieeeILBwUGy2SwvvfQSP/vZzzhw4ACnTp3izJkzvPzyy1RUVHDLLbdw7NgxTpw4wfDwMLt27aKiooLZs2dTUlLCoUOHAGhra6OiooLi4mISiQSpVGrEuCSpMEb9pPCNb3yD119/nbvuuotz585RV1fHbbfdxurVq2loaCCXy3HPPfdw8803A/D444/z0EMPMTg4SGVlJdXV1QA0Nzezbt06+vv7mTdvHg0NDQBs2LCBpqYmtm/fzqxZs9iyZcsELleSdDGR/Idt7E8ik2X7KPnojkJP44qx88klbiNMQW4fjY+PvX0kSZo6jIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQrGHIWf/vSnNDU1AXD06FGWLVtGVVUVa9euZWhoCICOjg7q6+uprq5m1apV9Pf3A9DX18eKFSuoqamhvr6eTCYDwNmzZ2lsbKSmpoa7776bN998c7zXJ0m6BGOKwoEDB3jppZfC7cbGRtavX8+ePXvI5/O0trYCsHHjRurq6kin08yfP5+WlhYAtm3bRiKRYPfu3SxfvpxNmzYB8MwzzzBt2jR2797NmjVrQnQkSYUxahTee+89tm7dysqVKwE4efIkAwMDLFiwAIClS5eSTqfJ5XIcPHiQqqqqEeMAe/fuJZlMArB48WL2799PLpdj79693HnnnQB8+ctfpre3l46OjnFfpCRpbKKjHfDYY4+xevVq3nnnHQC6urqIxWLh/lgsRmdnJ729vZSWlhKNRkeM//djotEopaWlnDp16kOf69133+WGG24Y8wJmziwd87G6csRiZYWeggrA133iXTQKzz//PLNmzWLhwoW8+OKLAOTz+fOOi0QiFxy/kKKiD/+QcqHxC+npyXLu3Pn/7cuJJ/L4y2ROF3oK+oTFYmW+7uOgqChy0TfTF41CKpUik8mwZMkS/v3vf/Of//yHSCRCd3d3OCaTyRCPx5kxYwbZbJbh4WGuuuqqMA4Qj8fp7u7m+uuvZ2hoiGw2y/Tp04nH42QyGebOnTviuSRJhXHRt+VPP/00u3btYseOHTz88MN885vfZPPmzZSUlHDo0CEA2traqKiooLi4mEQiQSqVGjEOUFlZSVtbG/B+aBKJBMXFxVRWVrJjxw4AXnvtNUpKSi5p60iSNL4+0s8pNDc3s3nzZmpqajhz5gwNDQ0AbNiwgdbWVu644w5ee+01vve97wHwyCOP8Ne//pXa2lqee+45HnvsMQDuu+8+zp49S21tLZs2beKJJ54Yn1VJkj6SSP7DLgZMIpPlmkLy0R2FnsYVY+eTS9xbnoK8pjA+Rrum4E80S5ICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkoIxReHnP/85d9xxB7W1tTz99NMAtLe3k0wmWbRoEVu3bg3HHj16lGXLllFVVcXatWsZGhoCoKOjg/r6eqqrq1m1ahX9/f0A9PX1sWLFCmpqaqivryeTyYz3GiVJYzRqFP7yl7/w6quv8sc//pEXXniBZ555hjfeeIM1a9bQ0tJCKpXiyJEj7Nu3D4DGxkbWr1/Pnj17yOfztLa2ArBx40bq6upIp9PMnz+flpYWALZt20YikWD37t0sX
76cTZs2TeByJUkXM2oUvvKVr/Cb3/yGaDRKT08Pw8PD9PX1MXfuXObMmUM0GiWZTJJOpzl58iQDAwMsWLAAgKVLl5JOp8nlchw8eJCqqqoR4wB79+4lmUwCsHjxYvbv308ul5ug5UqSLiY6loOKi4v5xS9+wVNPPUV1dTVdXV3EYrFwfzwep7Oz87zxWCxGZ2cnvb29lJaWEo1GR4wDIx4TjUYpLS3l1KlTlJeXj2kBM2eWjm2luqLEYmWFnoIKwNd94o0pCgAPP/wwDzzwACtXruT48ePn3R+JRMjn85c0fiFFRWO//t3Tk+XcufOf/3LiiTz+MpnThZ6CPmGxWJmv+zgoKopc9M30qF9933zzTY4ePQrAtGnTWLRoEX/+85/p7u4Ox3R1dRGPxykvLx8xnslkiMfjzJgxg2w2y/Dw8IhxeP9TxgePGRoaIpvNMn369EtfqSTpYxs1Cm+//Tbr1q3j7NmznD17lldeeYV7772XY8eOceLECYaHh9m1axcVFRXMnj2bkpISDh06BEBbWxsVFRUUFxeTSCRIpVIjxgEqKytpa2sDIJVKkUgkKC4unqDlSpIuZtTto8rKSl5//XXuuusurrrqKhYtWkRtbS0zZszgoYceYnBwkMrKSqqrqwFobm5m3bp19Pf3M2/ePBoaGgDYsGEDTU1NbN++nVmzZrFlyxYAHnnkEZqamqitraWsrIzm5uYJXK4k6WIi+Q/b8J9EJss1heSjOwo9jSvGzieXuLc8BXlNYXx87GsKkqSpwyhIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqRgTFH45S9/SW1tLbW1tTzxxBMAtLe3k0wmWbRoEVu3bg3HHj16lGXLllFVVcXatWsZGhoCoKOjg/r6eqqrq1m1ahX9/f0A9PX1sWLFCmpqaqivryeTyYz3GiVJYzRqFNrb2/nTn/7ESy+9RFtbG3/729/YtWsXa9asoaWlhVQqxZEjR9i3bx8AjY2NrF+/nj179pDP52ltbQVg48aN1NXVkU6nmT9/Pi0tLQBs27aNRCLB7t27Wb58OZs2bZrA5UqSLmbUKMRiMZqamrj66qspLi7m85//PMePH2fu3LnMmTOHaDRKMpkknU5z8uRJBgYGWLBgAQBLly4lnU6Ty+U4ePAgVVVVI8YB9u7dSzKZBGDx4sXs37+fXC43QcuVJF1MdLQDvvjFL4Z/P378OKlUivvuu49YLBbG4/E4nZ2ddHV1jRiPxWJ0dnbS29tLaWkp0Wh0xDgw4jHRaJTS0lJOnTpFeXn5mBYwc2bpmI7TlSUWKyv0FFQAvu4Tb9QofODvf/87Dz74ID/4wQ+IRqMcO3ZsxP2RSIR8Pn/e4y42fiFFRWO//t3Tk+XcufOf/3LiiTz+MpnThZ6CPmGxWJmv+zgoKopc9M30mL76Hjp0iPvvv59HH32Uu+++m/Lycrq7u8P9XV1dxOPx88YzmQzxeJwZM2aQzWYZHh4eMQ7vf8r44DFDQ0Nks1mmT59+yQuVJH18o0bhnXfe4Tvf+Q7Nzc3U1tYCcMstt3Ds2DFOnDjB8PAwu3btoqKigtmzZ1NSUsKhQ4cAaGtro6KiguLiYhKJBKlUasQ4QGVlJW1tbQCkUikSiQTFxcUTsVZJ0igi+Q/b2/k/fvKTn/DCCy/wmc98Jozde++9fPazn2Xz5s0MDg5SWVnJD3/4QyKRCG+88Qbr1q2jv7+fefPmsXnzZq6++mpOnjxJU1MTPT09zJo1iy1btvDpT3+a9957j6amJv71r39RVlZGc3MzN95445gXMFm2j5KP7ij0NK4YO59c4jbCFOT20fgYbfto1Chc7ozC1GMUpiajMD7G5ZqCJGlqMAqSpMAoSJICoyBJC
oyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkwCpKkwChIkgKjIEkKjIIkKTAKkqTAKEiSAqMgSQqMgiQpMAqSpMAoSJICoyBJCoyCJCkYcxSy2SyLFy/m7bffBqC9vZ1kMsmiRYvYunVrOO7o0aMsW7aMqqoq1q5dy9DQEAAdHR3U19dTXV3NqlWr6O/vB6Cvr48VK1ZQU1NDfX09mUxmPNcnSboEY4rC66+/zre//W2OHz8OwMDAAGvWrKGlpYVUKsWRI0fYt28fAI2Njaxfv549e/aQz+dpbW0FYOPGjdTV1ZFOp5k/fz4tLS0AbNu2jUQiwe7du1m+fDmbNm2agGVKksZiTFFobW1lw4YNxONxAA4fPszcuXOZM2cO0WiUZDJJOp3m5MmTDAwMsGDBAgCWLl1KOp0ml8tx8OBBqqqqRowD7N27l2QyCcDixYvZv38/uVxuvNcpSRqD6FgO+u93711dXcRisXA7Ho/T2dl53ngsFqOzs5Pe3l5KS0uJRqMjxv/7uaLRKKWlpZw6dYry8vIxLWDmzNIxHacrSyxWVugpqAB83SfemKLw3/L5/HljkUjkkscvpKho7Ne/e3qynDt3/vNfTjyRx18mc7rQU9AnLBYr83UfB0VFkYu+mf5I331UXl5Od3d3uN3V1UU8Hj9vPJPJEI/HmTFjBtlsluHh4RHj8P6njA8eMzQ0RDabZfr06R9lWpKkj+kjfVK45ZZbOHbsGCdOnODGG29k165dLFu2jNmzZ1NSUsKhQ4e47bbbaGtro6KiguLiYhKJBKlUimQyGcYBKisraWtrY+XKlaRSKRKJBMXFxeO6SEkXVvY/0/hUyUf6UvCJu9w/dQ8MDnG670yhp/GxfKQzoaSkhMcff5yHHnqIwcFBKisrqa6uBqC5uZl169bR39/PvHnzaGhoAGDDhg00NTWxfft2Zs2axZYtWwB45JFHaGpqora2lrKyMpqbm8dpaZLG4lMlUZKP7ij0NK4IO59cwmTf4IrkP2zDfxKZLNcU/J9u/Ox8col7y+PI83P8TIZzc0KuKUiSrkxGQZIUGAVJUmAUJEmBUZAkBUZBkhQYBUlSYBQkSYFRkCQFRkGSFBgFSVJgFCRJgVGQJAVGQZIUGAVJUmAUJEmBUZAkBUZBkhQYBUlSYBQkSYFRkCQFRkGSFBgFSVJgFCRJgVGQJAVGQZIUGAVJUmAUJEmBUZAkBZdFFHbu3Mkdd9zBt771LZ599tlCT0eSpqxooSfQ2dnJ1q1befHFF7n66qu59957+epXv8oXvvCFQk9Nkqacgkehvb2d22+/nenTpwNQVVVFOp3mu9/97pgeX1QUmcDZjZ/4ddMKPYUrymR53ScLz8/xc7mfm6PNr+BR6OrqIhaLhdvxeJzDhw+P+fHXXXftRExr3P3vukWFnsIVZebM0kJP4Yri+Tl+Jvu5WfBrCvl8/ryxSOTyLq0kXakKHoXy8nK6u7vD7a6uLuLxeAFnJElTV8Gj8LWvfY0DBw5w6tQpzpw5w8svv0xFRUWhpyVJU1LBrymUl5ezevVqGhoayOVy3HPPPdx8882FnpYkTUmR/Idt6kuSpqSCbx9Jki4fRkGSFBgFSVJgFCRJgVGQJAUF/5ZUFc6bb77Jnj17ePfddykqKiIej/P1r3+dm266qdBTk1QgflKYop599lm+//3vA3DTTTfxpS99CYD169fz1FNPFXJqkgrIn1OYoqqqqmhra2PatJG/HfPMmTPcfffdpNPpAs1Mgo6Ojovef8MNN3xCM5l63D6aoqLRKENDQ+eNDwwMUFxcXIAZSf/fgw8+yPHjx4nH4+f90sxIJMIrr7xSoJld+YzCFLVy5UruuusuFi5cGH51eSaT4dVXX2X16tUFnp2mut/97nfU1dWxYcMGbrvttkJPZ0px+2gK6+zs5MCBA3R1dZHP5ykvL2fhwoWUl5cXemoShw8f5
vnnn+fHP/5xoacypRgFSVLgdx9JkgKjIEkKjIIkKTAKkqTAKEiSgv8Hi1TH8eTFG+8AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset.target.value_counts().plot(kind = 'bar')" + ] + }, + { + "cell_type": "markdown", + "id": "f331cdf9", + "metadata": {}, + "source": [ + "Target is disbalanced" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "dc53d3d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
visit01
segment
Mens E-Mail0.8172430.182757
No E-Mail0.8938330.106167
Womens E-Mail0.8486000.151400
\n", + "
" + ], + "text/plain": [ + "visit 0 1\n", + "segment \n", + "Mens E-Mail 0.817243 0.182757\n", + "No E-Mail 0.893833 0.106167\n", + "Womens E-Mail 0.848600 0.151400" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd \n", + "\n", + "pd.crosstab(dataset.treatment, dataset.target, normalize='index')" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "e1422eb7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1], dtype=int64)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.target.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "bcc5192b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Treatment & Target')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "crosstab = pd.crosstab(dataset.treatment, dataset.target, normalize='index')\n", + " \n", + "sns.heatmap(crosstab, annot=True, fmt=\".2f\", linewidths=1, square = True, cmap = 'RdPu')\n", + "plt.xlabel('Target')\n", + "plt.title(\"Treatment & Target\")" + ] + }, + { + "cell_type": "markdown", + "id": "6a1f69fc", + "metadata": {}, + "source": [ + "There are two possible treatment-vs-control comparisons:\n", + "- `Womens E-mail - No E-mail`\n", + "- `Mens E-mail - No E-mail`\n", + "\n", + "In this tutorial we work through the first comparison; the second is left for you to explore on your own.\n", + "\n", + "Note: the mapping in the next section assigns both `No E-Mail` and `Mens E-Mail` to 0, so the control group used here also contains customers who received the men's e-mail." + ] + }, + { + "cell_type": "markdown", + "id": "0ee63ffc", + "metadata": {}, + "source": [ + "## Womens E-mail - No E-mail" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "00d59845", + "metadata": {}, + "outputs": [], + "source": [ + "# make treatment binary\n", + "treat_dict_womens = {\n", + " 'Womens E-Mail': 1,\n", + " 'No E-Mail': 0, \n", + " 'Mens E-Mail': 0\n", + " }\n", + "dataset.treatment_womens = dataset.treatment.map(treat_dict_womens)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "38975696", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset.treatment_womens.value_counts().plot(kind = 'bar', grid = 'True')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "30893b76", + "metadata": {}, + "outputs": [], + "source": [ + "stratify_cols = pd.concat([dataset.treatment_womens, dataset.target], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "bef56cf6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
segmentvisit
010
100
\n", + "
" + ], + "text/plain": [ + " segment visit\n", + "0 1 0\n", + "1 0 0" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stratify_cols.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "0e52cefe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train shape: (44800, 9)\n", + "Validation shape: (19200, 9)\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "stratify_cols = pd.concat([dataset.treatment_womens, dataset.target], axis=1)\n", + "\n", + "X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(\n", + " dataset.data,\n", + " dataset.treatment_womens,\n", + " dataset.target,\n", + " stratify=stratify_cols,\n", + " test_size=0.3,\n", + " random_state=42\n", + ")\n", + "\n", + "print(f\"Train shape: {X_train.shape}\")\n", + "print(f\"Validation shape: {X_val.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "c8b0523d", + "metadata": {}, + "outputs": [], + "source": [ + "from sklift.models import ClassTransformation\n", + "from catboost import CatBoostClassifier\n", + "\n", + "estimator = CatBoostClassifier(verbose=100, \n", + " cat_features=cat_features,\n", + " random_state=42,\n", + " thread_count=1\n", + " )\n", + "\n", + "ct_model = ClassTransformation(estimator=estimator)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "b6b47265", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":1: UserWarning: It is recommended to use this approach on treatment balanced data. 
Current sample size is unbalanced.\n", + " ct_model.fit(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Learning rate set to 0.052243\n", + "0:\tlearn: 0.6899802\ttotal: 438ms\tremaining: 7m 18s\n", + "100:\tlearn: 0.6609514\ttotal: 14.1s\tremaining: 2m 5s\n", + "200:\tlearn: 0.6590899\ttotal: 28.2s\tremaining: 1m 51s\n", + "300:\tlearn: 0.6567741\ttotal: 44.3s\tremaining: 1m 42s\n", + "400:\tlearn: 0.6542030\ttotal: 1m 1s\tremaining: 1m 31s\n", + "500:\tlearn: 0.6519431\ttotal: 1m 17s\tremaining: 1m 17s\n", + "600:\tlearn: 0.6499765\ttotal: 1m 33s\tremaining: 1m 2s\n", + "700:\tlearn: 0.6479570\ttotal: 1m 51s\tremaining: 47.6s\n", + "800:\tlearn: 0.6456324\ttotal: 2m 8s\tremaining: 31.9s\n", + "900:\tlearn: 0.6436541\ttotal: 2m 25s\tremaining: 16s\n", + "999:\tlearn: 0.6416725\ttotal: 2m 42s\tremaining: 0us\n" + ] + }, + { + "data": { + "text/plain": [ + "ClassTransformation(estimator=)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ct_model.fit(\n", + " X=X_train, \n", + " y=y_train, \n", + " treatment=trmnt_train\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "1eae8382", + "metadata": {}, + "outputs": [], + "source": [ + "from sklift.metrics import uplift_at_k" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "da7c489e", + "metadata": {}, + "outputs": [], + "source": [ + "uplift_predictions = ct_model.predict(X_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "56c39c27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "uplift@10%: 0.0162 (sort groups by uplift together)\n", + "uplift@10%: 0.0192 (sort groups by uplift separately)\n" + ] + } + ], + "source": [ + "# k = 10%\n", + "k = 0.1 \n", + "\n", + "# strategy='overall' sort by uplift treatment and control together\n", + "uplift_overall = uplift_at_k(y_val, uplift_predictions, trmnt_val, 
strategy='overall', k=k)\n", + "\n", + "# strategy='by_group' sort by uplift treatment and control separately\n", + "uplift_bygroup = uplift_at_k(y_val, uplift_predictions, trmnt_val, strategy='by_group', k=k)\n", + "\n", + "\n", + "print(f\"uplift@{k * 100:.0f}%: {uplift_overall:.4f} (sort groups by uplift together)\")\n", + "print(f\"uplift@{k * 100:.0f}%: {uplift_bygroup:.4f} (sort groups by uplift separately)\")" + ] + }, + { + "cell_type": "markdown", + "id": "e84867d2", + "metadata": {}, + "source": [ + "# 🎯 Adversarial validation " + ] + }, + { + "cell_type": "markdown", + "id": "42438771", + "metadata": {}, + "source": [ + "The general idea of adversarial validation is to check how similar the training and test sets are in terms of feature distribution: if a classifier finds them difficult to distinguish, their distributions are probably similar and the usual validation techniques should work." + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "bee7b6a2", + "metadata": {}, + "outputs": [], + "source": [ + "train_proportion = 0.7\n", + "train_test_cut = int(dataset.data.shape[0] * train_proportion)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "2bdbcf43", + "metadata": {}, + "outputs": [], + "source": [ + "dataset.data.loc[:train_test_cut, 'dataset_label'] = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "bd788227", + "metadata": {}, + "outputs": [], + "source": [ + "dataset.data.loc[train_test_cut:, 'dataset_label'] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "8075b15a", + "metadata": {}, + "outputs": [], + "source": [ + "target = 'dataset_label'" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "4b6c468f", + "metadata": {}, + "outputs": [], + "source": [ + "df_train = dataset.data[:train_test_cut] " + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "f840e966", + "metadata": {}, + "outputs": [], + "source": [ + "df_test = 
dataset.data[train_test_cut:] " + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "df188f77", + "metadata": {}, + "outputs": [], + "source": [ + "from catboost import Pool" + ] + }, + { + "cell_type": "markdown", + "id": "c7c7544c", + "metadata": {}, + "source": [ + "This is the target that we’ll train a model to predict. Right now, the train and test datasets are separate, and each dataset has only one label for the target value. If we trained a model on this training set, it would just learn that everything was 0. We want to instead shuffle the train and test datasets, and then create new datasets for fitting and evaluating the adversarial validation model. I define a function for combining, shuffling, and re-splitting:" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "8c96050e", + "metadata": {}, + "outputs": [], + "source": [ + "def create_adversarial_data(df_train, df_test, cols, N_val=6400):\n", + " df_master = pd.concat([df_train[cols], df_test[cols]], axis=0)\n", + " adversarial_val = df_master.sample(N_val, replace=False)\n", + " adversarial_train = df_master[~df_master.index.isin(adversarial_val.index)]\n", + " return adversarial_train, adversarial_val" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "80f446a3", + "metadata": {}, + "outputs": [], + "source": [ + "final_features = ['recency', 'history_segment', 'mens', 'womens', 'zip_code', 'newbie', 'channel', 'history_log']" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "e8c2cc41", + "metadata": {}, + "outputs": [], + "source": [ + "features = final_features\n", + "all_cols = features + [target]\n", + "adversarial_train, adversarial_test = create_adversarial_data(df_train, df_test, all_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "7e3e6574", + "metadata": {}, + "outputs": [], + "source": [ + "train_data = Pool(\n", + " data=adversarial_train[features],\n", + " 
label=adversarial_train[target],\n", + " cat_features=cat_features\n", + ")\n", + "holdout_data = Pool(\n", + " data=adversarial_test[features],\n", + " label=adversarial_test[target],\n", + " cat_features=cat_features\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "164d5393", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Learning rate set to 0.234819\n", + "0:\ttest: 0.5130548\tbest: 0.5130548 (0)\ttotal: 113ms\tremaining: 11.2s\n", + "1:\ttest: 0.5109823\tbest: 0.5130548 (0)\ttotal: 179ms\tremaining: 8.79s\n", + "2:\ttest: 0.5080080\tbest: 0.5130548 (0)\ttotal: 251ms\tremaining: 8.1s\n", + "3:\ttest: 0.5121898\tbest: 0.5130548 (0)\ttotal: 317ms\tremaining: 7.6s\n", + "4:\ttest: 0.5092711\tbest: 0.5130548 (0)\ttotal: 380ms\tremaining: 7.22s\n", + "5:\ttest: 0.5082763\tbest: 0.5130548 (0)\ttotal: 436ms\tremaining: 6.83s\n", + "6:\ttest: 0.5078603\tbest: 0.5130548 (0)\ttotal: 491ms\tremaining: 6.52s\n", + "7:\ttest: 0.5078408\tbest: 0.5130548 (0)\ttotal: 521ms\tremaining: 5.99s\n", + "8:\ttest: 0.5072985\tbest: 0.5130548 (0)\ttotal: 589ms\tremaining: 5.96s\n", + "9:\ttest: 0.5073745\tbest: 0.5130548 (0)\ttotal: 626ms\tremaining: 5.64s\n", + "10:\ttest: 0.5076422\tbest: 0.5130548 (0)\ttotal: 657ms\tremaining: 5.31s\n", + "11:\ttest: 0.5077096\tbest: 0.5130548 (0)\ttotal: 723ms\tremaining: 5.3s\n", + "12:\ttest: 0.5079760\tbest: 0.5130548 (0)\ttotal: 787ms\tremaining: 5.27s\n", + "13:\ttest: 0.5079760\tbest: 0.5130548 (0)\ttotal: 812ms\tremaining: 4.99s\n", + "14:\ttest: 0.5078911\tbest: 0.5130548 (0)\ttotal: 901ms\tremaining: 5.11s\n", + "15:\ttest: 0.5085221\tbest: 0.5130548 (0)\ttotal: 962ms\tremaining: 5.05s\n", + "16:\ttest: 0.5085221\tbest: 0.5130548 (0)\ttotal: 998ms\tremaining: 4.87s\n", + "17:\ttest: 0.5088082\tbest: 0.5130548 (0)\ttotal: 1.07s\tremaining: 4.87s\n", + "18:\ttest: 0.5102033\tbest: 0.5130548 (0)\ttotal: 1.11s\tremaining: 4.75s\n", + "19:\ttest: 
0.5102033\tbest: 0.5130548 (0)\ttotal: 1.14s\tremaining: 4.57s\n", + "20:\ttest: 0.5091191\tbest: 0.5130548 (0)\ttotal: 1.17s\tremaining: 4.39s\n", + "21:\ttest: 0.5099598\tbest: 0.5130548 (0)\ttotal: 1.23s\tremaining: 4.34s\n", + "22:\ttest: 0.5099598\tbest: 0.5130548 (0)\ttotal: 1.25s\tremaining: 4.18s\n", + "23:\ttest: 0.5124841\tbest: 0.5130548 (0)\ttotal: 1.31s\tremaining: 4.16s\n", + "24:\ttest: 0.5124841\tbest: 0.5130548 (0)\ttotal: 1.34s\tremaining: 4.02s\n", + "25:\ttest: 0.5124841\tbest: 0.5130548 (0)\ttotal: 1.37s\tremaining: 3.89s\n", + "26:\ttest: 0.5126382\tbest: 0.5130548 (0)\ttotal: 1.42s\tremaining: 3.85s\n", + "27:\ttest: 0.5127858\tbest: 0.5130548 (0)\ttotal: 1.48s\tremaining: 3.81s\n", + "28:\ttest: 0.5130221\tbest: 0.5130548 (0)\ttotal: 1.54s\tremaining: 3.77s\n", + "29:\ttest: 0.5125187\tbest: 0.5130548 (0)\ttotal: 1.61s\tremaining: 3.75s\n", + "30:\ttest: 0.5119062\tbest: 0.5130548 (0)\ttotal: 1.67s\tremaining: 3.71s\n", + "31:\ttest: 0.5119062\tbest: 0.5130548 (0)\ttotal: 1.71s\tremaining: 3.63s\n", + "32:\ttest: 0.5119065\tbest: 0.5130548 (0)\ttotal: 1.76s\tremaining: 3.58s\n", + "33:\ttest: 0.5119168\tbest: 0.5130548 (0)\ttotal: 1.83s\tremaining: 3.54s\n", + "34:\ttest: 0.5119168\tbest: 0.5130548 (0)\ttotal: 1.88s\tremaining: 3.49s\n", + "35:\ttest: 0.5119277\tbest: 0.5130548 (0)\ttotal: 1.94s\tremaining: 3.44s\n", + "36:\ttest: 0.5143258\tbest: 0.5143258 (36)\ttotal: 2s\tremaining: 3.41s\n", + "37:\ttest: 0.5145906\tbest: 0.5145906 (37)\ttotal: 2.07s\tremaining: 3.37s\n", + "38:\ttest: 0.5150807\tbest: 0.5150807 (38)\ttotal: 2.12s\tremaining: 3.32s\n", + "39:\ttest: 0.5150807\tbest: 0.5150807 (38)\ttotal: 2.15s\tremaining: 3.22s\n", + "40:\ttest: 0.5150807\tbest: 0.5150807 (38)\ttotal: 2.17s\tremaining: 3.12s\n", + "41:\ttest: 0.5139849\tbest: 0.5150807 (38)\ttotal: 2.23s\tremaining: 3.08s\n", + "42:\ttest: 0.5136584\tbest: 0.5150807 (38)\ttotal: 2.29s\tremaining: 3.04s\n", + "43:\ttest: 0.5139842\tbest: 0.5150807 (38)\ttotal: 
2.36s\tremaining: 3.01s\n", + "44:\ttest: 0.5153170\tbest: 0.5153170 (44)\ttotal: 2.43s\tremaining: 2.97s\n", + "45:\ttest: 0.5150217\tbest: 0.5153170 (44)\ttotal: 2.49s\tremaining: 2.93s\n", + "46:\ttest: 0.5151375\tbest: 0.5153170 (44)\ttotal: 2.56s\tremaining: 2.89s\n", + "47:\ttest: 0.5155845\tbest: 0.5155845 (47)\ttotal: 2.61s\tremaining: 2.83s\n", + "48:\ttest: 0.5156149\tbest: 0.5156149 (48)\ttotal: 2.67s\tremaining: 2.78s\n", + "49:\ttest: 0.5170975\tbest: 0.5170975 (49)\ttotal: 2.73s\tremaining: 2.73s\n", + "50:\ttest: 0.5172240\tbest: 0.5172240 (50)\ttotal: 2.8s\tremaining: 2.69s\n", + "51:\ttest: 0.5172212\tbest: 0.5172240 (50)\ttotal: 2.85s\tremaining: 2.63s\n", + "52:\ttest: 0.5172322\tbest: 0.5172322 (52)\ttotal: 2.91s\tremaining: 2.58s\n", + "53:\ttest: 0.5166552\tbest: 0.5172322 (52)\ttotal: 2.97s\tremaining: 2.53s\n", + "54:\ttest: 0.5166552\tbest: 0.5172322 (52)\ttotal: 3s\tremaining: 2.46s\n", + "55:\ttest: 0.5148724\tbest: 0.5172322 (52)\ttotal: 3.06s\tremaining: 2.4s\n", + "56:\ttest: 0.5147731\tbest: 0.5172322 (52)\ttotal: 3.12s\tremaining: 2.35s\n", + "57:\ttest: 0.5149763\tbest: 0.5172322 (52)\ttotal: 3.17s\tremaining: 2.3s\n", + "58:\ttest: 0.5149423\tbest: 0.5172322 (52)\ttotal: 3.24s\tremaining: 2.25s\n", + "59:\ttest: 0.5155427\tbest: 0.5172322 (52)\ttotal: 3.3s\tremaining: 2.2s\n", + "60:\ttest: 0.5155427\tbest: 0.5172322 (52)\ttotal: 3.36s\tremaining: 2.15s\n", + "61:\ttest: 0.5155542\tbest: 0.5172322 (52)\ttotal: 3.42s\tremaining: 2.1s\n", + "62:\ttest: 0.5156125\tbest: 0.5172322 (52)\ttotal: 3.48s\tremaining: 2.05s\n", + "63:\ttest: 0.5154880\tbest: 0.5172322 (52)\ttotal: 3.55s\tremaining: 2s\n", + "64:\ttest: 0.5140872\tbest: 0.5172322 (52)\ttotal: 3.6s\tremaining: 1.94s\n", + "65:\ttest: 0.5139578\tbest: 0.5172322 (52)\ttotal: 3.66s\tremaining: 1.89s\n", + "66:\ttest: 0.5134700\tbest: 0.5172322 (52)\ttotal: 3.72s\tremaining: 1.83s\n", + "67:\ttest: 0.5134209\tbest: 0.5172322 (52)\ttotal: 3.79s\tremaining: 1.78s\n", + "68:\ttest: 
0.5137175\tbest: 0.5172322 (52)\ttotal: 3.85s\tremaining: 1.73s\n", + "69:\ttest: 0.5145397\tbest: 0.5172322 (52)\ttotal: 3.91s\tremaining: 1.67s\n", + "70:\ttest: 0.5151316\tbest: 0.5172322 (52)\ttotal: 3.97s\tremaining: 1.62s\n", + "71:\ttest: 0.5151316\tbest: 0.5172322 (52)\ttotal: 4.03s\tremaining: 1.57s\n", + "72:\ttest: 0.5154323\tbest: 0.5172322 (52)\ttotal: 4.09s\tremaining: 1.51s\n", + "73:\ttest: 0.5154323\tbest: 0.5172322 (52)\ttotal: 4.14s\tremaining: 1.46s\n", + "74:\ttest: 0.5154456\tbest: 0.5172322 (52)\ttotal: 4.21s\tremaining: 1.4s\n", + "75:\ttest: 0.5154456\tbest: 0.5172322 (52)\ttotal: 4.27s\tremaining: 1.35s\n", + "76:\ttest: 0.5156375\tbest: 0.5172322 (52)\ttotal: 4.32s\tremaining: 1.29s\n", + "77:\ttest: 0.5158455\tbest: 0.5172322 (52)\ttotal: 4.38s\tremaining: 1.24s\n", + "78:\ttest: 0.5158064\tbest: 0.5172322 (52)\ttotal: 4.44s\tremaining: 1.18s\n", + "79:\ttest: 0.5158064\tbest: 0.5172322 (52)\ttotal: 4.51s\tremaining: 1.13s\n", + "80:\ttest: 0.5164766\tbest: 0.5172322 (52)\ttotal: 4.57s\tremaining: 1.07s\n", + "81:\ttest: 0.5137400\tbest: 0.5172322 (52)\ttotal: 4.63s\tremaining: 1.01s\n", + "82:\ttest: 0.5150029\tbest: 0.5172322 (52)\ttotal: 4.69s\tremaining: 960ms\n", + "83:\ttest: 0.5159938\tbest: 0.5172322 (52)\ttotal: 4.75s\tremaining: 905ms\n", + "84:\ttest: 0.5133364\tbest: 0.5172322 (52)\ttotal: 4.81s\tremaining: 849ms\n", + "85:\ttest: 0.5127295\tbest: 0.5172322 (52)\ttotal: 4.87s\tremaining: 792ms\n", + "86:\ttest: 0.5128319\tbest: 0.5172322 (52)\ttotal: 4.93s\tremaining: 738ms\n", + "87:\ttest: 0.5128633\tbest: 0.5172322 (52)\ttotal: 5.01s\tremaining: 683ms\n", + "88:\ttest: 0.5127721\tbest: 0.5172322 (52)\ttotal: 5.07s\tremaining: 627ms\n", + "89:\ttest: 0.5128172\tbest: 0.5172322 (52)\ttotal: 5.14s\tremaining: 571ms\n", + "90:\ttest: 0.5128195\tbest: 0.5172322 (52)\ttotal: 5.21s\tremaining: 515ms\n", + "91:\ttest: 0.5128195\tbest: 0.5172322 (52)\ttotal: 5.27s\tremaining: 458ms\n", + "92:\ttest: 0.5126648\tbest: 0.5172322 
(52)\ttotal: 5.36s\tremaining: 403ms\n", + "93:\ttest: 0.5122866\tbest: 0.5172322 (52)\ttotal: 5.45s\tremaining: 348ms\n", + "94:\ttest: 0.5119554\tbest: 0.5172322 (52)\ttotal: 5.54s\tremaining: 291ms\n", + "95:\ttest: 0.5118455\tbest: 0.5172322 (52)\ttotal: 5.6s\tremaining: 233ms\n", + "96:\ttest: 0.5119052\tbest: 0.5172322 (52)\ttotal: 5.67s\tremaining: 175ms\n", + "97:\ttest: 0.5125816\tbest: 0.5172322 (52)\ttotal: 5.72s\tremaining: 117ms\n", + "98:\ttest: 0.5126916\tbest: 0.5172322 (52)\ttotal: 5.78s\tremaining: 58.4ms\n", + "99:\ttest: 0.5133625\tbest: 0.5172322 (52)\ttotal: 5.84s\tremaining: 0us\n", + "\n", + "bestTest = 0.5172322343\n", + "bestIteration = 52\n", + "\n", + "Shrink model to first 53 iterations.\n" + ] + } + ], + "source": [ + "params = {\n", + " 'iterations': 100,\n", + " 'eval_metric': 'AUC',\n", + " 'od_type': 'Iter',\n", + " 'od_wait': 50,\n", + "}\n", + "\n", + "model = CatBoostClassifier(**params)\n", + "_ = model.fit(train_data, eval_set=holdout_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "34ca0539", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import roc_curve, roc_auc_score" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "49bd16cf", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_roc(y_trues, y_preds, labels, x_max=1.0):\n", + " fig, ax = plt.subplots()\n", + " for i, y_pred in enumerate(y_preds):\n", + " y_true = y_trues[i]\n", + " fpr, tpr, thresholds = roc_curve(y_true, y_pred)\n", + " auc = roc_auc_score(y_true, y_pred)\n", + " ax.plot(fpr, tpr, label='%s; AUC=%.3f' % (labels[i], auc), marker='o', markersize=1)\n", + "\n", + " ax.legend()\n", + " ax.grid()\n", + " ax.plot(np.linspace(0, 1, 20), np.linspace(0, 1, 20), linestyle='--')\n", + " ax.set_title('ROC curve')\n", + " ax.set_xlabel('False Positive Rate')\n", + " ax.set_xlim([-0.01, x_max])\n", + " _ = ax.set_ylabel('True Positive Rate')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 124, + "id": "be78be7c", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_roc(\n", + " [holdout_data.get_label()],\n", + " [model.predict_proba(holdout_data)[:,1]],\n", + " ['Baseline']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f1539290", + "metadata": {}, + "source": [ + "The ROC curve above is computed on the held-out adversarial validation set. Its AUC is close to 0.5 (the best value in the training log is about 0.517), which means the classifier can barely distinguish train rows from test rows, so the two sets most likely come from the same distribution.\n" + ] + }, + { + "cell_type": "markdown", + "id": "cf20f332", + "metadata": {}, + "source": [ + "`So, in this case, the usual stratified train/validation split should work fine, and the model should perform similarly on the unseen test set`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7da9aae1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}