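Update AutoGluon settings per best-practice guidelines: use presets="best_quality" instead of auto_stack=True and raise the time limit from 120 s to 300 s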

Sergey Feldman · Sergey Feldman · commit a4313e4eeddb · 2020-12-31T14:54:18.000-08:00
diff --git a/03_autogluon.py b/03_autogluon.py
@@ -9,7 +9,7 @@
 from utils import load_data
 
 
-SEC = 120
+SEC = 60 * 5
 
 
 def define_and_evaluate_autogluon_pipeline(X, y, random_state=0):
@@ -31,7 +31,7 @@ def define_and_evaluate_autogluon_pipeline(X, y, random_state=0):
             data_df_train,
             "y",
             time_limits=SEC,
-            auto_stack=True,
+            presets="best_quality",
             output_directory=".autogluon_temp",
             eval_metric=eval_metric,
             problem_type=problem_type,
diff --git a/make_figures.ipynb b/make_figures.ipynb
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -91,7 +91,7 @@
       "Number of datasets each algorithm does best on:\n",
       "Counter({'AutoGluon (sec=120)': 84, 'AutoGluon (sec=60)': 74, 'LightGBM (n_iter=25)': 74, 'LightGBM (n_iter=10)': 68, 'Logistic Regression': 64, 'Random Forest': 64, 'SVC': 35}) \n",
       "\n",
-      "Average performance for each algorithm: model\n",
+      "Average performance for each model\n",
       "AutoGluon (sec=120)     0.887491\n",
       "AutoGluon (sec=60)      0.886326\n",
       "LightGBM (n_iter=10)    0.886359\n",
@@ -101,7 +101,7 @@
       "SVC                     0.852368\n",
       "Name: mean_auroc, dtype: float64 \n",
       "\n",
-      "Median performance for each algorithm: model\n",
+      "Median performance for each model\n",
       "AutoGluon (sec=120)     0.924359\n",
       "AutoGluon (sec=60)      0.925754\n",
       "LightGBM (n_iter=10)    0.924920\n",
@@ -124,8 +124,8 @@
     "\n",
     "print('Number of datasets each algorithm does best on:')\n",
     "print(Counter(winning_algorithms), '\\n')\n",
-    "print('Average performance for each algorithm:', results_df.groupby('model')['mean_auroc'].mean(), '\\n')\n",
-    "print('Median performance for each algorithm:', results_df.groupby('model')['mean_auroc'].median())"
+    "print('Average performance for each', results_df.groupby('model')['mean_auroc'].mean(), '\\n')\n",
+    "print('Median performance for each', results_df.groupby('model')['mean_auroc'].median())"
    ]
   },
   {
@@ -242,6 +242,250 @@
     "g.set(xscale=\"log\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dataset\n",
+       "iris                           0.000000\n",
+       "robot-nav-sensor-readings-2    0.000000\n",
+       "robot-nav-sensor-readings-4    0.000000\n",
+       "hayes-roth                     0.000000\n",
+       "banknote-authentication        0.000000\n",
+       "                                 ...   \n",
+       "thoracic-surgery               0.014894\n",
+       "leukemia-haslinger             0.022436\n",
+       "autoUniv-au7-cpd1-500          0.022964\n",
+       "planning-relax                 0.051938\n",
+       "meta-data                      0.324029\n",
+       "Name: mean_auroc, Length: 142, dtype: float64"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results_df.groupby('dataset')['mean_auroc'].apply(lambda x: np.sort(x)[-1] - np.sort(x)[-2]).sort_values()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>auroc_split_1</th>\n",
+       "      <th>auroc_split_2</th>\n",
+       "      <th>auroc_split_3</th>\n",
+       "      <th>auroc_split_4</th>\n",
+       "      <th>model</th>\n",
+       "      <th>nrow</th>\n",
+       "      <th>ncol</th>\n",
+       "      <th>mv</th>\n",
+       "      <th>ir</th>\n",
+       "      <th>class</th>\n",
+       "      <th>mean_auroc</th>\n",
+       "      <th>min_auroc</th>\n",
+       "      <th>max_auroc</th>\n",
+       "      <th>std_auroc</th>\n",
+       "      <th>dataset</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>planning-relax</th>\n",
+       "      <td>0.648019</td>\n",
+       "      <td>0.594406</td>\n",
+       "      <td>0.358173</td>\n",
+       "      <td>0.531250</td>\n",
+       "      <td>SVC</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.532962</td>\n",
+       "      <td>0.358173</td>\n",
+       "      <td>0.648019</td>\n",
+       "      <td>0.125920</td>\n",
+       "      <td>planning-relax</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>planning-relax</th>\n",
+       "      <td>0.375291</td>\n",
+       "      <td>0.356643</td>\n",
+       "      <td>0.305288</td>\n",
+       "      <td>0.497596</td>\n",
+       "      <td>Logistic Regression</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.383705</td>\n",
+       "      <td>0.305288</td>\n",
+       "      <td>0.497596</td>\n",
+       "      <td>0.081493</td>\n",
+       "      <td>planning-relax</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>planning-relax</th>\n",
+       "      <td>0.341492</td>\n",
+       "      <td>0.403263</td>\n",
+       "      <td>0.268029</td>\n",
+       "      <td>0.413462</td>\n",
+       "      <td>Random Forest</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.356561</td>\n",
+       "      <td>0.268029</td>\n",
+       "      <td>0.413462</td>\n",
+       "      <td>0.067042</td>\n",
+       "      <td>planning-relax</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>planning-relax</th>\n",
+       "      <td>0.393939</td>\n",
+       "      <td>0.550117</td>\n",
+       "      <td>0.268029</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>LightGBM (n_iter=10)</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.428021</td>\n",
+       "      <td>0.268029</td>\n",
+       "      <td>0.550117</td>\n",
+       "      <td>0.124963</td>\n",
+       "      <td>planning-relax</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>planning-relax</th>\n",
+       "      <td>0.493007</td>\n",
+       "      <td>0.589744</td>\n",
+       "      <td>0.341346</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>LightGBM (n_iter=25)</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.481024</td>\n",
+       "      <td>0.341346</td>\n",
+       "      <td>0.589744</td>\n",
+       "      <td>0.103011</td>\n",
+       "      <td>planning-relax</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>planning-relax</th>\n",
+       "      <td>0.333333</td>\n",
+       "      <td>0.496503</td>\n",
+       "      <td>0.367788</td>\n",
+       "      <td>0.514423</td>\n",
+       "      <td>AutoGluon (sec=60)</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.428012</td>\n",
+       "      <td>0.333333</td>\n",
+       "      <td>0.514423</td>\n",
+       "      <td>0.090827</td>\n",
+       "      <td>planning-relax</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>planning-relax</th>\n",
+       "      <td>0.365967</td>\n",
+       "      <td>0.463869</td>\n",
+       "      <td>0.382212</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>AutoGluon (sec=120)</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.428012</td>\n",
+       "      <td>0.365967</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0.064331</td>\n",
+       "      <td>planning-relax</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                auroc_split_1  auroc_split_2  auroc_split_3  auroc_split_4  \\\n",
+       "planning-relax       0.648019       0.594406       0.358173       0.531250   \n",
+       "planning-relax       0.375291       0.356643       0.305288       0.497596   \n",
+       "planning-relax       0.341492       0.403263       0.268029       0.413462   \n",
+       "planning-relax       0.393939       0.550117       0.268029       0.500000   \n",
+       "planning-relax       0.493007       0.589744       0.341346       0.500000   \n",
+       "planning-relax       0.333333       0.496503       0.367788       0.514423   \n",
+       "planning-relax       0.365967       0.463869       0.382212       0.500000   \n",
+       "\n",
+       "                               model   nrow  ncol   mv        ir  class  \\\n",
+       "planning-relax                   SVC  182.0  13.0  0.0  0.714286    2.0   \n",
+       "planning-relax   Logistic Regression  182.0  13.0  0.0  0.714286    2.0   \n",
+       "planning-relax         Random Forest  182.0  13.0  0.0  0.714286    2.0   \n",
+       "planning-relax  LightGBM (n_iter=10)  182.0  13.0  0.0  0.714286    2.0   \n",
+       "planning-relax  LightGBM (n_iter=25)  182.0  13.0  0.0  0.714286    2.0   \n",
+       "planning-relax    AutoGluon (sec=60)  182.0  13.0  0.0  0.714286    2.0   \n",
+       "planning-relax   AutoGluon (sec=120)  182.0  13.0  0.0  0.714286    2.0   \n",
+       "\n",
+       "                mean_auroc  min_auroc  max_auroc  std_auroc         dataset  \n",
+       "planning-relax    0.532962   0.358173   0.648019   0.125920  planning-relax  \n",
+       "planning-relax    0.383705   0.305288   0.497596   0.081493  planning-relax  \n",
+       "planning-relax    0.356561   0.268029   0.413462   0.067042  planning-relax  \n",
+       "planning-relax    0.428021   0.268029   0.550117   0.124963  planning-relax  \n",
+       "planning-relax    0.481024   0.341346   0.589744   0.103011  planning-relax  \n",
+       "planning-relax    0.428012   0.333333   0.514423   0.090827  planning-relax  \n",
+       "planning-relax    0.428012   0.365967   0.500000   0.064331  planning-relax  "
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results_df.loc['planning-relax']"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,