
Commit 02f0745

Update IOW notebook with basic model training
1 parent 16dbc6b commit 02f0745

File tree: 04_internet_of_wands/internet_of_wands.ipynb

1 file changed: 270 additions & 13 deletions
@@ -198,7 +198,7 @@
 },
 "outputs": [],
 "source": [
-"!wget \"https://github.com/pablodecm/datalab_ml_iot/raw/master/04_internet_of_wands/iow_data.zip\"; unzip -o iow_data.zip"
+"!wget \"https://github.com/pablodecm/datalab_ml_iot/raw/master/04_internet_of_wands/iow_data.zip\"; unzip -qq -o iow_data.zip"
 ]
 },
 {
@@ -235,7 +235,7 @@
 "import matplotlib.pyplot as plt\n",
 "import warnings\n",
 "from sklearn.model_selection import train_test_split\n",
-"#warnings.filterwarnings(\"ignore\")\n",
+"warnings.filterwarnings(\"ignore\")\n",
 "\n",
 "example_file = \"iow_data/wingardium-leviosa/Peppapig_9b2bd7a9.0696f8.json\"\n",
 "\n",
@@ -331,6 +331,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# we can now plot the resampled sensor data\n",
 "fig, axs = plt.subplots(2, figsize=(12,12))\n",
 "\n",
 "merged_df.filter(regex=\"._accel\").plot(ax=axs[0])\n",
@@ -401,6 +402,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# and apply it to all the data\n",
 "md_fields = [\"spell_select\",\"device_select\",\"wizard_name\"]\n",
 "data_path = Path(\"./iow_data\")\n",
 "\n",
@@ -424,6 +426,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# this is a dataframe with a multi-index (spell_select, device_select, wizard_name, spell_id, timestamp)\n",
 "all_df = pd.concat(merged_df_dict, names = (md_fields+ [\"spell_id\"]))\n",
 "all_df"
 ]
@@ -445,7 +448,8 @@
445448
"\n",
446449
"var_name = \"y_accel\"\n",
447450
"wizard_name = \"pablodecm\"\n",
448-
"spell_name = \"reparo\"\n",
451+
"spell_name = \"alohomora\"\n",
452+
"\n",
449453
"subset_df = all_df.loc[(spell_name,slice(None),wizard_name),var_name]\n",
450454
"\n",
451455
"\n",
@@ -518,7 +522,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# further exploratory data analysis"
+"# feel free to carry out further exploratory data analysis"
 ]
 },
 {
@@ -534,7 +538,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"\n",
+"# clean up the data a little\n",
 "# get only spells that last more than 400 ms\n",
 "long_spells = all_df.groupby(\"spell_id\").count() > 20\n",
 "long_spells = long_spells[long_spells].dropna().index"
@@ -552,6 +556,24 @@
 "valid_df = all_df.loc[(slice(None),slice(None),slice(None),list(valid_subset)),:]"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"train_df"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"valid_df"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -577,8 +599,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# use the training dataframe to create a model\n",
-"train_df"
+"# a single row (spell) is characterized by 6 sensor recordings and a variable number of timesteps\n",
+"valid_df.loc[(slice(None),slice(None),slice(None),\"lumos/Voldemort_525eda85.587874\")].reset_index(drop=True).plot()"
 ]
 },
 {
@@ -587,16 +609,251 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# evaluate the model in the validation dataset\n"
+"# these are the raw features; however, the number of timesteps per spell is variable\n",
+"features = [ f\"{i}_accel\" for i in [\"x\", \"y\",\"z\"]] + [ f\"{i}_gyro\" for i in [\"x\", \"y\",\"z\"]]\n",
+"features"
 ]
 },
 {
-"cell_type": "markdown",
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# as a first fixed-length representation, we can use the mean of each sensor channel as a feature\n",
+"train_df_mean = train_df.loc[(slice(None),slice(None),slice(None),slice(None)),:].groupby(\"spell_id\").mean().loc[:, features]\n",
+"train_df_mean.columns = [f\"{f}_mean\" for f in features]\n",
+"valid_df_mean = valid_df.loc[(slice(None),slice(None),slice(None),slice(None)),:].groupby(\"spell_id\").mean().loc[:, features]\n",
+"valid_df_mean.columns = [f\"{f}_mean\" for f in features]"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we can optionally complement them with the standard deviation of each channel\n",
+"train_df_std = train_df.loc[(slice(None),slice(None),slice(None),slice(None)),:].groupby(\"spell_id\").std().loc[:, features]\n",
+"train_df_std.columns = [f\"{f}_std\" for f in features]\n",
+"valid_df_std = valid_df.loc[(slice(None),slice(None),slice(None),slice(None)),:].groupby(\"spell_id\").std().loc[:, features]\n",
+"valid_df_std.columns = [f\"{f}_std\" for f in features]\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# example of features\n",
+"train_df_std.head()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we can create a simplified training dataset by concatenating both\n",
+"train_df_extra = pd.concat([train_df_mean,train_df_std], axis=1)\n",
+"valid_df_extra = pd.concat([valid_df_mean,valid_df_std], axis=1)"
+]
+},
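+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# as a quick sanity check, we could confirm the shapes: one row per spell, 12 summary features\n",
+"train_df_extra.shape, valid_df_extra.shape"
+]
+},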
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# the training data will look like this\n",
+"train_df_extra.head()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we can also obtain the category label from the spell_id index (its prefix is the spell name)\n",
+"label_assign = { \"alohomora\" : 0, \"lumos\" : 1, \"wingardium-leviosa\" : 2, \"reparo\" : 3}\n",
+"train_y = train_df_extra.reset_index().spell_id.str.split(\"/\").apply(lambda x: label_assign[x[0]])\n",
+"valid_y = valid_df_extra.reset_index().spell_id.str.split(\"/\").apply(lambda x: label_assign[x[0]])"
+]
+},
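+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we could also check how balanced the classes are before training\n",
+"train_y.value_counts()"
+]
+},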
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from sklearn.model_selection import KFold, GridSearchCV\n",
+"from sklearn.ensemble import GradientBoostingClassifier\n",
+"\n",
+"# a gradient boosting classifier with default hyper-parameters\n",
+"gb_clf = GradientBoostingClassifier()\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we can train the classifier\n",
+"gb_clf.fit(train_df_extra, train_y)"
+]
+},
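+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# once fitted, we could inspect which summary features the model relies on most\n",
+"pd.Series(gb_clf.feature_importances_, index=train_df_extra.columns).sort_values(ascending=False)"
+]
+},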
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, confusion_matrix\n",
+"from sklearn.metrics import classification_report"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we can get probability predictions (one column per class)\n",
+"gb_clf.predict_proba(train_df_extra)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# and compute some metrics on the training dataset (not to be trusted)\n",
+"y_train_clf_proba = gb_clf.predict_proba(train_df_extra)\n",
+"y_train_clf_pred = gb_clf.predict(train_df_extra)\n",
+"\n",
+"print(\"Confusion Matrix:\")\n",
+"print(confusion_matrix(train_y,y_train_clf_pred))\n",
+"print(\"Gradient Boosting Classifier Accuracy: \"+\"{:.1%}\".format(accuracy_score(train_y,y_train_clf_pred)));\n",
+"print(\"Classification Report:\")\n",
+"print(classification_report(train_y,y_train_clf_pred))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# the validation set gives a more honest estimate of performance\n",
+"y_valid_clf_proba = gb_clf.predict_proba(valid_df_extra)\n",
+"y_valid_clf_pred = gb_clf.predict(valid_df_extra)\n",
+"\n",
+"print(\"Confusion Matrix:\")\n",
+"print(confusion_matrix(valid_y,y_valid_clf_pred))\n",
+"print(\"Gradient Boosting Classifier Accuracy: \"+\"{:.1%}\".format(accuracy_score(valid_y,y_valid_clf_pred)));\n",
+"print(\"Classification Report:\")\n",
+"print(classification_report(valid_y,y_valid_clf_pred))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we can run a grid search to look for a better hyper-parameter combination\n",
+"from sklearn.model_selection import KFold, GridSearchCV\n",
+"from sklearn.ensemble import GradientBoostingClassifier\n",
+"\n",
+"gb_clf = GradientBoostingClassifier()\n",
+"\n",
+"# simple 3-fold cross-validation over the training set\n",
+"cv = KFold(3)\n",
+"\n",
+"param_grid = { \"n_estimators\" : [100, 130, 150, 180, 200],\n",
+"               \"learning_rate\" : [0.05, 0.07, 0.1]\n",
+"             }\n",
+"\n",
+"optimized_gb_clf = GridSearchCV(estimator=gb_clf,\n",
+"                                cv = cv,\n",
+"                                param_grid=param_grid,\n",
+"                                verbose = 1,\n",
+"                                n_jobs = -1)\n",
+"\n",
+"# fit the grid search; the best model is then refit on the full training set\n",
+"optimized_gb_clf.fit(train_df_extra, train_y)"
+]
+},
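+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we could inspect the best hyper-parameter combination and its cross-validated score\n",
+"optimized_gb_clf.best_params_, optimized_gb_clf.best_score_"
+]
+},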
+{
+"cell_type": "code",
+"execution_count": null,
 "metadata": {},
+"outputs": [],
 "source": [
-"## Evaluate the Performance in the Test Dataset\n",
+"# compute the metrics on the validation set again, now for the optimized model\n",
+"y_valid_clf_proba = optimized_gb_clf.predict_proba(valid_df_extra)\n",
+"y_valid_clf_pred = optimized_gb_clf.predict(valid_df_extra)\n",
 "\n",
-"Finally, we can evaluate the final performance on our holdout test dataset."
+"print(\"Confusion Matrix:\")\n",
+"print(confusion_matrix(valid_y,y_valid_clf_pred))\n",
+"print(\"Gradient Boosting Classifier Accuracy: \"+\"{:.1%}\".format(accuracy_score(valid_y,y_valid_clf_pred)));\n",
+"print(\"Classification Report:\")\n",
+"print(classification_report(valid_y,y_valid_clf_pred))"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"y_valid_clf_pred"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# boolean mask of the misclassified validation spells\n",
+"valid_y != y_valid_clf_pred"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# it is possible that some of the mistakes are due to\n",
+"# incorrect data, so we could explore some of the misclassified spells\n",
+"wrongly_classified_ids = (valid_df_extra.loc[(valid_y != y_valid_clf_pred).values]).index\n",
+"wrongly_classified_ids"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# we could check a few of the incorrectly classified spells to see what their recordings look like\n",
+"spell_id = 'reparo/Serious_4d7e9d0d.fa4574'\n",
+"valid_df.loc[(slice(None),slice(None),slice(None), spell_id)].reset_index(drop=True).plot()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# feel free to carry out additional work and/or train additional models"
 ]
 },
 {
@@ -638,7 +895,7 @@
 "metadata": {
 "celltoolbar": "Slideshow",
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
@@ -652,7 +909,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.8"
+"version": "3.8.12"
 },
 "rise": {
 "theme": "black"
