Added notebooks for training and scoring a Diabetes Ridge regression model (#145)

bjcmit · dtzar · sudivate · commit faeeef98e019 · 2020-01-22T14:27:00.000-08:00
* Added notebooks for training and scoring a Ridge regression model with the Diabetes dataset

* Fixed joblib import and remove request headers

Co-authored-by: David Tesar &lt;david.tesar@microsoft.com&gt;
diff --git a/experimentation/Diabetes Ridge Regression Scoring.ipynb b/experimentation/Diabetes Ridge Regression Scoring.ipynb
@@ -0,0 +1,114 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Score Data with a Ridge Regression Model Trained on the Diabetes Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook loads the model trained in the Diabetes Ridge Regression Training notebook, prepares the data, and scores the data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import numpy\n",
+    "from azureml.core.model import Model\n",
+    "import joblib"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_path = Model.get_model_path(model_name=\"sklearn_regression_model.pkl\")\n",
+    "model = joblib.load(model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_data = '{\"data\":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}'\n",
+    "\n",
+    "data = json.loads(raw_data)[\"data\"]\n",
+    "data = numpy.array(data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Score Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test result:  {'result': [5113.099642122813, 3713.6329271385353]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "request_headers = {}\n",
+    "\n",
+    "result = model.predict(data)\n",
+    "print(\"Test result: \", {\"result\": result.tolist()})"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (storedna)",
+   "language": "python",
+   "name": "storedna"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb
@@ -0,0 +1,171 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train a Ridge Regression Model on the Diabetes Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook loads the Diabetes dataset from sklearn, splits the data into training and validation sets, trains a Ridge regression model, validates the model on the validation set, and saves the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.datasets import load_diabetes\n",
+    "from sklearn.linear_model import Ridge\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import joblib"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = load_diabetes(return_X_y=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Split Data into Training and Validation Sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
+    "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
+    "        \"test\": {\"X\": X_test, \"y\": y_test}}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train Model on Training Set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,\n",
+       "      normalize=False, random_state=None, solver='auto', tol=0.001)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "alpha = 0.5\n",
+    "\n",
+    "reg = Ridge(alpha=alpha)\n",
+    "reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Validate Model on Validation Set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mse:  3298.9096058070622\n"
+     ]
+    }
+   ],
+   "source": [
+    "preds = reg.predict(data[\"test\"][\"X\"])\n",
+    "print(\"mse: \", mean_squared_error(preds, y_test))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['sklearn_regression_model.pkl']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_name = \"sklearn_regression_model.pkl\"\n",
+    "\n",
+    "joblib.dump(value=reg, filename=model_name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (storedna)",
+   "language": "python",
+   "name": "storedna"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}