MicrosoftDocs
diff --git a/learn-pr/azure/optimize-model-performance-roc-auc/notebooks/9-3-evaluate-roc-curves.ipynb b/learn-pr/azure/optimize-model-performance-roc-auc/notebooks/9-3-evaluate-roc-curves.ipynb
Lines changed: 110 additions & 40 deletions
@@ -19,12 +19,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import numpy\n",
+    "import numpy as np\n",
     "import pandas\n",
     "!pip install statsmodels\n",
     "!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/graphing.py\n",
     "!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/hiker_or_tree.csv\n",
-    "import graphing # custom graphing code. See our GitHub repo for details\n",
+    "import matplotlib.pyplot as plt\n",
     "import sklearn.model_selection\n",
     "\n",
     "# Load our data from disk\n",
@@ -33,10 +33,30 @@
     "# Split into train and test\n",
     "train, test =  sklearn.model_selection.train_test_split(df, test_size=0.5, random_state=1)\n",
     "\n",
-    "# Graph our three features\n",
-    "graphing.histogram(test, label_x=\"height\", label_colour=\"is_hiker\", show=True)\n",
-    "graphing.multiple_histogram(test, label_x=\"motion\", label_group=\"is_hiker\", nbins=12, show=True)\n",
-    "graphing.multiple_histogram(test, label_x=\"texture\", label_group=\"is_hiker\", nbins=12)"
+    "# Define a helper function to plot histograms by class\n",
+    "def plot_histogram_by_group(data, column, group_column, bins=12):\n",
+    "    \"\"\"Overlay one histogram of `column` for each distinct value of `group_column`.\n",
+    "\n",
+    "    data: DataFrame holding both columns.\n",
+    "    column: name of the numeric column to bin.\n",
+    "    group_column: name of the column whose distinct values define the groups.\n",
+    "    bins: bin count (or explicit edges) used to build the shared bin edges.\n",
+    "    \"\"\"\n",
+    "    # Compute the bin edges once from the whole column so every group is\n",
+    "    # drawn with identical bins and the bar heights are directly comparable.\n",
+    "    # (Passing an integer to plt.hist for each group would re-bin every\n",
+    "    # subset over its own min/max range.)\n",
+    "    edges = np.histogram_bin_edges(data[column].dropna(), bins=bins)\n",
+    "    for group in data[group_column].unique():\n",
+    "        subset = data.loc[data[group_column] == group, column]\n",
+    "        plt.hist(subset, bins=edges, alpha=0.5, label=f'{group_column}={group}')\n",
+    "    plt.xlabel(column)\n",
+    "    plt.ylabel('Count')\n",
+    "    plt.title(f'{column} by {group_column}')\n",
+    "    plt.legend()\n",
+    "    plt.grid(True)\n",
+    "    plt.show()\n",
+    "\n",
+    "# Plot every feature split by \"is_hiker\" so we can see how well each\n",
+    "# one separates hikers from trees. \"height\" goes through the same helper\n",
+    "# as \"motion\" and \"texture\" so its histogram is also broken down by class.\n",
+    "for feature in (\"height\", \"motion\", \"texture\"):\n",
+    "    plot_histogram_by_group(test, feature, \"is_hiker\", bins=12)"
    ]
   },
   {
@@ -70,13 +90,26 @@
     "    def predict(self, x):\n",
     "        # The perfect model believes that hikers are all\n",
     "        # under 4m tall\n",
-    "        return 1 / (1 + numpy.exp(80*(x - 4)))\n",
+    "        return 1 / (1 + np.exp(80*(x - 4)))\n",
     "    \n",
     "model = PerfectModel()\n",
     "\n",
     "# Plot the model\n",
-    "import graphing\n",
-    "graphing.scatter_2D(test, trendline=model.predict)"
+    "fig, ax = plt.subplots()\n",
+    "\n",
+    "# Observed heights against the hiker / tree label\n",
+    "ax.scatter(test[\"height\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
+    "\n",
+    "# Model output over an evenly spaced grid covering the observed height range\n",
+    "x_vals = np.linspace(test[\"height\"].min(), test[\"height\"].max(), 200)\n",
+    "y_vals = model.predict(x_vals)\n",
+    "ax.plot(x_vals, y_vals, color='red', label=\"Perfect Model\")\n",
+    "\n",
+    "ax.set(xlabel=\"height\", ylabel=\"is_hiker\", title=\"Scatter Plot with Model Trendline\")\n",
+    "ax.legend()\n",
+    "ax.grid(True)\n",
+    "plt.show()"
    ]
   },
   {
@@ -112,21 +145,21 @@
     "    # FN - how many false negatives (where the model predicts tree, but it was a hiker)\n",
     "\n",
     "    # First, make a note of which predictions were 'true' and which were 'false'\n",
-    "    prediction_true = numpy.equal(prediction, 1)\n",
-    "    prediction_false= numpy.equal(prediction, 0)\n",
+    "    prediction_true = np.equal(prediction, 1)\n",
+    "    prediction_false= np.equal(prediction, 0)\n",
     "\n",
     "    # Now, make a note of which correct results were 'true' and which were 'false'\n",
-    "    actual_true = numpy.equal(actual, 1)\n",
-    "    actual_false = numpy.equal(actual, 0)\n",
+    "    actual_true = np.equal(actual, 1)\n",
+    "    actual_false = np.equal(actual, 0)\n",
     "\n",
     "    # Calculate TP, TN, FP, and FN\n",
     "    # The combination of sum and '&' counts the overlap\n",
     "    # For example, TP calculates how many 'true' predictions \n",
     "    # overlapped with 'true' labels (correct answers)\n",
-    "    TP = numpy.sum(prediction_true  & actual_true)\n",
-    "    TN = numpy.sum(prediction_false & actual_false)\n",
-    "    FP = numpy.sum(prediction_true  & actual_false)\n",
-    "    FN = numpy.sum(prediction_false & actual_true)\n",
+    "    TP = np.sum(prediction_true  & actual_true)\n",
+    "    TN = np.sum(prediction_false & actual_false)\n",
+    "    FP = np.sum(prediction_true  & actual_false)\n",
+    "    FN = np.sum(prediction_false & actual_true)\n",
     "\n",
     "    # Calculate the true positive rate\n",
     "    # This is the proportion of 'hiker' labels that are identified as hikers\n",
@@ -215,7 +248,7 @@
     "    # we had used different thresholds. \n",
     "\n",
     "    #  Make a list of thresholds to try\n",
-    "    thresholds = numpy.linspace(0,1,101)\n",
+    "    thresholds = np.linspace(0,1,101)\n",
     "\n",
     "    false_positive_rates = []\n",
     "    true_positive_rates = []\n",
@@ -232,17 +265,19 @@
     "\n",
     "\n",
     "    # Graph the result\n",
-    "    # You don't need to understand this code, but essentially we are plotting\n",
-    "    # TPR versus FPR as a line plot\n",
-    "    # -- Prepare a dataframe, required by our graphing code\n",
-    "    df_for_graphing = pandas.DataFrame(dict(fpr=false_positive_rates, tpr=true_positive_rates, threshold=thresholds))\n",
-    "    # -- Generate the plot\n",
-    "    fig = graphing.scatter_2D(df_for_graphing, x_range=[-0.05,1.05])\n",
-    "    fig.update_traces(mode='lines') # Comment our this line if you would like to see points rather than lines\n",
-    "    fig.update_yaxes(range=[-0.05, 1.05])\n",
+    "    plt.figure(figsize=(6, 6))\n",
+    "    plt.plot(false_positive_rates, true_positive_rates, color='blue', label='ROC Curve')\n",
+    "    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')\n",
     "\n",
-    "    # Display the graph\n",
-    "    fig.show()\n",
+    "    plt.xlim([-0.05, 1.05])\n",
+    "    plt.ylim([-0.05, 1.05])\n",
+    "    plt.xlabel(\"False Positive Rate\")\n",
+    "    plt.ylabel(\"True Positive Rate\")\n",
+    "    plt.title(\"ROC Curve\")\n",
+    "    plt.legend(loc=\"lower right\")\n",
+    "    plt.grid(True)\n",
+    "    plt.gca().set_aspect('equal', adjustable='box')  # Optional: make the plot square\n",
+    "    plt.show()\n",
     "\n",
     "\n",
     "# Create an roc curve for our model\n",
@@ -284,12 +319,24 @@
     "    def predict(self, x):\n",
     "        # This model thinks that all people are over 4m tall \n",
     "        # and all trees are shorter\n",
-    "        return 1 / (1 + numpy.exp(-80*(x - 4)))\n",
+    "        return 1 / (1 + np.exp(-80*(x - 4)))\n",
     "\n",
     "model = VeryBadModel()\n",
     "\n",
     "# Plot the model\n",
-    "graphing.scatter_2D(test, trendline=model.predict)"
+    "plt.scatter(test[\"height\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
+    "\n",
+    "# Create a range of height values and get the model's predictions\n",
+    "x_vals = np.linspace(test[\"height\"].min(), test[\"height\"].max(), 200)\n",
+    "y_vals = model.predict(x_vals)\n",
+    "\n",
+    "# Plot the model's prediction trendline\n",
+    "plt.plot(x_vals, y_vals, color='red', label=\"Very Bad Model\")\n",
+    "\n",
+    "plt.xlabel(\"height\")\n",
+    "plt.ylabel(\"is_hiker\")\n",
+    "plt.title(\"Scatter Plot with Very Bad Model Trendline\")\n",
+    "plt.legend()\n",
+    "plt.grid(True)\n",
+    "# Render explicitly, as the other plotting cells do, so the cell shows only\n",
+    "# the figure and not the repr of the Legend returned by plt.legend()\n",
+    "plt.show()"
    ]
   },
   {
@@ -339,13 +386,27 @@
     "\n",
     "# This is a helper method that reformats the data to be compatible\n",
     "# with this particular logistic regression model \n",
-    "prep_data = lambda x:  numpy.column_stack((numpy.full(x.shape, 1), x))\n",
+    "prep_data = lambda x:  np.column_stack((np.full(x.shape, 1), x))\n",
     "\n",
     "# Train a logistic regression model to predict hiker based on texture\n",
     "model = statsmodels.api.Logit(train.is_hiker, prep_data(train.texture)).fit()\n",
     "\n",
     "# Plot the model\n",
-    "graphing.scatter_2D(test, label_x=\"texture\", label_y=\"is_hiker\", trendline=lambda x: model.predict(prep_data(x)))"
+    "fig, ax = plt.subplots()\n",
+    "\n",
+    "# Observed texture readings against the hiker / tree label\n",
+    "ax.scatter(test[\"texture\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
+    "\n",
+    "# Predicted probabilities over an evenly spaced grid covering the observed texture range\n",
+    "x_vals = np.linspace(test[\"texture\"].min(), test[\"texture\"].max(), 200)\n",
+    "y_vals = model.predict(prep_data(x_vals))\n",
+    "ax.plot(x_vals, y_vals, color='red', label=\"Logistic Regression Model\")\n",
+    "\n",
+    "ax.set(xlabel=\"texture\", ylabel=\"is_hiker\", title=\"Scatter Plot with Logistic Regression Trendline\")\n",
+    "ax.legend()\n",
+    "ax.grid(True)\n",
+    "plt.show()"
    ]
   },
   {
@@ -382,13 +443,6 @@
     "If we continued this approach for all thresholds, we'd achieve a diagonal line."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -412,7 +466,23 @@
     "model = statsmodels.api.Logit(train.is_hiker, prep_data(train.motion), add_constant=True).fit()\n",
     "\n",
     "# Plot the model\n",
-    "graphing.scatter_2D(test, label_x=\"motion\", label_y=\"is_hiker\", trendline=lambda x: model.predict(prep_data(x)))"
+    "fig, ax = plt.subplots()\n",
+    "\n",
+    "# Observed motion readings against the hiker / tree label\n",
+    "ax.scatter(test[\"motion\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
+    "\n",
+    "# Predicted probabilities over an evenly spaced grid covering the observed motion range\n",
+    "x_vals = np.linspace(test[\"motion\"].min(), test[\"motion\"].max(), 200)\n",
+    "y_vals = model.predict(prep_data(x_vals))\n",
+    "ax.plot(x_vals, y_vals, color='red', label=\"Logistic Regression Model\")\n",
+    "\n",
+    "ax.set(xlabel=\"motion\", ylabel=\"is_hiker\", title=\"Scatter Plot with Logistic Regression Trendline\")\n",
+    "ax.legend()\n",
+    "ax.grid(True)\n",
+    "plt.show()"
    ]
   },
   {