Skip to content

Commit 727f775

Browse files
committed
Merge branch 'main' of https://github.com/riswinto/learn-pr
2 parents 2c54061 + 9305918 commit 727f775

File tree

48 files changed

+316
-211
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+316
-211
lines changed

learn-pr/azure/optimize-model-performance-roc-auc/notebooks/9-3-evaluate-roc-curves.ipynb

Lines changed: 110 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919
"metadata": {},
2020
"outputs": [],
2121
"source": [
22-
"import numpy\n",
22+
"import numpy as np\n",
2323
"import pandas\n",
2424
"!pip install statsmodels\n",
2525
"!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/graphing.py\n",
2626
"!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/hiker_or_tree.csv\n",
27-
"import graphing # custom graphing code. See our GitHub repo for details\n",
27+
"import matplotlib.pyplot as plt\n",
2828
"import sklearn.model_selection\n",
2929
"\n",
3030
"# Load our data from disk\n",
@@ -33,10 +33,30 @@
3333
"# Split into train and test\n",
3434
"train, test = sklearn.model_selection.train_test_split(df, test_size=0.5, random_state=1)\n",
3535
"\n",
36-
"# Graph our three features\n",
37-
"graphing.histogram(test, label_x=\"height\", label_colour=\"is_hiker\", show=True)\n",
38-
"graphing.multiple_histogram(test, label_x=\"motion\", label_group=\"is_hiker\", nbins=12, show=True)\n",
39-
"graphing.multiple_histogram(test, label_x=\"texture\", label_group=\"is_hiker\", nbins=12)"
36+
"# Define a helper function to plot histograms by class\n",
37+
"def plot_histogram_by_group(data, column, group_column, bins=12):\n",
38+
"    \"\"\"Overlay a histogram of `column` for each value of `group_column`, using one shared set of bin edges so the groups are comparable.\"\"\"\n",
39+
"    bin_edges = np.histogram_bin_edges(data[column], bins=bins)\n",
40+
"    for group in data[group_column].unique():\n",
41+
"        plt.hist(data.loc[data[group_column] == group, column], bins=bin_edges, alpha=0.5, label=f'{group_column}={group}')\n",
42+
"    plt.xlabel(column)\n",
43+
"    plt.ylabel('Count')\n",
44+
"    plt.title(f'{column} by {group_column}')\n",
45+
"    plt.legend()\n",
46+
"    plt.grid(True)\n",
47+
"    plt.show()\n",
48+
"\n",
49+
"# Histogram for \"height\", split by \"is_hiker\" like the other two features\n",
50+
"plot_histogram_by_group(test, \"height\", \"is_hiker\", bins=12)\n",
56+
"\n",
57+
"# Multiple histograms for \"motion\" and \"texture\" by \"is_hiker\"\n",
58+
"plot_histogram_by_group(test, \"motion\", \"is_hiker\", bins=12)\n",
59+
"plot_histogram_by_group(test, \"texture\", \"is_hiker\", bins=12)"
4060
]
4161
},
4262
{
@@ -70,13 +90,26 @@
7090
" def predict(self, x):\n",
7191
" # The perfect model believes that hikers are all\n",
7292
" # under 4m tall\n",
73-
" return 1 / (1 + numpy.exp(80*(x - 4)))\n",
93+
" return 1 / (1 + np.exp(80*(x - 4)))\n",
7494
" \n",
7595
"model = PerfectModel()\n",
7696
"\n",
7797
"# Plot the model\n",
78-
"import graphing\n",
79-
"graphing.scatter_2D(test, trendline=model.predict)"
98+
"plt.scatter(test[\"height\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
99+
"\n",
100+
"# Create a range of height values for plotting the model's prediction\n",
101+
"x_vals = np.linspace(test[\"height\"].min(), test[\"height\"].max(), 200)\n",
102+
"y_vals = model.predict(x_vals)\n",
103+
"\n",
104+
"# Plot the model's prediction (trendline)\n",
105+
"plt.plot(x_vals, y_vals, color='red', label=\"Perfect Model\")\n",
106+
"\n",
107+
"plt.xlabel(\"height\")\n",
108+
"plt.ylabel(\"is_hiker\")\n",
109+
"plt.title(\"Scatter Plot with Model Trendline\")\n",
110+
"plt.legend()\n",
111+
"plt.grid(True)\n",
112+
"plt.show()"
80113
]
81114
},
82115
{
@@ -112,21 +145,21 @@
112145
" # FN - how many false negatives (where the model predicts tree, but it was a hiker)\n",
113146
"\n",
114147
" # First, make a note of which predictions were 'true' and which were 'false'\n",
115-
" prediction_true = numpy.equal(prediction, 1)\n",
116-
" prediction_false= numpy.equal(prediction, 0)\n",
148+
" prediction_true = np.equal(prediction, 1)\n",
149+
" prediction_false= np.equal(prediction, 0)\n",
117150
"\n",
118151
" # Now, make a note of which correct results were 'true' and which were 'false'\n",
119-
" actual_true = numpy.equal(actual, 1)\n",
120-
" actual_false = numpy.equal(actual, 0)\n",
152+
" actual_true = np.equal(actual, 1)\n",
153+
" actual_false = np.equal(actual, 0)\n",
121154
"\n",
122155
" # Calculate TP, TN, FP, and FN\n",
123156
" # The combination of sum and '&' counts the overlap\n",
124157
" # For example, TP calculates how many 'true' predictions \n",
125158
" # overlapped with 'true' labels (correct answers)\n",
126-
" TP = numpy.sum(prediction_true & actual_true)\n",
127-
" TN = numpy.sum(prediction_false & actual_false)\n",
128-
" FP = numpy.sum(prediction_true & actual_false)\n",
129-
" FN = numpy.sum(prediction_false & actual_true)\n",
159+
" TP = np.sum(prediction_true & actual_true)\n",
160+
" TN = np.sum(prediction_false & actual_false)\n",
161+
" FP = np.sum(prediction_true & actual_false)\n",
162+
" FN = np.sum(prediction_false & actual_true)\n",
130163
"\n",
131164
" # Calculate the true positive rate\n",
132165
" # This is the proportion of 'hiker' labels that are identified as hikers\n",
@@ -215,7 +248,7 @@
215248
" # we had used different thresholds. \n",
216249
"\n",
217250
" # Make a list of thresholds to try\n",
218-
" thresholds = numpy.linspace(0,1,101)\n",
251+
" thresholds = np.linspace(0,1,101)\n",
219252
"\n",
220253
" false_positive_rates = []\n",
221254
" true_positive_rates = []\n",
@@ -232,17 +265,19 @@
232265
"\n",
233266
"\n",
234267
" # Graph the result\n",
235-
" # You don't need to understand this code, but essentially we are plotting\n",
236-
" # TPR versus FPR as a line plot\n",
237-
" # -- Prepare a dataframe, required by our graphing code\n",
238-
" df_for_graphing = pandas.DataFrame(dict(fpr=false_positive_rates, tpr=true_positive_rates, threshold=thresholds))\n",
239-
" # -- Generate the plot\n",
240-
" fig = graphing.scatter_2D(df_for_graphing, x_range=[-0.05,1.05])\n",
241-
" fig.update_traces(mode='lines') # Comment our this line if you would like to see points rather than lines\n",
242-
" fig.update_yaxes(range=[-0.05, 1.05])\n",
268+
" plt.figure(figsize=(6, 6))\n",
269+
" plt.plot(false_positive_rates, true_positive_rates, color='blue', label='ROC Curve')\n",
270+
" plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')\n",
243271
"\n",
244-
" # Display the graph\n",
245-
" fig.show()\n",
272+
" plt.xlim([-0.05, 1.05])\n",
273+
" plt.ylim([-0.05, 1.05])\n",
274+
" plt.xlabel(\"False Positive Rate\")\n",
275+
" plt.ylabel(\"True Positive Rate\")\n",
276+
" plt.title(\"ROC Curve\")\n",
277+
" plt.legend(loc=\"lower right\")\n",
278+
" plt.grid(True)\n",
279+
" plt.gca().set_aspect('equal', adjustable='box') # Optional: make the plot square\n",
280+
" plt.show()\n",
246281
"\n",
247282
"\n",
248283
"# Create an roc curve for our model\n",
@@ -284,12 +319,24 @@
284319
" def predict(self, x):\n",
285320
" # This model thinks that all people are over 4m tall \n",
286321
" # and all trees are shorter\n",
287-
" return 1 / (1 + numpy.exp(-80*(x - 4)))\n",
322+
" return 1 / (1 + np.exp(-80*(x - 4)))\n",
288323
"\n",
289324
"model = VeryBadModel()\n",
290325
"\n",
291326
"# Plot the model\n",
292-
"graphing.scatter_2D(test, trendline=model.predict)"
327+
"plt.scatter(test[\"height\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
328+
"\n",
329+
"# Evaluate the model over a smooth range of heights and draw it as the trendline\n",
330+
"x_vals = np.linspace(test[\"height\"].min(), test[\"height\"].max(), 200)\n",
331+
"y_vals = model.predict(x_vals)\n",
332+
"plt.plot(x_vals, y_vals, color='red', label=\"Very Bad Model\")\n",
333+
"\n",
334+
"plt.xlabel(\"height\")\n",
335+
"plt.ylabel(\"is_hiker\")\n",
336+
"plt.title(\"Scatter Plot with Very Bad Model Trendline\")\n",
337+
"plt.legend()\n",
338+
"plt.grid(True)\n",
339+
"plt.show()"
293340
]
294341
},
295342
{
@@ -339,13 +386,27 @@
339386
"\n",
340387
"# This is a helper method that reformats the data to be compatible\n",
341388
"# with this particular logistic regression model \n",
342-
"prep_data = lambda x: numpy.column_stack((numpy.full(x.shape, 1), x))\n",
389+
"prep_data = lambda x: np.column_stack((np.full(x.shape, 1), x))\n",
343390
"\n",
344391
"# Train a logistic regression model to predict hiker based on texture\n",
345392
"model = statsmodels.api.Logit(train.is_hiker, prep_data(train.texture)).fit()\n",
346393
"\n",
347394
"# Plot the model\n",
348-
"graphing.scatter_2D(test, label_x=\"texture\", label_y=\"is_hiker\", trendline=lambda x: model.predict(prep_data(x)))"
395+
"plt.scatter(test[\"texture\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
396+
"\n",
397+
"# Create a smooth range of texture values for the trendline\n",
398+
"x_vals = np.linspace(test[\"texture\"].min(), test[\"texture\"].max(), 200)\n",
399+
"y_vals = model.predict(prep_data(x_vals))\n",
400+
"\n",
401+
"# Plot the logistic regression model's predicted probabilities\n",
402+
"plt.plot(x_vals, y_vals, color='red', label=\"Logistic Regression Model\")\n",
403+
"\n",
404+
"plt.xlabel(\"texture\")\n",
405+
"plt.ylabel(\"is_hiker\")\n",
406+
"plt.title(\"Scatter Plot with Logistic Regression Trendline\")\n",
407+
"plt.legend()\n",
408+
"plt.grid(True)\n",
409+
"plt.show()"
349410
]
350411
},
351412
{
@@ -382,13 +443,6 @@
382443
"If we continued this approach for all thresholds, we'd achieve a diagonal line."
383444
]
384445
},
385-
{
386-
"cell_type": "code",
387-
"execution_count": null,
388-
"metadata": {},
389-
"outputs": [],
390-
"source": []
391-
},
392446
{
393447
"cell_type": "markdown",
394448
"metadata": {},
@@ -412,7 +466,23 @@
412466
"model = statsmodels.api.Logit(train.is_hiker, prep_data(train.motion)).fit()\n",
413467
"\n",
414468
"# Plot the model\n",
415-
"graphing.scatter_2D(test, label_x=\"motion\", label_y=\"is_hiker\", trendline=lambda x: model.predict(prep_data(x)))"
469+
"plt.scatter(test[\"motion\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n",
470+
"\n",
471+
"# Generate smooth range of motion values for plotting the trendline\n",
472+
"x_vals = np.linspace(test[\"motion\"].min(), test[\"motion\"].max(), 200)\n",
473+
"\n",
474+
"# Predict probabilities using the trained model\n",
475+
"y_vals = model.predict(prep_data(x_vals))\n",
476+
"\n",
477+
"# Plot the logistic regression trendline\n",
478+
"plt.plot(x_vals, y_vals, color='red', label=\"Logistic Regression Model\")\n",
479+
"\n",
480+
"plt.xlabel(\"motion\")\n",
481+
"plt.ylabel(\"is_hiker\")\n",
482+
"plt.title(\"Scatter Plot with Logistic Regression Trendline\")\n",
483+
"plt.legend()\n",
484+
"plt.grid(True)\n",
485+
"plt.show()"
416486
]
417487
},
418488
{

0 commit comments

Comments
 (0)