|
19 | 19 | "metadata": {},
|
20 | 20 | "outputs": [],
|
21 | 21 | "source": [
|
22 |
| - "import numpy\n", |
| 22 | + "import numpy as np\n", |
23 | 23 | "import pandas\n",
|
24 | 24 | "!pip install statsmodels\n",
|
25 | 25 | "!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/graphing.py\n",
|
26 | 26 | "!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/hiker_or_tree.csv\n",
|
27 |
| - "import graphing # custom graphing code. See our GitHub repo for details\n", |
| 27 | + "import matplotlib.pyplot as plt\n", |
28 | 28 | "import sklearn.model_selection\n",
|
29 | 29 | "\n",
|
30 | 30 | "# Load our data from disk\n",
|
|
33 | 33 | "# Split into train and test\n",
|
34 | 34 | "train, test = sklearn.model_selection.train_test_split(df, test_size=0.5, random_state=1)\n",
|
35 | 35 | "\n",
|
36 |
| - "# Graph our three features\n", |
37 |
| - "graphing.histogram(test, label_x=\"height\", label_colour=\"is_hiker\", show=True)\n", |
38 |
| - "graphing.multiple_histogram(test, label_x=\"motion\", label_group=\"is_hiker\", nbins=12, show=True)\n", |
39 |
| - "graphing.multiple_histogram(test, label_x=\"texture\", label_group=\"is_hiker\", nbins=12)" |
| 36 | + "# Define a helper function to plot histograms by class\n", |
| 37 | + "def plot_histogram_by_group(data, column, group_column, bins=12):\n", |
| 38 | + " groups = data[group_column].unique()\n", |
| 39 | + " for group in groups:\n", |
| 40 | + " subset = data[data[group_column] == group]\n", |
| 41 | + " plt.hist(subset[column], bins=bins, alpha=0.5, label=f'{group_column}={group}')\n", |
| 42 | + " plt.xlabel(column)\n", |
| 43 | + " plt.ylabel('Count')\n", |
| 44 | + " plt.title(f'{column} by {group_column}')\n", |
| 45 | + " plt.legend()\n", |
| 46 | + " plt.grid(True)\n", |
| 47 | + " plt.show()\n", |
| 48 | + "\n", |
| 49 | + "# Histogram for \"height\"\n", |
| 50 | + "plt.hist(test[\"height\"], bins=12, color='skyblue', edgecolor='black')\n", |
| 51 | + "plt.xlabel(\"height\")\n", |
| 52 | + "plt.ylabel(\"Count\")\n", |
| 53 | + "plt.title(\"Height Distribution\")\n", |
| 54 | + "plt.grid(True)\n", |
| 55 | + "plt.show()\n", |
| 56 | + "\n", |
| 57 | + "# Multiple histograms for \"motion\" and \"texture\" by \"is_hiker\"\n", |
| 58 | + "plot_histogram_by_group(test, \"motion\", \"is_hiker\", bins=12)\n", |
| 59 | + "plot_histogram_by_group(test, \"texture\", \"is_hiker\", bins=12)" |
40 | 60 | ]
|
41 | 61 | },
|
42 | 62 | {
|
|
70 | 90 | " def predict(self, x):\n",
|
71 | 91 | " # The perfect model believes that hikers are all\n",
|
72 | 92 | " # under 4m tall\n",
|
73 |
| - " return 1 / (1 + numpy.exp(80*(x - 4)))\n", |
| 93 | + " return 1 / (1 + np.exp(80*(x - 4)))\n", |
74 | 94 | " \n",
|
75 | 95 | "model = PerfectModel()\n",
|
76 | 96 | "\n",
|
77 | 97 | "# Plot the model\n",
|
78 |
| - "import graphing\n", |
79 |
| - "graphing.scatter_2D(test, trendline=model.predict)" |
| 98 | + "plt.scatter(test[\"height\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n", |
| 99 | + "\n", |
| 100 | + "# Create a range of height values for plotting the model's prediction\n", |
| 101 | + "x_vals = np.linspace(test[\"height\"].min(), test[\"height\"].max(), 200)\n", |
| 102 | + "y_vals = model.predict(x_vals)\n", |
| 103 | + "\n", |
| 104 | + "# Plot the model's prediction (trendline)\n", |
| 105 | + "plt.plot(x_vals, y_vals, color='red', label=\"Perfect Model\")\n", |
| 106 | + "\n", |
| 107 | + "plt.xlabel(\"height\")\n", |
| 108 | + "plt.ylabel(\"is_hiker\")\n", |
| 109 | + "plt.title(\"Scatter Plot with Model Trendline\")\n", |
| 110 | + "plt.legend()\n", |
| 111 | + "plt.grid(True)\n", |
| 112 | + "plt.show()" |
80 | 113 | ]
|
81 | 114 | },
|
82 | 115 | {
|
|
112 | 145 | " # FN - how many false negatives (where the model predicts tree, but it was a hiker)\n",
|
113 | 146 | "\n",
|
114 | 147 | " # First, make a note of which predictions were 'true' and which were 'false'\n",
|
115 |
| - " prediction_true = numpy.equal(prediction, 1)\n", |
116 |
| - " prediction_false= numpy.equal(prediction, 0)\n", |
| 148 | + " prediction_true = np.equal(prediction, 1)\n", |
| 149 | + " prediction_false= np.equal(prediction, 0)\n", |
117 | 150 | "\n",
|
118 | 151 | " # Now, make a note of which correct results were 'true' and which were 'false'\n",
|
119 |
| - " actual_true = numpy.equal(actual, 1)\n", |
120 |
| - " actual_false = numpy.equal(actual, 0)\n", |
| 152 | + " actual_true = np.equal(actual, 1)\n", |
| 153 | + " actual_false = np.equal(actual, 0)\n", |
121 | 154 | "\n",
|
122 | 155 | " # Calculate TP, TN, FP, and FN\n",
|
123 | 156 | " # The combination of sum and '&' counts the overlap\n",
|
124 | 157 | " # For example, TP calculates how many 'true' predictions \n",
|
125 | 158 | " # overlapped with 'true' labels (correct answers)\n",
|
126 |
| - " TP = numpy.sum(prediction_true & actual_true)\n", |
127 |
| - " TN = numpy.sum(prediction_false & actual_false)\n", |
128 |
| - " FP = numpy.sum(prediction_true & actual_false)\n", |
129 |
| - " FN = numpy.sum(prediction_false & actual_true)\n", |
| 159 | + " TP = np.sum(prediction_true & actual_true)\n", |
| 160 | + " TN = np.sum(prediction_false & actual_false)\n", |
| 161 | + " FP = np.sum(prediction_true & actual_false)\n", |
| 162 | + " FN = np.sum(prediction_false & actual_true)\n", |
130 | 163 | "\n",
|
131 | 164 | " # Calculate the true positive rate\n",
|
132 | 165 | " # This is the proportion of 'hiker' labels that are identified as hikers\n",
|
|
215 | 248 | " # we had used different thresholds. \n",
|
216 | 249 | "\n",
|
217 | 250 | " # Make a list of thresholds to try\n",
|
218 |
| - " thresholds = numpy.linspace(0,1,101)\n", |
| 251 | + " thresholds = np.linspace(0,1,101)\n", |
219 | 252 | "\n",
|
220 | 253 | " false_positive_rates = []\n",
|
221 | 254 | " true_positive_rates = []\n",
|
|
232 | 265 | "\n",
|
233 | 266 | "\n",
|
234 | 267 | " # Graph the result\n",
|
235 |
| - " # You don't need to understand this code, but essentially we are plotting\n", |
236 |
| - " # TPR versus FPR as a line plot\n", |
237 |
| - " # -- Prepare a dataframe, required by our graphing code\n", |
238 |
| - " df_for_graphing = pandas.DataFrame(dict(fpr=false_positive_rates, tpr=true_positive_rates, threshold=thresholds))\n", |
239 |
| - " # -- Generate the plot\n", |
240 |
| - " fig = graphing.scatter_2D(df_for_graphing, x_range=[-0.05,1.05])\n", |
241 |
| - " fig.update_traces(mode='lines') # Comment our this line if you would like to see points rather than lines\n", |
242 |
| - " fig.update_yaxes(range=[-0.05, 1.05])\n", |
| 268 | + " plt.figure(figsize=(6, 6))\n", |
| 269 | + " plt.plot(false_positive_rates, true_positive_rates, color='blue', label='ROC Curve')\n", |
| 270 | + " plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')\n", |
243 | 271 | "\n",
|
244 |
| - " # Display the graph\n", |
245 |
| - " fig.show()\n", |
| 272 | + " plt.xlim([-0.05, 1.05])\n", |
| 273 | + " plt.ylim([-0.05, 1.05])\n", |
| 274 | + " plt.xlabel(\"False Positive Rate\")\n", |
| 275 | + " plt.ylabel(\"True Positive Rate\")\n", |
| 276 | + " plt.title(\"ROC Curve\")\n", |
| 277 | + " plt.legend(loc=\"lower right\")\n", |
| 278 | + " plt.grid(True)\n", |
| 279 | + " plt.gca().set_aspect('equal', adjustable='box') # Optional: make the plot square\n", |
| 280 | + " plt.show()\n", |
246 | 281 | "\n",
|
247 | 282 | "\n",
|
248 | 283 | "# Create an ROC curve for our model\n",
|
|
284 | 319 | " def predict(self, x):\n",
|
285 | 320 | " # This model thinks that all people are over 4m tall \n",
|
286 | 321 | " # and all trees are shorter\n",
|
287 |
| - " return 1 / (1 + numpy.exp(-80*(x - 4)))\n", |
| 322 | + " return 1 / (1 + np.exp(-80*(x - 4)))\n", |
288 | 323 | "\n",
|
289 | 324 | "model = VeryBadModel()\n",
|
290 | 325 | "\n",
|
291 | 326 | "# Plot the model\n",
|
292 |
| - "graphing.scatter_2D(test, trendline=model.predict)" |
| 327 | + "plt.scatter(test[\"height\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n", |
| 328 | + "\n", |
| 329 | + "# Create a range of height values and get the model's predictions\n", |
| 330 | + "x_vals = np.linspace(test[\"height\"].min(), test[\"height\"].max(), 200)\n", |
| 331 | + "y_vals = model.predict(x_vals)\n", |
| 332 | + "\n", |
| 333 | + "# Plot the model's prediction trendline\n", |
| 334 | + "plt.plot(x_vals, y_vals, color='red', label=\"Very Bad Model\")\n", |
| 335 | + "\n", |
| 336 | + "plt.xlabel(\"height\")\n", |
| 337 | + "plt.ylabel(\"is_hiker\")\n", |
| 338 | + "plt.title(\"Scatter Plot with Very Bad Model Trendline\")\n", |
| 339 | + "plt.legend()" |
293 | 340 | ]
|
294 | 341 | },
|
295 | 342 | {
|
|
339 | 386 | "\n",
|
340 | 387 | "# This is a helper method that reformats the data to be compatible\n",
|
341 | 388 | "# with this particular logistic regression model \n",
|
342 |
| - "prep_data = lambda x: numpy.column_stack((numpy.full(x.shape, 1), x))\n", |
| 389 | + "prep_data = lambda x: np.column_stack((np.full(x.shape, 1), x))\n", |
343 | 390 | "\n",
|
344 | 391 | "# Train a logistic regression model to predict hiker based on texture\n",
|
345 | 392 | "model = statsmodels.api.Logit(train.is_hiker, prep_data(train.texture)).fit()\n",
|
346 | 393 | "\n",
|
347 | 394 | "# Plot the model\n",
|
348 |
| - "graphing.scatter_2D(test, label_x=\"texture\", label_y=\"is_hiker\", trendline=lambda x: model.predict(prep_data(x)))" |
| 395 | + "plt.scatter(test[\"texture\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n", |
| 396 | + "\n", |
| 397 | + "# Create a smooth range of texture values for the trendline\n", |
| 398 | + "x_vals = np.linspace(test[\"texture\"].min(), test[\"texture\"].max(), 200)\n", |
| 399 | + "y_vals = model.predict(prep_data(x_vals))\n", |
| 400 | + "\n", |
| 401 | + "# Plot the logistic regression model's predicted probabilities\n", |
| 402 | + "plt.plot(x_vals, y_vals, color='red', label=\"Logistic Regression Model\")\n", |
| 403 | + "\n", |
| 404 | + "plt.xlabel(\"texture\")\n", |
| 405 | + "plt.ylabel(\"is_hiker\")\n", |
| 406 | + "plt.title(\"Scatter Plot with Logistic Regression Trendline\")\n", |
| 407 | + "plt.legend()\n", |
| 408 | + "plt.grid(True)\n", |
| 409 | + "plt.show()" |
349 | 410 | ]
|
350 | 411 | },
|
351 | 412 | {
|
|
382 | 443 | "If we continued this approach for all thresholds, we'd achieve a diagonal line."
|
383 | 444 | ]
|
384 | 445 | },
|
385 |
| - { |
386 |
| - "cell_type": "code", |
387 |
| - "execution_count": null, |
388 |
| - "metadata": {}, |
389 |
| - "outputs": [], |
390 |
| - "source": [] |
391 |
| - }, |
392 | 446 | {
|
393 | 447 | "cell_type": "markdown",
|
394 | 448 | "metadata": {},
|
|
412 | 466 | "model = statsmodels.api.Logit(train.is_hiker, prep_data(train.motion), add_constant=True).fit()\n",
|
413 | 467 | "\n",
|
414 | 468 | "# Plot the model\n",
|
415 |
| - "graphing.scatter_2D(test, label_x=\"motion\", label_y=\"is_hiker\", trendline=lambda x: model.predict(prep_data(x)))" |
| 469 | + "plt.scatter(test[\"motion\"], test[\"is_hiker\"], alpha=0.6, label=\"Data\", edgecolor='k')\n", |
| 470 | + "\n", |
| 471 | + "# Generate smooth range of motion values for plotting the trendline\n", |
| 472 | + "x_vals = np.linspace(test[\"motion\"].min(), test[\"motion\"].max(), 200)\n", |
| 473 | + "\n", |
| 474 | + "# Predict probabilities using the trained model\n", |
| 475 | + "y_vals = model.predict(prep_data(x_vals))\n", |
| 476 | + "\n", |
| 477 | + "# Plot the logistic regression trendline\n", |
| 478 | + "plt.plot(x_vals, y_vals, color='red', label=\"Logistic Regression Model\")\n", |
| 479 | + "\n", |
| 480 | + "plt.xlabel(\"motion\")\n", |
| 481 | + "plt.ylabel(\"is_hiker\")\n", |
| 482 | + "plt.title(\"Scatter Plot with Logistic Regression Trendline\")\n", |
| 483 | + "plt.legend()\n", |
| 484 | + "plt.grid(True)\n", |
| 485 | + "plt.show()" |
416 | 486 | ]
|
417 | 487 | },
|
418 | 488 | {
|
|
0 commit comments