Skip to content

Commit d065c49

Browse files
authored
Merge pull request #50571 from ShawnKupfer/WB1797
AB#1037347: Fix graphing code in Select and customize architectures a…
2 parents 0a8761c + ea1d7e0 commit d065c49

File tree

3 files changed

+166
-61
lines changed

3 files changed

+166
-61
lines changed

learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-3-exercise-decision-trees.ipynb

Lines changed: 85 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -54,28 +54,56 @@
5454
},
5555
"outputs": [],
5656
"source": [
57-
"import graphing # custom graphing code. See our GitHub repo for details\n",
5857
"import numpy as np\n",
58+
"import matplotlib.pyplot as plt\n",
5959
"\n",
6060
"# Crime category\n",
61-
"graphing.multiple_histogram(dataset, label_x='Category', label_group=\"Resolution\", histfunc='sum', show=True)\n",
61+
"dataset.groupby(['Category', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
62+
"plt.xlabel('Category')\n",
63+
"plt.ylabel('Count')\n",
64+
"plt.title('Crimes by Category and Resolution')\n",
65+
"plt.tight_layout()\n",
66+
"plt.show()\n",
6267
"\n",
6368
"# District\n",
64-
"graphing.multiple_histogram(dataset, label_group=\"Resolution\", label_x=\"PdDistrict\", show=True)\n",
69+
"dataset.groupby(['PdDistrict', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
70+
"plt.xlabel('Police District')\n",
71+
"plt.ylabel('Count')\n",
72+
"plt.title('Crimes by District and Resolution')\n",
73+
"plt.tight_layout()\n",
74+
"plt.show()\n",
6575
"\n",
6676
"# Map of crimes\n",
67-
"graphing.scatter_2D(dataset, label_x=\"X\", label_y=\"Y\", label_colour=\"Resolution\", title=\"GPS Coordinates\", size_multiplier=0.8, show=True)\n",
77+
"import seaborn as sns\n",
6878
"\n",
69-
"# Day of the week\n",
70-
"graphing.multiple_histogram(dataset, label_group=\"Resolution\", label_x=\"DayOfWeek\", show=True)\n",
79+
"plt.figure(figsize=(10, 6))\n",
80+
"sns.scatterplot(data=dataset, x='X', y='Y', hue='Resolution', alpha=0.6, s=8 * 0.8) # size_multiplier=0.8\n",
81+
"plt.title('GPS Coordinates')\n",
82+
"plt.xlabel('Longitude')\n",
83+
"plt.ylabel('Latitude')\n",
84+
"plt.legend(loc='best', title='Resolution')\n",
85+
"plt.tight_layout()\n",
86+
"plt.show()\n",
7187
"\n",
72-
"# day of the year\n",
88+
"# Day of the week\n",
89+
"dataset.groupby(['DayOfWeek', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
90+
"plt.xlabel('Day of the Week')\n",
91+
"plt.ylabel('Count')\n",
92+
"plt.title('Crimes by Day of the Week and Resolution')\n",
93+
"plt.tight_layout()\n",
94+
"plt.show()\n",
95+
"\n",
96+
"# week of the year\n",
7397
"# For graphing, we simplify this to weeks; otherwise the graph becomes overwhelmed with bars\n",
7498
"dataset[\"week_of_year\"] = np.round(dataset.day_of_year / 7.0)\n",
75-
"graphing.multiple_histogram(dataset, \n",
76-
" label_x='week_of_year',\n",
77-
" label_group='Resolution',\n",
78-
" histfunc='sum', show=True)\n",
99+
"\n",
100+
"dataset.groupby(['week_of_year', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
101+
"plt.xlabel('Week of the Year')\n",
102+
"plt.ylabel('Count')\n",
103+
"plt.title('Crimes by Week and Resolution')\n",
104+
"plt.tight_layout()\n",
105+
"plt.show()\n",
106+
"\n",
79107
"del dataset[\"week_of_year\"]"
80108
]
81109
},
@@ -308,20 +336,44 @@
308336
"metadata": {},
309337
"outputs": [],
310338
"source": [
311-
"# Temporarily shrink the training set to something\n",
312-
"# more realistic\n",
339+
"# Temporarily shrink the training set to 10000\n",
340+
"# for this exercise to see how pruning is important\n",
341+
"# even with moderately large datasets\n",
313342
"full_training_set = train\n",
314-
"train = train[:100]\n",
343+
"train = train[:10000]\n",
315344
"\n",
316-
"# fit the same tree as before\n",
317-
"model = sklearn.tree.DecisionTreeClassifier(random_state=1, max_depth=100)\n",
318345
"\n",
319-
"# Assess on the same test set as before\n",
320-
"train_accuracy, test_accuracy = fit_and_test_model(model)\n",
321-
"print(\"Train accuracy\", train_accuracy)\n",
322-
"print(\"Test accuracy\", test_accuracy)\n",
346+
"# Loop through the values below and build a model\n",
347+
"# each time, setting the maximum depth to that value \n",
348+
"max_depth_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,15, 20, 50, 100]\n",
349+
"accuracy_trainset = []\n",
350+
"accuracy_testset = []\n",
351+
"for depth in max_depth_range:\n",
352+
" # Create and fit the model\n",
353+
" prune_model = sklearn.tree.DecisionTreeClassifier(random_state=1, max_depth=depth)\n",
323354
"\n",
324-
"# Roll the training set back to the full set\n",
355+
"    # Calculate and record its accuracy\n",
356+
" train_accuracy, test_accuracy = fit_and_test_model(prune_model)\n",
357+
" accuracy_trainset.append(train_accuracy)\n",
358+
" accuracy_testset.append(test_accuracy)\n",
359+
"\n",
360+
"# Plot the accuracy as a function of depth \n",
361+
"pruned_plot = pandas.DataFrame(dict(max_depth=max_depth_range, accuracy=accuracy_trainset))\n",
362+
"\n",
363+
"plt.figure(figsize=(10, 6))\n",
364+
"plt.plot(max_depth_range, accuracy_trainset, marker='o', label='Train Accuracy')\n",
365+
"plt.plot(max_depth_range, accuracy_testset, marker='s', label='Test Accuracy')\n",
366+
"\n",
367+
"plt.title('Model Accuracy vs. Decision Tree Depth')\n",
368+
"plt.xlabel('Max Depth')\n",
369+
"plt.ylabel('Accuracy')\n",
370+
"plt.xticks(max_depth_range)\n",
371+
"plt.legend()\n",
372+
"plt.grid(True)\n",
373+
"plt.tight_layout()\n",
374+
"plt.show()\n",
375+
"\n",
376+
"# Roll the training set back to the full thing\n",
325377
"train = full_training_set"
326378
]
327379
},
@@ -377,7 +429,18 @@
377429
"# Plot the accuracy as a function of depth \n",
378430
"pruned_plot = pandas.DataFrame(dict(max_depth=max_depth_range, accuracy=accuracy_trainset))\n",
379431
"\n",
380-
"fig = graphing.line_2D(dict(train=accuracy_trainset, test=accuracy_testset), x_range=max_depth_range, show=True)\n",
432+
"plt.figure(figsize=(10, 6))\n",
433+
"plt.plot(max_depth_range, accuracy_trainset, marker='o', label='Train Accuracy')\n",
434+
"plt.plot(max_depth_range, accuracy_testset, marker='s', label='Test Accuracy')\n",
435+
"\n",
436+
"plt.title('Model Accuracy vs. Decision Tree Depth')\n",
437+
"plt.xlabel('Max Depth')\n",
438+
"plt.ylabel('Accuracy')\n",
439+
"plt.xticks(max_depth_range)\n",
440+
"plt.legend()\n",
441+
"plt.grid(True)\n",
442+
"plt.tight_layout()\n",
443+
"plt.show()\n",
381444
"\n",
382445
"# Roll the training set back to the full thing\n",
383446
"train = full_training_set"

learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-5-exercise-random-forests.ipynb

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
"!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/san_fran_crime.csv\n",
3535
"import numpy as np\n",
3636
"from sklearn.model_selection import train_test_split\n",
37-
"import graphing # custom graphing code. See our GitHub repo for details\n",
3837
"\n",
3938
"# Import the data from the .csv file\n",
4039
"dataset = pandas.read_csv('san_fran_crime.csv', delimiter=\"\\t\")\n",
@@ -185,7 +184,7 @@
185184
"metadata": {},
186185
"outputs": [],
187186
"source": [
188-
"import graphing\n",
187+
"import matplotlib.pyplot as plt\n",
189188
"\n",
190189
"# n_estimators states how many trees to put in the model\n",
191190
"# We will make one model for every entry in this list\n",
@@ -213,11 +212,18 @@
213212
"\n",
214213
"\n",
215214
"# Plot results\n",
216-
"graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
217-
" n_estimators,\n",
218-
" label_x=\"Numer of trees (n_estimators)\",\n",
219-
" label_y=\"Accuracy\",\n",
220-
" title=\"Performance X number of trees\", show=True)"
215+
"plt.figure(figsize=(10, 6))\n",
216+
"plt.plot(n_estimators, train_accuracies, marker='o', label='Train')\n",
217+
"plt.plot(n_estimators, test_accuracies, marker='s', label='Test')\n",
218+
"\n",
219+
"plt.title('Performance vs. Number of Trees')\n",
220+
"plt.xlabel('Number of trees (n_estimators)')\n",
221+
"plt.ylabel('Accuracy')\n",
222+
"plt.xticks(n_estimators) \n",
223+
"plt.grid(True)\n",
224+
"plt.legend()\n",
225+
"plt.tight_layout()\n",
226+
"plt.show()"
221227
]
222228
},
223229
{
@@ -279,13 +285,20 @@
279285
"\n",
280286
"\n",
281287
"# Plot results\n",
282-
"graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
283-
" min_samples_split,\n",
284-
" label_x=\"Minimum samples split (min_samples_split)\",\n",
285-
" label_y=\"Accuracy\",\n",
286-
" title=\"Performance\", show=True)\n",
287-
"\n",
288-
"# Rol back the trainset to the full set\n",
288+
"plt.figure(figsize=(10, 6))\n",
289+
"plt.plot(min_samples_split, train_accuracies, marker='o', label='Train')\n",
290+
"plt.plot(min_samples_split, test_accuracies, marker='s', label='Test')\n",
291+
"\n",
292+
"plt.title('Performance vs. min_samples_split')\n",
293+
"plt.xlabel('Minimum samples split (min_samples_split)')\n",
294+
"plt.ylabel('Accuracy')\n",
295+
"plt.xticks(min_samples_split) \n",
296+
"plt.grid(True)\n",
297+
"plt.legend()\n",
298+
"plt.tight_layout()\n",
299+
"plt.show()\n",
300+
"\n",
301+
"# Roll back the trainset to the full set\n",
289302
"train = full_trainset"
290303
]
291304
},
@@ -347,13 +360,20 @@
347360
"\n",
348361
"\n",
349362
"# Plot results\n",
350-
"graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies),\n",
351-
" max_depths,\n",
352-
" label_x=\"Maximum depth (max_depths)\",\n",
353-
" label_y=\"Accuracy\",\n",
354-
" title=\"Performance\", show=True)\n",
355-
"\n",
356-
"# Rol back the trainset to the full set\n",
363+
"plt.figure(figsize=(10, 6))\n",
364+
"plt.plot(max_depths, train_accuracies, marker='o', label='Train')\n",
365+
"plt.plot(max_depths, test_accuracies, marker='s', label='Test')\n",
366+
"\n",
367+
"plt.title('Performance vs. max_depth')\n",
368+
"plt.xlabel('Maximum depth (max_depths)')\n",
369+
"plt.ylabel('Accuracy')\n",
370+
"plt.xticks(max_depths) # Ensure all depth values appear on the x-axis\n",
371+
"plt.grid(True)\n",
372+
"plt.legend()\n",
373+
"plt.tight_layout()\n",
374+
"plt.show()\n",
375+
"\n",
376+
"# Roll back the trainset to the full set\n",
357377
"train = full_trainset"
358378
]
359379
},
@@ -390,7 +410,7 @@
390410
" verbose=False)\n",
391411
"\n",
392412
"# Train and test the result\n",
393-
"print(\"Training model. This may take 1 - 2 minutes\")\n",
413+
"print(\"Training model. This might take 1 - 2 minutes\")\n",
394414
"train_accuracy, test_accuracy = fit_and_test_model(rf)\n",
395415
"\n",
396416
"# Print out results, compared to the decision tree\n",

learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-7-exercise-hyperparameters-tuning.ipynb

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
"!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/san_fran_crime.csv\n",
3434
"from sklearn.model_selection import train_test_split\n",
3535
"from sklearn.metrics import balanced_accuracy_score\n",
36-
"import graphing # custom graphing code. See our GitHub repo for details\n",
3736
"\n",
3837
"#Import the data from the .csv file\n",
3938
"dataset = pandas.read_csv('san_fran_crime.csv', delimiter=\"\\t\")\n",
@@ -156,6 +155,7 @@
156155
"metadata": {},
157156
"outputs": [],
158157
"source": [
158+
"import matplotlib.pyplot as plt\n",
159159
"import numpy as np\n",
160160
"\n",
161161
"# Shrink the training set temporarily to explore this\n",
@@ -187,14 +187,21 @@
187187
"\n",
188188
"\n",
189189
"# Plot results\n",
190-
"graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
191-
" min_impurity_decreases,\n",
192-
" label_x=\"Minimum impurity decreases (min_impurity_decrease)\",\n",
193-
" label_y=\"Accuracy\",\n",
194-
" title=\"Performance\", show=True)\n",
190+
"plt.figure(figsize=(10, 6))\n",
191+
"plt.plot(min_impurity_decreases, train_accuracies, marker='o', label='Train')\n",
192+
"plt.plot(min_impurity_decreases, test_accuracies, marker='s', label='Test')\n",
193+
"\n",
194+
"plt.title('Performance')\n",
195+
"plt.xlabel('Minimum impurity decreases (min_impurity_decrease)')\n",
196+
"plt.ylabel('Accuracy')\n",
197+
"plt.xticks(min_impurity_decreases) \n",
198+
"plt.grid(True)\n",
199+
"plt.legend()\n",
200+
"plt.tight_layout()\n",
201+
"plt.show()\n",
195202
"\n",
196203
"# Roll back the train dataset to the full train set\n",
197-
"train = full_trainset\n"
204+
"train = full_trainset"
198205
]
199206
},
200207
{
@@ -249,11 +256,19 @@
249256
"\n",
250257
"\n",
251258
"# Plot results\n",
252-
"graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
253-
" max_features,\n",
254-
" label_x=\"Maximum number of features (max_features)\",\n",
255-
" label_y=\"Accuracy\",\n",
256-
" title=\"Performance\", show=True)\n",
259+
"plt.figure(figsize=(10, 6))\n",
260+
"plt.plot(max_features, train_accuracies, marker='o', label='Train')\n",
261+
"plt.plot(max_features, test_accuracies, marker='s', label='Test')\n",
262+
"\n",
263+
"plt.title('Performance vs. max_features')\n",
264+
"plt.xlabel('Maximum number of features (max_features)')\n",
265+
"plt.ylabel('Accuracy')\n",
266+
"plt.xticks(max_features) \n",
267+
"plt.grid(True)\n",
268+
"plt.legend()\n",
269+
"plt.tight_layout()\n",
270+
"plt.show()\n",
271+
"\n",
257272
"\n",
258273
"# Roll back the trainset to the full set\n",
259274
"train = full_trainset"
@@ -313,11 +328,18 @@
313328
"\n",
314329
"\n",
315330
"# Plot results\n",
316-
"graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
317-
" seeds,\n",
318-
" label_x=\"Seed value (random_state)\",\n",
319-
" label_y=\"Accuracy\",\n",
320-
" title=\"Performance\", show=True)\n",
331+
"\n",
332+
"plt.figure(figsize=(10, 6))\n",
333+
"plt.plot(seeds, train_accuracies, label='Train', alpha=0.7)\n",
334+
"plt.plot(seeds, test_accuracies, label='Test', alpha=0.7)\n",
335+
"\n",
336+
"plt.title('Performance vs. Random Seed')\n",
337+
"plt.xlabel('Seed value (random_state)')\n",
338+
"plt.ylabel('Accuracy')\n",
339+
"plt.grid(True)\n",
340+
"plt.legend()\n",
341+
"plt.tight_layout()\n",
342+
"plt.show()\n",
321343
"\n",
322344
"# Roll back the trainset to the full set\n",
323345
"train = full_trainset"

0 commit comments

Comments
 (0)