Merge pull request #50571 from ShawnKupfer/WB1797

JillGrant615 · web-flow · commit d065c4947a35 · 2025-05-20T16:29:33.000-06:00
AB#1037347: Fix graphing code in Select and customize architectures a…
diff --git a/learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-3-exercise-decision-trees.ipynb b/learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-3-exercise-decision-trees.ipynb
@@ -54,28 +54,56 @@
    },
    "outputs": [],
    "source": [
-    "import graphing # custom graphing code. See our GitHub repo for details\n",
     "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
     "\n",
     "# Crime category\n",
-    "graphing.multiple_histogram(dataset, label_x='Category', label_group=\"Resolution\", histfunc='sum', show=True)\n",
+    "dataset.groupby(['Category', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
+    "plt.xlabel('Category')\n",
+    "plt.ylabel('Count')\n",
+    "plt.title('Crimes by Category and Resolution')\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
     "\n",
     "# District\n",
-    "graphing.multiple_histogram(dataset, label_group=\"Resolution\", label_x=\"PdDistrict\", show=True)\n",
+    "dataset.groupby(['PdDistrict', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
+    "plt.xlabel('Police District')\n",
+    "plt.ylabel('Count')\n",
+    "plt.title('Crimes by District and Resolution')\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
     "\n",
     "# Map of crimes\n",
-    "graphing.scatter_2D(dataset, label_x=\"X\", label_y=\"Y\", label_colour=\"Resolution\", title=\"GPS Coordinates\", size_multiplier=0.8, show=True)\n",
+    "import seaborn as sns\n",
     "\n",
-    "# Day of the week\n",
-    "graphing.multiple_histogram(dataset, label_group=\"Resolution\", label_x=\"DayOfWeek\", show=True)\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "sns.scatterplot(data=dataset, x='X', y='Y', hue='Resolution', alpha=0.6, s=8 * 0.8)  # size_multiplier=0.8\n",
+    "plt.title('GPS Coordinates')\n",
+    "plt.xlabel('Longitude')\n",
+    "plt.ylabel('Latitude')\n",
+    "plt.legend(loc='best', title='Resolution')\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
     "\n",
-    "# day of the year\n",
+    "# Day of the week\n",
+    "dataset.groupby(['DayOfWeek', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
+    "plt.xlabel('Day of the Week')\n",
+    "plt.ylabel('Count')\n",
+    "plt.title('Crimes by Day of the Week and Resolution')\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# week of the year\n",
     "# For graphing we simplify this to week or the graph becomes overwhelmed with bars\n",
     "dataset[\"week_of_year\"] = np.round(dataset.day_of_year / 7.0)\n",
-    "graphing.multiple_histogram(dataset, \n",
-    "                    label_x='week_of_year',\n",
-    "                    label_group='Resolution',\n",
-    "                    histfunc='sum', show=True)\n",
+    "\n",
+    "dataset.groupby(['week_of_year', 'Resolution']).size().unstack().plot(kind='bar', stacked=False)\n",
+    "plt.xlabel('Week of the Year')\n",
+    "plt.ylabel('Count')\n",
+    "plt.title('Crimes by Week and Resolution')\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
     "del dataset[\"week_of_year\"]"
    ]
   },
@@ -308,20 +336,44 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Temporarily shrink the training set to something\n",
-    "# more realistic\n",
+    "# Temporarily shrink the training set to 10000\n",
+    "# for this exercise to see how pruning is important\n",
+    "# even with moderately large datasets\n",
     "full_training_set = train\n",
-    "train = train[:100]\n",
+    "train = train[:10000]\n",
     "\n",
-    "# fit the same tree as before\n",
-    "model = sklearn.tree.DecisionTreeClassifier(random_state=1, max_depth=100)\n",
     "\n",
-    "# Assess on the same test set as before\n",
-    "train_accuracy, test_accuracy = fit_and_test_model(model)\n",
-    "print(\"Train accuracy\", train_accuracy)\n",
-    "print(\"Test accuracy\", test_accuracy)\n",
+    "# Loop through the values below and build a model\n",
+    "# each time, setting the maximum depth to that value \n",
+    "max_depth_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,15, 20, 50, 100]\n",
+    "accuracy_trainset = []\n",
+    "accuracy_testset = []\n",
+    "for depth in max_depth_range:\n",
+    "    # Create and fit the model\n",
+    "    prune_model = sklearn.tree.DecisionTreeClassifier(random_state=1, max_depth=depth)\n",
     "\n",
-    "# Roll the training set back to the full set\n",
+    "    # Calculate and record its accuracy\n",
+    "    train_accuracy, test_accuracy = fit_and_test_model(prune_model)\n",
+    "    accuracy_trainset.append(train_accuracy)\n",
+    "    accuracy_testset.append(test_accuracy)\n",
+    "\n",
+    "# Plot the accuracy as a function of depth\n",
+    "pruned_plot = pandas.DataFrame(dict(max_depth=max_depth_range, accuracy=accuracy_trainset))\n",
+    "\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(max_depth_range, accuracy_trainset, marker='o', label='Train Accuracy')\n",
+    "plt.plot(max_depth_range, accuracy_testset, marker='s', label='Test Accuracy')\n",
+    "\n",
+    "plt.title('Model Accuracy vs. Decision Tree Depth')\n",
+    "plt.xlabel('Max Depth')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.xticks(max_depth_range)\n",
+    "plt.legend()\n",
+    "plt.grid(True)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Roll the training set back to the full thing\n",
     "train = full_training_set"
    ]
   },
@@ -377,7 +429,18 @@
     "# Plot the sensitivity as a function of depth  \n",
     "pruned_plot = pandas.DataFrame(dict(max_depth=max_depth_range, accuracy=accuracy_trainset))\n",
     "\n",
-    "fig = graphing.line_2D(dict(train=accuracy_trainset, test=accuracy_testset), x_range=max_depth_range, show=True)\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(max_depth_range, accuracy_trainset, marker='o', label='Train Accuracy')\n",
+    "plt.plot(max_depth_range, accuracy_testset, marker='s', label='Test Accuracy')\n",
+    "\n",
+    "plt.title('Model Accuracy vs. Decision Tree Depth')\n",
+    "plt.xlabel('Max Depth')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.xticks(max_depth_range)\n",
+    "plt.legend()\n",
+    "plt.grid(True)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
     "\n",
     "# Roll the training set back to the full thing\n",
     "train = full_training_set"
diff --git a/learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-5-exercise-random-forests.ipynb b/learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-5-exercise-random-forests.ipynb
@@ -34,7 +34,6 @@
     "!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/san_fran_crime.csv\n",
     "import numpy as np\n",
     "from sklearn.model_selection import train_test_split\n",
-    "import graphing # custom graphing code. See our GitHub repo for details\n",
     "\n",
     "# Import the data from the .csv file\n",
     "dataset = pandas.read_csv('san_fran_crime.csv', delimiter=\"\\t\")\n",
@@ -185,7 +184,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import graphing\n",
+    "import matplotlib.pyplot as plt\n",
     "\n",
     "# n_estimators states how many trees to put in the model\n",
     "# We will make one model for every entry in this list\n",
@@ -213,11 +212,18 @@
     "\n",
     "\n",
     "# Plot results\n",
-    "graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
-    "                    n_estimators,\n",
-    "                    label_x=\"Numer of trees (n_estimators)\",\n",
-    "                    label_y=\"Accuracy\",\n",
-    "                    title=\"Performance X number of trees\", show=True)"
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(n_estimators, train_accuracies, marker='o', label='Train')\n",
+    "plt.plot(n_estimators, test_accuracies, marker='s', label='Test')\n",
+    "\n",
+    "plt.title('Performance vs. Number of Trees')\n",
+    "plt.xlabel('Number of trees (n_estimators)')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.xticks(n_estimators) \n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
    ]
   },
   {
@@ -279,13 +285,20 @@
     "\n",
     "\n",
     "# Plot results\n",
-    "graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
-    "                    min_samples_split,\n",
-    "                    label_x=\"Minimum samples split (min_samples_split)\",\n",
-    "                    label_y=\"Accuracy\",\n",
-    "                    title=\"Performance\", show=True)\n",
-    "\n",
-    "# Rol back the trainset to the full set\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(min_samples_split, train_accuracies, marker='o', label='Train')\n",
+    "plt.plot(min_samples_split, test_accuracies, marker='s', label='Test')\n",
+    "\n",
+    "plt.title('Performance vs. min_samples_split')\n",
+    "plt.xlabel('Minimum samples split (min_samples_split)')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.xticks(min_samples_split) \n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Roll back the trainset to the full set\n",
     "train = full_trainset"
    ]
   },
@@ -347,13 +360,20 @@
     "\n",
     "\n",
     "# Plot results\n",
-    "graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies),\n",
-    "                    max_depths,\n",
-    "                    label_x=\"Maximum depth (max_depths)\",\n",
-    "                    label_y=\"Accuracy\",\n",
-    "                    title=\"Performance\", show=True)\n",
-    "\n",
-    "# Rol back the trainset to the full set\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(max_depths, train_accuracies, marker='o', label='Train')\n",
+    "plt.plot(max_depths, test_accuracies, marker='s', label='Test')\n",
+    "\n",
+    "plt.title('Performance vs. max_depth')\n",
+    "plt.xlabel('Maximum depth (max_depths)')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.xticks(max_depths)  # Ensure all depth values appear on the x-axis\n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Roll back the trainset to the full set\n",
     "train = full_trainset"
    ]
   },
@@ -390,7 +410,7 @@
     "                            verbose=False)\n",
     "\n",
     "# Train and test the result\n",
-    "print(\"Training model. This may take 1 - 2 minutes\")\n",
+    "print(\"Training model. This might take 1 - 2 minutes\")\n",
     "train_accuracy, test_accuracy = fit_and_test_model(rf)\n",
     "\n",
     "# Print out results, compared to the decision tree\n",
diff --git a/learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-7-exercise-hyperparameters-tuning.ipynb b/learn-pr/azure/machine-learning-architectures-and-hyperparameters/notebooks/7-7-exercise-hyperparameters-tuning.ipynb
@@ -33,7 +33,6 @@
     "!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/san_fran_crime.csv\n",
     "from sklearn.model_selection import train_test_split\n",
     "from sklearn.metrics import balanced_accuracy_score\n",
-    "import graphing # custom graphing code. See our GitHub repo for details\n",
     "\n",
     "#Import the data from the .csv file\n",
     "dataset = pandas.read_csv('san_fran_crime.csv', delimiter=\"\\t\")\n",
@@ -156,6 +155,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "\n",
     "# Shrink the training set temporarily to explore this\n",
@@ -187,14 +187,21 @@
     "\n",
     "\n",
     "# Plot results\n",
-    "graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
-    "                    min_impurity_decreases,\n",
-    "                    label_x=\"Minimum impurity decreases (min_impurity_decrease)\",\n",
-    "                    label_y=\"Accuracy\",\n",
-    "                    title=\"Performance\", show=True)\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(min_impurity_decreases, train_accuracies, marker='o', label='Train')\n",
+    "plt.plot(min_impurity_decreases, test_accuracies, marker='s', label='Test')\n",
+    "\n",
+    "plt.title('Performance')\n",
+    "plt.xlabel('Minimum impurity decreases (min_impurity_decrease)')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.xticks(min_impurity_decreases)  \n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
     "\n",
     "# Roll back the train dataset to the full train set\n",
-    "train = full_trainset\n"
+    "train = full_trainset"
    ]
   },
   {
@@ -249,11 +256,19 @@
     "\n",
     "\n",
     "# Plot results\n",
-    "graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
-    "                    max_features,\n",
-    "                    label_x=\"Maximum number of features (max_features)\",\n",
-    "                    label_y=\"Accuracy\",\n",
-    "                    title=\"Performance\", show=True)\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(max_features, train_accuracies, marker='o', label='Train')\n",
+    "plt.plot(max_features, test_accuracies, marker='s', label='Test')\n",
+    "\n",
+    "plt.title('Performance vs. max_features')\n",
+    "plt.xlabel('Maximum number of features (max_features)')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.xticks(max_features) \n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
     "\n",
     "# Roll back the trainset to the full set\n",
     "train = full_trainset"
@@ -313,11 +328,18 @@
     "\n",
     "\n",
     "# Plot results\n",
-    "graphing.line_2D(dict(Train=train_accuracies, Test=test_accuracies), \n",
-    "                    seeds,\n",
-    "                    label_x=\"Seed value (random_state)\",\n",
-    "                    label_y=\"Accuracy\",\n",
-    "                    title=\"Performance\", show=True)\n",
+    "\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(seeds, train_accuracies, label='Train', alpha=0.7)\n",
+    "plt.plot(seeds, test_accuracies, label='Test', alpha=0.7)\n",
+    "\n",
+    "plt.title('Performance vs. Random Seed')\n",
+    "plt.xlabel('Seed value (random_state)')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
     "\n",
     "# Roll back the trainset to the full set\n",
     "train = full_trainset"