added more comments for clarity

srii5477 · srii5477 · commit f456afb45490 · 2024-10-06T13:51:34.000+05:30
diff --git a/Prediction Models/Handwritten Digit Recognition/svm.ipynb b/Prediction Models/Handwritten Digit Recognition/svm.ipynb
@@ -227,7 +227,8 @@
    ],
    "source": [
     "df_train = pd.read_csv(\"train.csv\")\n",
-    "df_train.head()"
+    "df_train.head()\n",
+    "# Each row represents one image; its 784 feature columns each hold the value of one pixel of that image."
    ]
   },
   {
@@ -258,7 +259,7 @@
     }
    ],
    "source": [
-    "# it is clear that the dataset is not linearly separable and so, we can't employ a linear kernel and must transform input data to higher-dimensional space. \n",
+    "# The pair plot below indicates that the classes are not linearly separable, so we can't use a linear kernel and must map the input data to a higher-dimensional space.\n",
     "# This is done using kernel trick when we invoke SVC() from the sklearn library with a non-linear kernel\n",
     "sns.pairplot(df_train, x_vars=['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6'], y_vars=['pixel7', 'pixel8', 'pixel9', 'pixel10', 'pixel11', 'pixel12'], hue='label')"
    ]
@@ -270,7 +271,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# segregating the dataset into features and label.\n",
+    "# Separate the dataset into the feature matrix X and the label vector y.\n",
     "X = df_train.drop('label', axis=1)\n",
     "X = X.values\n",
     "y = df_train['label']\n",
@@ -295,7 +296,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# normalizing the data is important when we use svm to avoid giving unintentional priority to a feature owing to increased avg value\n",
+    "# Standardizing the features is important for an SVM: otherwise features with larger magnitudes would unintentionally dominate the model.\n",
     "scaler = StandardScaler()\n",
     "X_train = scaler.fit_transform(X_train)\n",
     "X_test = scaler.transform(X_test)"
@@ -726,7 +727,7 @@
     }
    ],
    "source": [
-    "# using default values for c, gamma and the default 'rbf' kernel\n",
+    "# Using the default values for C and gamma, and the default 'rbf' kernel\n",
     "model = SVC()\n",
     "model.fit(X_train, y_train)"
    ]
@@ -859,11 +860,12 @@
     }
    ],
    "source": [
-    "df_test = pd.read_csv(\"test.csv\", nrows=15) # for demonstrative purposes, limiting the rows read to speed up the process\n",
+    "df_test = pd.read_csv(\"test.csv\", nrows=15) # For demonstration purposes, read only the first 15 rows to speed up the process\n",
     "X_testing = df_test.values\n",
     "X_testing = scaler.transform(X_testing)\n",
     "y_test_pred = model.predict(X_testing) \n",
-    "random_arr = [random.randint(0, len(df_test)-1) for _ in range(10)] # visualizing 10 test samples, manually checking the accuracy of prediction\n",
+    "random_arr = [random.randint(0, len(df_test)-1) for _ in range(10)] # Pick 10 random test indices (repeats are possible) to visualize and manually check the predictions\n",
+    "# Reshape each 784-value row into a 28x28 grid to display the digit\n",
     "for i in random_arr:         \n",
     "    image = np.reshape(X_testing[i], (28, 28))\n",
     "    plt.title(f'Predicted: {y_test_pred[i]}')\n",
@@ -878,7 +880,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# use GridSearchCV to get the best parameters for our svm model that lead to increased accuracy\n",
+    "# Use GridSearchCV to search for the SVM hyperparameters that give the best accuracy\n",
     "\n",
     "grid = {\n",
     "    'C': [0.1, 1, 10, 100, 1000],  \n",