|
227 | 227 | ],
|
228 | 228 | "source": [
|
229 | 229 | "df_train = pd.read_csv(\"train.csv\")\n",
|
230 |
| - "df_train.head()" |
| 230 | + "df_train.head()\n", |
| 231 | + "# Each row represents one image; its 784 features are the individual pixel values of that image." |
231 | 232 | ]
|
232 | 233 | },
|
233 | 234 | {
|
|
258 | 259 | }
|
259 | 260 | ],
|
260 | 261 | "source": [
|
261 |
| - "# it is clear that the dataset is not linearly separable and so, we can't employ a linear kernel and must transform input data to higher-dimensional space. \n", |
| 262 | + "# It is clear that the dataset is not linearly separable, so we can't employ a linear kernel and must transform the input data to a higher-dimensional space.\n", |
262 | 263 | "# This is done using the kernel trick when we invoke SVC() from the sklearn library with a non-linear kernel\n",
|
263 | 264 | "sns.pairplot(df_train, x_vars=['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6'], y_vars=['pixel7', 'pixel8', 'pixel9', 'pixel10', 'pixel11', 'pixel12'], hue='label')"
|
264 | 265 | ]
|
|
270 | 271 | "metadata": {},
|
271 | 272 | "outputs": [],
|
272 | 273 | "source": [
|
273 |
| - "# segregating the dataset into features and label.\n", |
| 274 | + "# Separating the dataset into features (X) and labels (y).\n", |
274 | 275 | "X = df_train.drop('label', axis=1)\n",
|
275 | 276 | "X = X.values\n",
|
276 | 277 | "y = df_train['label']\n",
|
|
295 | 296 | "metadata": {},
|
296 | 297 | "outputs": [],
|
297 | 298 | "source": [
|
298 |
| - "# normalizing the data is important when we use svm to avoid giving unintentional priority to a feature owing to increased avg value\n", |
| 299 | + "# Standardizing the features is important when we use an SVM, so that a feature with a larger average value does not get unintentional priority\n", |
299 | 300 | "scaler = StandardScaler()\n",
|
300 | 301 | "X_train = scaler.fit_transform(X_train)\n",
|
301 | 302 | "X_test = scaler.transform(X_test)"
|
|
726 | 727 | }
|
727 | 728 | ],
|
728 | 729 | "source": [
|
729 |
| - "# using default values for c, gamma and the default 'rbf' kernel\n", |
| 730 | + "# Using the default values for C and gamma, and the default 'rbf' kernel\n", |
730 | 731 | "model = SVC()\n",
|
731 | 732 | "model.fit(X_train, y_train)"
|
732 | 733 | ]
|
|
859 | 860 | }
|
860 | 861 | ],
|
861 | 862 | "source": [
|
862 |
| - "df_test = pd.read_csv(\"test.csv\", nrows=15) # for demonstrative purposes, limiting the rows read to speed up the process\n", |
| 863 | + "df_test = pd.read_csv(\"test.csv\", nrows=15) # For demonstrative purposes, limiting the rows read to speed up the process\n", |
863 | 864 | "X_testing = df_test.values\n",
|
864 | 865 | "X_testing = scaler.transform(X_testing)\n",
|
865 | 866 | "y_test_pred = model.predict(X_testing) \n",
|
866 |
| - "random_arr = [random.randint(0, len(df_test)-1) for _ in range(10)] # visualizing 10 test samples, manually checking the accuracy of prediction\n", |
| 867 | + "random_arr = [random.randint(0, len(df_test)-1) for _ in range(10)] # Visualizing 10 test samples, manually checking the accuracy of prediction\n", |
| 868 | + "# We reshape each 784-column row into a 28x28 grid to visualize the digit\n", |
867 | 869 | "for i in random_arr: \n",
|
868 | 870 | " image = np.reshape(X_testing[i], (28, 28))\n",
|
869 | 871 | " plt.title(f'Predicted: {y_test_pred[i]}')\n",
|
|
878 | 880 | "metadata": {},
|
879 | 881 | "outputs": [],
|
880 | 882 | "source": [
|
881 |
| - "# use GridSearchCV to get the best parameters for our svm model that lead to increased accuracy\n", |
| 883 | + "# Use GridSearchCV to find the hyperparameters for our SVM model that give the best accuracy\n", |
882 | 884 | "\n",
|
883 | 885 | "grid = {\n",
|
884 | 886 | " 'C': [0.1, 1, 10, 100, 1000], \n",
|
|
0 commit comments