Skip to content

Commit cc491ad

Browse files
committed
docs: Add detailed comments to data preprocessing notebook
Added comprehensive comments explaining the data loading, preprocessing, labeling, shuffling, and saving workflow to improve code documentation.
1 parent e79a18e commit cc491ad

File tree

1 file changed

+10
-59
lines changed

1 file changed

+10
-59
lines changed

Model/data_preprocess.ipynb

Lines changed: 10 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -14,98 +14,55 @@
1414
"id": "3b0a337f",
1515
"metadata": {},
1616
"outputs": [],
17-
"source": [
18-
"import numpy as np\n",
19-
"import cv2 as cv\n",
20-
"import random\n",
21-
"import os\n",
22-
"import matplotlib.pyplot as plt\n",
23-
"import pickle\n",
24-
"%matplotlib inline"
25-
]
17+
"source": "# Import necessary libraries\nimport numpy as np # For numerical operations and array handling\nimport cv2 as cv # OpenCV for image processing\nimport random # For shuffling data\nimport os # For file and directory operations\nimport matplotlib.pyplot as plt # For data visualization\nimport pickle # For saving processed data to disk\n%matplotlib inline"
2618
},
2719
{
2820
"cell_type": "code",
2921
"execution_count": null,
3022
"id": "850322e9",
3123
"metadata": {},
3224
"outputs": [],
33-
"source": [
34-
"DIRECTORY = r'C:\\Users\\Arjun M S\\Desktop\\build-from-home\\dataset'\n",
35-
"# a raw string(r'...') is used to treat backslash(\\) as a normal character\n",
36-
"CATEGORIES = ['mammooty','mohanlal','random']"
37-
]
25+
"source": "# Define the dataset directory path\nDIRECTORY = r'C:\\Users\\Arjun M S\\Desktop\\build-from-home\\dataset'\n# A raw string (r'...') is used to treat backslash (\\) as a normal character\n\n# Define the three classes for classification\nCATEGORIES = ['mammooty','mohanlal','random']"
3826
},
3927
{
4028
"cell_type": "code",
4129
"execution_count": null,
4230
"id": "f01e99e7",
4331
"metadata": {},
4432
"outputs": [],
45-
"source": [
46-
"\n",
47-
"IMG_SIZE = 224\n",
48-
"\n",
49-
"data = []\n",
50-
"\n",
51-
"# labelling the data\n",
52-
"for category in CATEGORIES:\n",
53-
" folder = os.path.join(DIRECTORY, category)\n",
54-
" for img in os.listdir(folder):\n",
55-
" img_path = os.path.join(folder, img)\n",
56-
" label = CATEGORIES.index(category)\n",
57-
" img_arr = cv.imread(img_path, cv.IMREAD_GRAYSCALE)\n",
58-
" img_arr = cv.resize(img_arr,(IMG_SIZE,IMG_SIZE))\n",
59-
" data.append([img_arr,label])\n"
60-
]
33+
"source": "# Set the target image size for model input\nIMG_SIZE = 224\n\n# Initialize list to store image data and labels\ndata = []\n\n# Process and label all images from the dataset\nfor category in CATEGORIES:\n # Build path to category folder\n folder = os.path.join(DIRECTORY, category)\n \n # Iterate through all images in the category folder\n for img in os.listdir(folder):\n # Build full path to image file\n img_path = os.path.join(folder, img)\n \n # Assign numeric label based on category index (0: mammooty, 1: mohanlal, 2: random)\n label = CATEGORIES.index(category)\n \n # Read image in grayscale mode\n img_arr = cv.imread(img_path, cv.IMREAD_GRAYSCALE)\n \n # Resize image to standard size (224x224)\n img_arr = cv.resize(img_arr,(IMG_SIZE,IMG_SIZE))\n \n # Append [image, label] pair to data list\n data.append([img_arr,label])"
6134
},
6235
{
6336
"cell_type": "code",
6437
"execution_count": null,
6538
"id": "738cc5fa",
6639
"metadata": {},
6740
"outputs": [],
68-
"source": [
69-
"# to shuffle the data\n",
70-
"random.shuffle(data)"
71-
]
41+
"source": "# Shuffle the data to randomize the order\n# This ensures the model doesn't learn based on the order of data\nrandom.shuffle(data)"
7242
},
7343
{
7444
"cell_type": "code",
7545
"execution_count": null,
7646
"id": "f56a39be",
7747
"metadata": {},
7848
"outputs": [],
79-
"source": [
80-
"X = []\n",
81-
"y = []\n",
82-
"\n",
83-
"for features, labels in data:\n",
84-
" X.append(features)\n",
85-
" y.append(labels)"
86-
]
49+
"source": "# Separate features (X) and labels (y)\nX = [] # Will store image arrays (features)\ny = [] # Will store corresponding labels\n\n# Extract features and labels from the data list\nfor features, labels in data:\n X.append(features)\n y.append(labels)"
8750
},
8851
{
8952
"cell_type": "code",
9053
"execution_count": null,
9154
"id": "1a78a065",
9255
"metadata": {},
9356
"outputs": [],
94-
"source": [
95-
"# changed X & y into arrays and stored it in respective variables\n",
96-
"X = np.array(X)\n",
97-
"y = np.array(y)"
98-
]
57+
"source": "# Convert lists to NumPy arrays for efficient computation\n# X contains all image data, y contains all corresponding labels\nX = np.array(X)\ny = np.array(y)"
9958
},
10059
{
10160
"cell_type": "code",
10261
"execution_count": null,
10362
"id": "093080fa",
10463
"metadata": {},
10564
"outputs": [],
106-
"source": [
107-
"X"
108-
]
65+
"source": "# Display the features array\nX"
10966
},
11067
{
11168
"cell_type": "code",
@@ -115,9 +72,7 @@
11572
"scrolled": true
11673
},
11774
"outputs": [],
118-
"source": [
119-
"y"
120-
]
75+
"source": "# Display the labels array\ny"
12176
},
12277
{
12378
"cell_type": "code",
@@ -127,11 +82,7 @@
12782
"scrolled": true
12883
},
12984
"outputs": [],
130-
"source": [
131-
"# storing the data in a pickle file\n",
132-
"X = pickle.dump(X,open('X.pkl', 'wb'))\n",
133-
"y = pickle.dump(y,open('y.pkl', 'wb'))\n"
134-
]
85+
"source": "# Save the preprocessed data to pickle files for later use\n# Pickle files preserve the NumPy array format and can be loaded quickly\nX = pickle.dump(X,open('X.pkl', 'wb')) # Save features\ny = pickle.dump(y,open('y.pkl', 'wb')) # Save labels"
13586
},
13687
{
13788
"cell_type": "code",
@@ -168,4 +119,4 @@
168119
},
169120
"nbformat": 4,
170121
"nbformat_minor": 5
171-
}
122+
}

0 commit comments

Comments (0)