Skip to content

Commit cc491ad

Browse files
committed
docs: Add detailed comments to data preprocessing notebook
Added comprehensive comments explaining the data loading, preprocessing, labeling, shuffling, and saving workflow to improve code documentation.
1 parent e79a18e commit cc491ad

File tree

1 file changed

+10
-59
lines changed

1 file changed

+10
-59
lines changed

Model/data_preprocess.ipynb

Lines changed: 10 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -14,98 +14,55 @@
1414
"id": "3b0a337f",
1515
"metadata": {},
1616
"outputs": [],
17-
"source": [
18-
"import numpy as np\n",
19-
"import cv2 as cv\n",
20-
"import random\n",
21-
"import os\n",
22-
"import matplotlib.pyplot as plt\n",
23-
"import pickle\n",
24-
"%matplotlib inline"
25-
]
17+
"source": "# Import necessary libraries\nimport numpy as np # For numerical operations and array handling\nimport cv2 as cv # OpenCV for image processing\nimport random # For shuffling data\nimport os # For file and directory operations\nimport matplotlib.pyplot as plt # For data visualization\nimport pickle # For saving processed data to disk\n%matplotlib inline"
2618
},
2719
{
2820
"cell_type": "code",
2921
"execution_count": null,
3022
"id": "850322e9",
3123
"metadata": {},
3224
"outputs": [],
33-
"source": [
34-
"DIRECTORY = r'C:\\Users\\Arjun M S\\Desktop\\build-from-home\\dataset'\n",
35-
"# a raw string(r'...') is used to treat backslash(\\) as a normal character\n",
36-
"CATEGORIES = ['mammooty','mohanlal','random']"
37-
]
25+
"source": "# Define the dataset directory path\nDIRECTORY = r'C:\\Users\\Arjun M S\\Desktop\\build-from-home\\dataset'\n# A raw string (r'...') is used to treat backslash (\\) as a normal character\n\n# Define the three classes for classification\nCATEGORIES = ['mammooty','mohanlal','random']"
3826
},
3927
{
4028
"cell_type": "code",
4129
"execution_count": null,
4230
"id": "f01e99e7",
4331
"metadata": {},
4432
"outputs": [],
45-
"source": [
46-
"\n",
47-
"IMG_SIZE = 224\n",
48-
"\n",
49-
"data = []\n",
50-
"\n",
51-
"# labelling the data\n",
52-
"for category in CATEGORIES:\n",
53-
" folder = os.path.join(DIRECTORY, category)\n",
54-
" for img in os.listdir(folder):\n",
55-
" img_path = os.path.join(folder, img)\n",
56-
" label = CATEGORIES.index(category)\n",
57-
" img_arr = cv.imread(img_path, cv.IMREAD_GRAYSCALE)\n",
58-
" img_arr = cv.resize(img_arr,(IMG_SIZE,IMG_SIZE))\n",
59-
" data.append([img_arr,label])\n"
60-
]
33+
"source": "# Set the target image size for model input\nIMG_SIZE = 224\n\n# Initialize list to store image data and labels\ndata = []\n\n# Process and label all images from the dataset\nfor category in CATEGORIES:\n # Build path to category folder\n folder = os.path.join(DIRECTORY, category)\n \n # Iterate through all images in the category folder\n for img in os.listdir(folder):\n # Build full path to image file\n img_path = os.path.join(folder, img)\n \n # Assign numeric label based on category index (0: mammooty, 1: mohanlal, 2: random)\n label = CATEGORIES.index(category)\n \n # Read image in grayscale mode\n img_arr = cv.imread(img_path, cv.IMREAD_GRAYSCALE)\n \n # Resize image to standard size (224x224)\n img_arr = cv.resize(img_arr,(IMG_SIZE,IMG_SIZE))\n \n # Append [image, label] pair to data list\n data.append([img_arr,label])"
6134
},
6235
{
6336
"cell_type": "code",
6437
"execution_count": null,
6538
"id": "738cc5fa",
6639
"metadata": {},
6740
"outputs": [],
68-
"source": [
69-
"# to shuffle the data\n",
70-
"random.shuffle(data)"
71-
]
41+
"source": "# Shuffle the data to randomize the order\n# This ensures the model doesn't learn based on the order of data\nrandom.shuffle(data)"
7242
},
7343
{
7444
"cell_type": "code",
7545
"execution_count": null,
7646
"id": "f56a39be",
7747
"metadata": {},
7848
"outputs": [],
79-
"source": [
80-
"X = []\n",
81-
"y = []\n",
82-
"\n",
83-
"for features, labels in data:\n",
84-
" X.append(features)\n",
85-
" y.append(labels)"
86-
]
49+
"source": "# Separate features (X) and labels (y)\nX = [] # Will store image arrays (features)\ny = [] # Will store corresponding labels\n\n# Extract features and labels from the data list\nfor features, labels in data:\n X.append(features)\n y.append(labels)"
8750
},
8851
{
8952
"cell_type": "code",
9053
"execution_count": null,
9154
"id": "1a78a065",
9255
"metadata": {},
9356
"outputs": [],
94-
"source": [
95-
"# changed X & y into arrays and stored it in respective variables\n",
96-
"X = np.array(X)\n",
97-
"y = np.array(y)"
98-
]
57+
"source": "# Convert lists to NumPy arrays for efficient computation\n# X contains all image data, y contains all corresponding labels\nX = np.array(X)\ny = np.array(y)"
9958
},
10059
{
10160
"cell_type": "code",
10261
"execution_count": null,
10362
"id": "093080fa",
10463
"metadata": {},
10564
"outputs": [],
106-
"source": [
107-
"X"
108-
]
65+
"source": "# Display the features array\nX"
10966
},
11067
{
11168
"cell_type": "code",
@@ -115,9 +72,7 @@
11572
"scrolled": true
11673
},
11774
"outputs": [],
118-
"source": [
119-
"y"
120-
]
75+
"source": "# Display the labels array\ny"
12176
},
12277
{
12378
"cell_type": "code",
@@ -127,11 +82,7 @@
12782
"scrolled": true
12883
},
12984
"outputs": [],
130-
"source": [
131-
"# storing the data in a pickle file\n",
132-
"X = pickle.dump(X,open('X.pkl', 'wb'))\n",
133-
"y = pickle.dump(y,open('y.pkl', 'wb'))\n"
134-
]
85+
"source": "# Save the preprocessed data to pickle files for later use\n# Pickle files preserve the NumPy array format and can be loaded quickly\nX = pickle.dump(X,open('X.pkl', 'wb')) # Save features\ny = pickle.dump(y,open('y.pkl', 'wb')) # Save labels"
13586
},
13687
{
13788
"cell_type": "code",
@@ -168,4 +119,4 @@
168119
},
169120
"nbformat": 4,
170121
"nbformat_minor": 5
171-
}
122+
}

0 commit comments

Comments (0)