Added Documentation for Data Loader

animikhaich · animikhaich · commit 94ee052daa66 · 2021-04-06T08:21:03.000+05:30
diff --git a/utils/data_loader.py b/utils/data_loader.py
@@ -14,20 +14,55 @@
 import matplotlib.pyplot as plt
 import tensorflow as tf
 from glob import glob
-import numpy as np
 
 
 # TODO: Add Augmentations from Albumentations (https://github.com/albumentations-team/albumentations)
 # TODO: Add Tunable Augmentation Loading from a Config File
-# TODO: Add Check for num_min_samples
 
 
 class ImageClassificationDataLoader:
-    __supported_im_formats = [".jpg", ".jpeg", ".png"]
+    """
+    Data Loader for Image Classification
+
+    - Optimized Tf.Data implementation for maximum GPU usage
+    - Automatically handle errors such as corrupted images
+    - Built-in Dataset Verification
+    - Built-in Checks for if dataset is of a supported format
+    - Supports Auto Detect Sub-folders get class information
+    - Auto Generate Class Label Map
+    - Built in Image Augmentation
+    - Dataset Batch Visualization (With and Without Augment)
+
+
+    Raises:
+        ValueError: Dataset Directory Path is Invalid
+        ValueError: Raise when unsupported files are detected
+        ValueError: Raise when Number of images are less than minimum specified
+        AssertionError: get_batch_size method is called without initializing dataset_generator
+    """
+
+    __supported_im_formats = [".jpg", ".jpeg", ".png", ".bmp"]
 
     def __init__(
-        self, data_dir, image_dims=(224, 224), grayscale=False, num_min_samples=500
+        self,
+        data_dir: str,
+        image_dims: tuple = (224, 224),
+        grayscale: bool = False,
+        num_min_samples: int = 500,
     ) -> None:
+        """
+        __init__
+
+        - Instance Variable Initialization
+        - Dataset Verification
+        - Listing all files in the given path
+
+        Args:
+            data_dir (str): Path to the Dataset Directory
+            image_dims (tuple, optional): Image Dimensions (width & height). Defaults to (224, 224).
+            grayscale (bool, optional): If Grayscale, Select Single Channel, else RGB. Defaults to False.
+            num_min_samples (int, optional): Minimum Number of Required Images per Class. Defaults to 500.
+        """
 
         self.BATCH_SIZE = None
         self.LABELS = []
@@ -44,14 +79,26 @@ def __init__(
         )
 
     def __dataset_verification(self) -> bool:
+        """
+        __dataset_verification
+
+        Dataset Verification & Checks
+
+        Raises:
+            ValueError: Dataset Directory Path is Invalid
+            ValueError: Raise when unsupported files are detected
+            ValueError: Raise when Number of images are less than minimum specified
+        Returns:
+            bool: True if all checks are passed
+        """
         # Check if the given directory is a valid directory path
         if not os.path.isdir(self.DATA_DIR):
             raise ValueError(f"Data Directory Path is Invalid")
 
         # Assume the directory names as label names and get the label names
         self.LABELS = self.extract_labels()
 
-        # Check if all files in each folder is an image. If not, raise an alert
+        # Check if all files in each folder is an image
         format_issues = {}
         quant_issues = {}
         for label in self.LABELS:
@@ -68,12 +115,13 @@ def __dataset_verification(self) -> bool:
 
             quant_issues[label] = len(paths)
 
-        # Check if any of the dict values has any entry. If any entry is there, raise alert
+        # Check if any of the classes have files that are not supported
         if any([len(format_issues[key]) for key in format_issues.keys()]):
             raise ValueError(
                 f"Invalid File(s) Detected: {format_issues}\n\nSupported Formats: {self.__supported_im_formats}"
             )
 
+        # Check if any of the classes have number of images less than the minimum
         if any(
             [quant_issues[key] < self.NUM_MIN_SAMPLES for key in quant_issues.keys()]
         ):
@@ -89,6 +137,14 @@ def __dataset_verification(self) -> bool:
         return True
 
     def extract_labels(self) -> list:
+        """
+        extract_labels
+
+        Extract the labels from the directory path (Folder Names)
+
+        Returns:
+            list: List of Class Labels
+        """
         labels = [
             label
             for label in os.listdir(self.DATA_DIR)
@@ -97,10 +153,36 @@ def extract_labels(self) -> list:
         return labels
 
     def get_encoded_labels(self, file_path) -> list:
+        """
+        get_encoded_labels
+
+        Get the One-Hot Version of the Labels
+
+        Args:
+            file_path (str): Complete path of the Image
+
+        Returns:
+            list: One Hot Encoded Labelmap
+        """
         parts = tf.strings.split(file_path, os.path.sep)
         return parts[-2] == self.LABELS
 
     def load_image(self, file_path) -> tuple:
+        """
+        load_image
+
+        Read Image for Tf.Data
+        - Read File & Load the Image
+        - Decode the Image (jpg, png, bmp)
+        - Normalize & Resize the Image
+        - Match Input & Label and return
+
+        Args:
+            file_path (str): Path of the Image File
+
+        Returns:
+            tuple: Image and One-Hot Encoded Label
+        """
         label = self.get_encoded_labels(file_path)
         img = tf.io.read_file(file_path)
         img = tf.io.decode_image(
@@ -112,6 +194,24 @@ def load_image(self, file_path) -> tuple:
         return img, tf.cast(label, tf.float32)
 
     def augment_batch(self, image, label) -> tuple:
+        """
+        augment_batch
+
+        Image Augmentation for Training:
+        - Random Contrast
+        - Random Brightness
+        - Random Hue (Color)
+        - Random Saturation
+        - Random Horizontal Flip
+        - Random Reduction in Image Quality
+
+        Args:
+            image (Tensor Image): Raw Image
+            label (Tensor): One-Hot Encoded Label
+
+        Returns:
+            tuple: Raw Image, One-Hot Encoded Label
+        """
         if tf.random.normal([1]) < 0:
             image = tf.image.random_contrast(image, 0.2, 0.9)
         if tf.random.normal([1]) < 0:
@@ -127,27 +227,98 @@ def augment_batch(self, image, label) -> tuple:
         return image, label
 
     def get_supported_formats(self) -> str:
+        """
+        get_supported_formats
+
+        Get the Supported Image Formats (String)
+
+        Returns:
+            str: Supported Image Formats
+        """
         return f"Supported File Extensions: {', '.join(self.__supported_im_formats)}"
 
     def get_supported_formats_list(self) -> list:
+        """
+        get_supported_formats
+
+        Get the Supported Image Formats (List)
+
+        Returns:
+            list: Supported Image Formats
+        """
         return self.__supported_im_formats
 
     def get_labelmap(self) -> dict:
+        """
+        get_labelmap
+
+        Get the Labelmap for the Classes
+        Returns a List of Dictionaries containing the details
+        Sample:
+        [
+            {
+                "id": 0,
+                "name": "male"
+            },
+            {
+                "id": 1,
+                "name": "female"
+            }
+        ]
+
+        Returns:
+            dict: Labelmap (ID and Label)
+        """
         labelmap = []
         for i, label in enumerate(self.LABELS):
             labelmap.append({"id": i, "name": label})
         return labelmap
 
     def get_labels(self) -> list:
+        """
+        get_labels
+
+        Get List of Labels (Class Names)
+
+        Returns:
+            list: List of Labels (Class Names)
+        """
         return self.LABELS
 
     def get_num_classes(self) -> int:
+        """
+        get_num_classes
+
+        Get Total Number of Classes
+
+        Returns:
+            int: Number of Classes (Labels)
+        """
         return len(self.LABELS)
 
     def get_dataset_size(self) -> int:
+        """
+        get_dataset_size
+
+        Get the Dataset Size (Number of Images)
+
+        Returns:
+            int: Total Number of images in Dataset
+        """
         return len(list(self.dataset_files))
 
     def get_num_steps(self) -> int:
+        """
+        get_num_steps
+
+        Get the Number of Steps Required per Batch for Training
+
+        Raises:
+            AssertionError: Dataset Generator needs to be Initialized First
+
+        Returns:
+            int: Number of Steps Required for Training Per Batch
+        """
         if self.BATCH_SIZE is None:
             raise AssertionError(
                 f"Batch Size is not Initialized. Call this method only after calling: {self.dataset_generator}"
@@ -156,6 +327,18 @@ def get_num_steps(self) -> int:
         return num_steps
 
     def dataset_generator(self, batch_size=32, augment=False):
+        """
+        dataset_generator
+
+        Create the Data Loader Pipeline and Return a Generator to Generate Datsets
+
+        Args:
+            batch_size (int, optional): Batch Size. Defaults to 32.
+            augment (bool, optional): Enable/Disable Augmentation. Defaults to False.
+
+        Returns:
+            Tf.Data Generator: Dataset Generator
+        """
         self.BATCH_SIZE = batch_size
 
         dataset = self.dataset_files.map(
@@ -174,6 +357,16 @@ def dataset_generator(self, batch_size=32, augment=False):
         return dataset
 
     def visualize_batch(self, augment=True) -> None:
+        """
+        visualize_batch
+
+        Dataset Sample Visualization
+        - Supports Augmentation
+        - Automatically Adjusts for Grayscale Images
+
+        Args:
+            augment (bool, optional): Enable/Disable Augmentation. Defaults to True.
+        """
         if self.NUM_CHANNELS == 1:
             cmap = "gray"
         else: