
Commit a95b42b

optimize pytorch yolo loops
Signed-off-by: Farhan Ahmed <[email protected]>
1 parent b2c8c2a commit a95b42b

File tree

3 files changed: +171 -65 lines changed

art/estimators/object_detection/pytorch_object_detector.py

Lines changed: 64 additions & 0 deletions
@@ -24,6 +24,7 @@
 import numpy as np
 
 from art.estimators.object_detection.object_detector import ObjectDetectorMixin
+from art.estimators.object_detection.utils import cast_inputs_to_pt
 from art.estimators.pytorch import PyTorchEstimator
 
 if TYPE_CHECKING:
@@ -180,6 +181,69 @@ def device(self) -> "torch.device":
         """
         return self._device
 
+    def _preprocess_and_convert_inputs(
+        self,
+        x: Union[np.ndarray, "torch.Tensor"],
+        y: Optional[List[Dict[str, Union[np.ndarray, "torch.Tensor"]]]] = None,
+        fit: bool = False,
+        no_grad: bool = True,
+    ) -> Tuple["torch.Tensor", List[Dict[str, "torch.Tensor"]]]:
+        """
+        Apply preprocessing on inputs `(x, y)` and convert to tensors, if needed.
+
+        :param x: Samples of shape NCHW or NHWC.
+        :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input
+                  image. The fields of the Dict are as follows:
+
+                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                  - labels [N]: the labels for each image.
+        :param fit: `True` if the function is called before fit/training and `False` if the function is called
+                    before a predict operation.
+        :param no_grad: `True` if no gradients are required.
+        :return: Preprocessed inputs `(x, y)` as tensors.
+        """
+        import torch
+
+        if self.clip_values is not None:
+            norm_factor = self.clip_values[1]
+        else:
+            norm_factor = 1.0
+
+        if self.all_framework_preprocessing:
+            # Convert samples into tensor
+            x_tensor, y_tensor = cast_inputs_to_pt(x, y)
+
+            if not self.channels_first:
+                x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))
+            x_tensor /= norm_factor
+
+            # Set gradients
+            if not no_grad:
+                x_tensor.requires_grad = True
+
+            # Apply framework-specific preprocessing
+            x_preprocessed, y_preprocessed = self._apply_preprocessing(x=x_tensor, y=y_tensor, fit=fit, no_grad=no_grad)
+
+        elif isinstance(x, np.ndarray):
+            # Apply preprocessing
+            x_preprocessed, y_preprocessed = self._apply_preprocessing(x=x, y=y, fit=fit, no_grad=no_grad)
+
+            # Convert inputs into tensor
+            x_preprocessed, y_preprocessed = cast_inputs_to_pt(x_preprocessed, y_preprocessed)
+
+            if not self.channels_first:
+                x_preprocessed = torch.permute(x_preprocessed, (0, 3, 1, 2))
+            x_preprocessed /= norm_factor
+
+            # Set gradients
+            if not no_grad:
+                x_preprocessed.requires_grad = True
+
+        else:
+            raise NotImplementedError("Combination of inputs and preprocessing not supported.")
+
+        return x_preprocessed, y_preprocessed
+
     def _get_losses(
         self, x: np.ndarray, y: List[Dict[str, Union[np.ndarray, "torch.Tensor"]]]
     ) -> Tuple[Dict[str, "torch.Tensor"], List["torch.Tensor"], List["torch.Tensor"]]:
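For orientation, here is a minimal standalone sketch (not ART's public API; the shapes and the 255.0 clip value are illustrative assumptions) of the channels-last conversion this new method performs:

    import numpy as np
    import torch

    x = np.random.randint(0, 256, size=(2, 416, 416, 3)).astype(np.float32)  # NHWC samples
    norm_factor = 255.0  # stands in for clip_values[1]

    x_tensor = torch.from_numpy(x)                    # cast to tensor, still on CPU
    x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))  # NHWC -> NCHW
    x_tensor /= norm_factor                           # scale into [0, 1]

    assert x_tensor.shape == (2, 3, 416, 416)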

art/estimators/object_detection/pytorch_yolo.py

Lines changed: 55 additions & 64 deletions
@@ -26,6 +26,7 @@
 import numpy as np
 
 from art.estimators.object_detection.object_detector import ObjectDetectorMixin
+from art.estimators.object_detection.utils import cast_inputs_to_pt
 from art.estimators.pytorch import PyTorchEstimator
 
 if TYPE_CHECKING:
@@ -296,28 +297,12 @@ def _preprocess_and_convert_inputs(
             norm_factor = 1.0
 
         if self.all_framework_preprocessing:
-            if isinstance(x, np.ndarray):
-                # Convert samples into tensor
-                x_tensor = torch.from_numpy(x / norm_factor).to(self.device)
-            else:
-                x_tensor = (x / norm_factor).to(self.device)
+            # Convert samples into tensor
+            x_tensor, y_tensor = cast_inputs_to_pt(x, y)
 
             if not self.channels_first:
                 x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))
-
-            # Convert targets into tensor
-            if y is not None and isinstance(y[0]["boxes"], np.ndarray):
-                y_tensor = []
-                for y_i in y:
-                    y_t = {
-                        "boxes": torch.from_numpy(y_i["boxes"]).to(device=self.device, dtype=torch.float32),
-                        "labels": torch.from_numpy(y_i["labels"]).to(device=self.device, dtype=torch.int64),
-                    }
-                    if "masks" in y_i:
-                        y_t["masks"] = torch.from_numpy(y_i["masks"]).to(device=self.device, dtype=torch.uint8)
-                    y_tensor.append(y_t)
-            else:
-                y_tensor = y  # type: ignore
+            x_tensor /= norm_factor
 
             # Set gradients
             if not no_grad:
@@ -328,33 +313,19 @@ def _preprocess_and_convert_inputs(
 
         elif isinstance(x, np.ndarray):
             # Apply preprocessing
-            x_preprocessed, y_preprocessed = self._apply_preprocessing(x, y=y, fit=fit, no_grad=no_grad)
+            x_preprocessed, y_preprocessed = self._apply_preprocessing(x=x, y=y, fit=fit, no_grad=no_grad)
 
-            # Convert samples into tensor
-            x_preprocessed = torch.from_numpy(x_preprocessed / norm_factor).to(self.device)
+            # Convert inputs into tensor
+            x_preprocessed, y_preprocessed = cast_inputs_to_pt(x_preprocessed, y_preprocessed)
 
             if not self.channels_first:
                 x_preprocessed = torch.permute(x_preprocessed, (0, 3, 1, 2))
+            x_preprocessed /= norm_factor
 
             # Set gradients
             if not no_grad:
                 x_preprocessed.requires_grad = True
 
-            # Convert targets into tensor
-            if y_preprocessed is not None and isinstance(y_preprocessed[0]["boxes"], np.ndarray):
-                y_preprocessed_tensor = []
-                for y_i in y_preprocessed:
-                    y_preprocessed_t = {
-                        "boxes": torch.from_numpy(y_i["boxes"]).to(device=self.device, dtype=torch.float32),
-                        "labels": torch.from_numpy(y_i["labels"]).to(device=self.device, dtype=torch.int64),
-                    }
-                    if "masks" in y_i:
-                        y_preprocessed_t["masks"] = torch.from_numpy(y_i["masks"]).to(
-                            device=self.device, dtype=torch.uint8
-                        )
-                    y_preprocessed_tensor.append(y_preprocessed_t)
-                y_preprocessed = y_preprocessed_tensor
-
         else:
             raise NotImplementedError("Combination of inputs and preprocessing not supported.")

@@ -380,6 +351,7 @@ def _get_losses(
         x_preprocessed, y_preprocessed = self._preprocess_and_convert_inputs(x=x, y=y, fit=False, no_grad=False)
         x_grad = x_preprocessed
 
+        # Extract height and width
         if self.channels_first:
            height = self.input_shape[1]
            width = self.input_shape[2]
@@ -389,7 +361,7 @@ def _get_losses(
 
         labels_t = translate_labels_x1y1x2y2_to_xcycwh(labels_x1y1x2y2=y_preprocessed, height=height, width=width)
 
-        loss_components = self._model(x_grad, labels_t)
+        loss_components = self._model(x_grad.to(self.device), labels_t.to(self.device))
 
         return loss_components, x_grad
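Because `cast_inputs_to_pt` leaves tensors on the CPU, the forward pass now moves them to the estimator's device at call time. A tiny sketch of the idiom (device and shapes are illustrative; `.to(device)` copies only when the tensor is not already there):

    import torch

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    x_grad = torch.rand(2, 3, 416, 416)  # produced on CPU by the conversion helper
    labels_t = torch.rand(2, 6)          # placeholder for the translated labels

    x_dev = x_grad.to(device)            # no-op if already on the target device
    labels_dev = labels_t.to(device)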

@@ -463,30 +435,34 @@ def predict(self, x: np.ndarray, batch_size: int = 128, **kwargs) -> List[Dict[s
           - scores [N]: the scores of each prediction.
         """
         import torch
+        from torch.utils.data import TensorDataset, DataLoader
 
         # Set model to evaluation mode
         self._model.eval()
 
         # Apply preprocessing and convert to tensors
         x_preprocessed, _ = self._preprocess_and_convert_inputs(x=x, y=None, fit=False, no_grad=True)
 
-        predictions: List[Dict[str, np.ndarray]] = []
+        # Create dataloader
+        dataset = TensorDataset(x_preprocessed)
+        dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)
 
+        # Extract height and width
         if self.channels_first:
             height = self.input_shape[1]
             width = self.input_shape[2]
         else:
             height = self.input_shape[0]
             width = self.input_shape[1]
 
-        # Run prediction
-        num_batch = int(np.ceil(len(x_preprocessed) / float(batch_size)))
-        for m in range(num_batch):
-            # Batch using indices
-            i_batch = x_preprocessed[m * batch_size : (m + 1) * batch_size]
+        predictions: List[Dict[str, np.ndarray]] = []
+        for (x_batch,) in dataloader:
+            # Move inputs to device
+            x_batch = x_batch.to(self._device)
 
+            # Run prediction
             with torch.no_grad():
-                predictions_xcycwh = self._model(i_batch)
+                predictions_xcycwh = self._model(x_batch.to(self.device))
 
             predictions_x1y1x2y2 = translate_predictions_xcycwh_to_x1y1x2y2(
                 y_pred_xcycwh=predictions_xcycwh, height=height, width=width
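A standalone sketch of the batching change in `predict()`: manual index slicing is replaced by a `TensorDataset`/`DataLoader` pair. Each item yielded is a 1-tuple because the dataset wraps a single tensor; shapes and batch size are illustrative:

    import torch
    from torch.utils.data import TensorDataset, DataLoader

    x_preprocessed = torch.rand(10, 3, 416, 416)
    dataloader = DataLoader(TensorDataset(x_preprocessed), batch_size=4, shuffle=False)

    for (x_batch,) in dataloader:
        print(x_batch.shape)  # [4, 3, 416, 416] for full batches, [2, ...] for the last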
@@ -533,6 +509,8 @@ def fit( # pylint: disable=W0221
         :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently supported for PyTorch
                        and providing it takes no effect.
         """
+        import torch
+        from torch.utils.data import Dataset, DataLoader
 
         # Set model to train mode
         self._model.train()
@@ -541,41 +519,54 @@ def fit( # pylint: disable=W0221
             raise ValueError("An optimizer is needed to train the model, but none for provided.")
 
         # Apply preprocessing and convert to tensors
-        x_preprocessed, y_preprocessed_list = self._preprocess_and_convert_inputs(x=x, y=y, fit=True, no_grad=True)
-
-        # Cast to np.ndarray to use list indexing
-        y_preprocessed = np.asarray(y_preprocessed_list)
+        x_preprocessed, y_preprocessed = self._preprocess_and_convert_inputs(x=x, y=y, fit=True, no_grad=True)
+
+        class ObjectDetectorDataset(Dataset):
+            def __init__(self, x, y):
+                self.x = x
+                self.y = y
+
+            def __len__(self):
+                return len(self.x)
+
+            def __getitem__(self, idx):
+                return self.x[idx], self.y[idx]
+
+        # Create dataloader
+        dataset = ObjectDetectorDataset(x_preprocessed, y_preprocessed)
+        dataloader = DataLoader(
+            dataset=dataset,
+            batch_size=batch_size,
+            shuffle=True,
+            drop_last=drop_last,
+            collate_fn=lambda batch: list(zip(*batch)),
+        )
 
+        # Extract height and width
         if self.channels_first:
             height = self.input_shape[1]
             width = self.input_shape[2]
         else:
             height = self.input_shape[0]
             width = self.input_shape[1]
 
-        num_batch = len(x_preprocessed) / float(batch_size)
-        if drop_last:
-            num_batch = int(np.floor(num_batch))
-        else:
-            num_batch = int(np.ceil(num_batch))
-        ind = np.arange(len(x_preprocessed))
-
         # Start training
         for _ in range(nb_epochs):
-            # Shuffle the examples
-            np.random.shuffle(ind)
-
             # Train for one epoch
-            for m in range(num_batch):
-                i_batch = x_preprocessed[ind[m * batch_size : (m + 1) * batch_size]]
-                o_batch = y_preprocessed[ind[m * batch_size : (m + 1) * batch_size]]
+            for x_batch, y_batch in dataloader:
+                # Convert labels to YOLO format
+                x_batch = torch.stack(x_batch)
+                y_batch = translate_labels_x1y1x2y2_to_xcycwh(labels_x1y1x2y2=y_batch, height=height, width=width)
+
+                # Move inputs to device
+                x_batch = x_batch.to(self.device)
+                y_batch = y_batch.to(self.device)
 
                 # Zero the parameter gradients
                 self._optimizer.zero_grad()
 
                 # Form the loss function
-                labels_t = translate_labels_x1y1x2y2_to_xcycwh(labels_x1y1x2y2=o_batch, height=height, width=width)
-                loss_components = self._model(i_batch, labels_t)
+                loss_components = self._model(x_batch, y_batch)
                 if isinstance(loss_components, dict):
                     loss = sum(loss_components.values())
                 else:
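The key design choice in the new `fit()` loop is the collate function: object detection targets are per-image dicts with a variable number of boxes, so the default collate cannot stack them into one tensor. `collate_fn=lambda batch: list(zip(*batch))` instead yields a tuple of images and a tuple of target dicts, and the loop stacks the images itself. A standalone sketch with illustrative shapes (the Dataset class mirrors the inline one above):

    import torch
    from torch.utils.data import Dataset, DataLoader

    class ToyDetectionDataset(Dataset):  # hypothetical stand-in for ObjectDetectorDataset
        def __init__(self, x, y):
            self.x = x
            self.y = y

        def __len__(self):
            return len(self.x)

        def __getitem__(self, idx):
            return self.x[idx], self.y[idx]

    x = torch.rand(4, 3, 416, 416)
    y = [{"boxes": torch.rand(n, 4), "labels": torch.zeros(n, dtype=torch.int64)} for n in (1, 2, 3, 4)]

    loader = DataLoader(
        ToyDetectionDataset(x, y),
        batch_size=2,
        shuffle=True,
        collate_fn=lambda batch: list(zip(*batch)),
    )

    for x_batch, y_batch in loader:
        x_batch = torch.stack(x_batch)  # tuple of CHW tensors -> NCHW batch
        print(x_batch.shape, [t["boxes"].shape[0] for t in y_batch])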

art/estimators/object_detection/utils.py

Lines changed: 52 additions & 1 deletion
@@ -18,10 +18,14 @@
 """
 This module contains utility functions for object detection.
 """
-from typing import Dict, List
+from typing import Dict, List, Any, Tuple, Union, Optional, TYPE_CHECKING
 
 import numpy as np
 
+if TYPE_CHECKING:
+    # pylint: disable=C0412
+    import torch
+
 
 def convert_tf_to_pt(y: List[Dict[str, np.ndarray]], height: int, width: int) -> List[Dict[str, np.ndarray]]:
     """
@@ -88,3 +92,50 @@ def convert_pt_to_tf(y: List[Dict[str, np.ndarray]], height: int, width: int) ->
         y[i]["labels"] = y[i]["labels"] - 1
 
     return y
+
+
+def cast_inputs_to_pt(
+    x: np.ndarray,
+    y: Optional[List[Dict[str, np.ndarray]]] = None,
+) -> Tuple["torch.Tensor", List[Dict[str, "torch.Tensor"]]]:
+    """
+    Cast object detection inputs `(x, y)` to PyTorch tensors.
+
+    :param x: Samples of shape NCHW or NHWC.
+    :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
+              The fields of the Dict are as follows:
+
+              - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+              - labels [N]: the labels for each image.
+    :return: Object detection inputs `(x, y)` as tensors.
+    """
+    import torch
+
+    # Convert images into tensor
+    if isinstance(x, np.ndarray):
+        x_tensor = torch.from_numpy(x)
+    else:
+        x_tensor = x
+
+    # Convert labels into tensor
+    if y is not None and isinstance(y, list) and isinstance(y[0]["boxes"], np.ndarray):
+        y_tensor = []
+        for y_i in y:
+            y_t = {
+                "boxes": torch.from_numpy(y_i["boxes"]).to(dtype=torch.float32),
+                "labels": torch.from_numpy(y_i["labels"]).to(dtype=torch.int64),
+            }
+            if "masks" in y_i:
+                y_t["masks"] = torch.from_numpy(y_i["masks"]).to(dtype=torch.uint8)
+            y_tensor.append(y_t)
+    elif y is not None and isinstance(y, dict):
+        y_tensor = []
+        for i in range(y["boxes"].shape[0]):
+            y_t = {}
+            y_t["boxes"] = y["boxes"][i]
+            y_t["labels"] = y["labels"][i]
+            y_tensor.append(y_t)
+    else:
+        y_tensor = y  # type: ignore
+
+    return x_tensor, y_tensor
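A minimal usage sketch of the new helper (assumes this commit's ART is importable; shapes and label values are illustrative):

    import numpy as np
    from art.estimators.object_detection.utils import cast_inputs_to_pt

    x = np.random.rand(2, 416, 416, 3).astype(np.float32)
    y = [
        {"boxes": np.array([[10.0, 10.0, 50.0, 50.0]], dtype=np.float32), "labels": np.array([1])},
        {"boxes": np.array([[20.0, 30.0, 60.0, 90.0]], dtype=np.float32), "labels": np.array([2])},
    ]

    x_t, y_t = cast_inputs_to_pt(x, y)
    print(type(x_t))               # <class 'torch.Tensor'>
    print(y_t[0]["boxes"].dtype)   # torch.float32
    print(y_t[0]["labels"].dtype)  # torch.int64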
