@@ -58,10 +58,10 @@ def translate_predictions_xcycwh_to_x1y1x2y2(
     for y_pred in y_pred_xcycwh:
         boxes = torch.vstack(
             [
-                torch.maximum((y_pred[:, 0] - y_pred[:, 2] / 2), torch.tensor(0).to(device)),
-                torch.maximum((y_pred[:, 1] - y_pred[:, 3] / 2), torch.tensor(0).to(device)),
-                torch.minimum((y_pred[:, 0] + y_pred[:, 2] / 2), torch.tensor(input_height).to(device)),
-                torch.minimum((y_pred[:, 1] + y_pred[:, 3] / 2), torch.tensor(input_width).to(device)),
+                torch.maximum((y_pred[:, 0] - y_pred[:, 2] / 2), torch.tensor(0, device=device)),
+                torch.maximum((y_pred[:, 1] - y_pred[:, 3] / 2), torch.tensor(0, device=device)),
+                torch.minimum((y_pred[:, 0] + y_pred[:, 2] / 2), torch.tensor(input_height, device=device)),
+                torch.minimum((y_pred[:, 1] + y_pred[:, 3] / 2), torch.tensor(input_width, device=device)),
             ]
         ).permute((1, 0))
         labels = torch.argmax(y_pred[:, 5:], dim=1, keepdim=False)
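The only substantive change in this hunk is how the clamp bounds reach the device: `torch.tensor(0, device=device)` allocates the scalar directly on the target device, while the old `torch.tensor(0).to(device)` first builds a CPU tensor and then copies it. A minimal sketch of the clamped conversion, using a hypothetical standalone helper (the real function also carries scores and class labels):

```python
import torch

def xcycwh_to_x1y1x2y2(y_pred: torch.Tensor, input_height: int, input_width: int) -> torch.Tensor:
    """Convert [xc, yc, w, h] boxes to clamped [x1, y1, x2, y2] (hypothetical helper)."""
    device = y_pred.device
    zero = torch.tensor(0, device=device)  # allocated on-device, no CPU round trip
    return torch.vstack(
        [
            torch.maximum(y_pred[:, 0] - y_pred[:, 2] / 2, zero),
            torch.maximum(y_pred[:, 1] - y_pred[:, 3] / 2, zero),
            torch.minimum(y_pred[:, 0] + y_pred[:, 2] / 2, torch.tensor(input_height, device=device)),
            torch.minimum(y_pred[:, 1] + y_pred[:, 3] / 2, torch.tensor(input_width, device=device)),
        ]
    ).permute((1, 0))

# a box centred at (8, 8) with size 20x20 gets clamped into a 16x16 image
print(xcycwh_to_x1y1x2y2(torch.tensor([[8.0, 8.0, 20.0, 20.0]]), 16, 16))
# tensor([[ 0.,  0., 16., 16.]])
```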
@@ -106,10 +106,10 @@ def translate_labels_x1y1x2y2_to_xcycwh(
         labels[:, 3:6:2] /= input_height
 
         # convert from x1y1x2y2 to xcycwh
-        labels[:, 4] = labels[:, 4] - labels[:, 2]
-        labels[:, 5] = labels[:, 5] - labels[:, 3]
-        labels[:, 2] = labels[:, 2] + labels[:, 4] / 2
-        labels[:, 3] = labels[:, 3] + labels[:, 5] / 2
+        labels[:, 4] -= labels[:, 2]
+        labels[:, 5] -= labels[:, 3]
+        labels[:, 2] += labels[:, 4] / 2
+        labels[:, 3] += labels[:, 5] / 2
         labels_xcycwh_list.append(labels)
 
     labels_xcycwh = torch.vstack(labels_xcycwh_list)
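The in-place forms are only equivalent to the old assignments because of the update order: columns 4 and 5 are overwritten from (x2, y2) to (w, h) first, and the centre columns then reuse those freshly written values. A small check, assuming the same column layout the diff's indices imply (columns 2-5 hold x1, y1, x2, y2):

```python
import torch

# assumed layout, matching the diff's indices: columns 2..5 hold x1, y1, x2, y2
labels = torch.tensor([[0.0, 0.0, 0.1, 0.2, 0.5, 0.8]])

labels[:, 4] -= labels[:, 2]      # w  = x2 - x1 = 0.4
labels[:, 5] -= labels[:, 3]      # h  = y2 - y1 = 0.6
labels[:, 2] += labels[:, 4] / 2  # xc = x1 + w / 2 = 0.3
labels[:, 3] += labels[:, 5] / 2  # yc = y1 + h / 2 = 0.5

print(labels)  # tensor([[0.0000, 0.0000, 0.3000, 0.5000, 0.4000, 0.6000]])
```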
@@ -148,13 +148,13 @@ def __init__(
         Initialization.
 
         :param model: Object detection model wrapped as demonstrated in examples/get_started_yolo.py.
-                      The output of the model is `List[Dict[Tensor]]`, one for each input
-                      image. The fields of the Dict are as follows:
+                      The output of the model is `List[Dict[str, torch.Tensor]]`, one for each input image.
+                      The fields of the Dict are as follows:
 
-                      - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                        between 0 and H and 0 and W
-                      - labels (Int64Tensor[N]): the predicted labels for each image
-                      - scores (Tensor[N]): the scores or each prediction
+                      - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and
+                        0 <= y1 < y2 <= H.
+                      - labels [N]: the labels for each image
+                      - scores [N]: the scores of each prediction.
         :param input_shape: The shape of one input sample.
         :param optimizer: The optimizer for training the classifier.
         :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
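The corrected annotation `List[Dict[str, torch.Tensor]]` pins down both the key and value types of each per-image dict. For reference, one conforming entry might look like this (values purely illustrative):

```python
import torch

prediction = {
    "boxes": torch.tensor([[12.0, 8.0, 96.0, 120.0]]),  # [N, 4] in x1y1x2y2
    "labels": torch.tensor([3], dtype=torch.int64),      # [N]
    "scores": torch.tensor([0.87]),                      # [N]
}
assert 0 <= prediction["boxes"][0, 0] < prediction["boxes"][0, 2]  # 0 <= x1 < x2
```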
@@ -274,44 +274,37 @@ def _preprocess_and_convert_inputs(
         :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
                   The fields of the Dict are as follows:
 
-                  - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                    between 0 and H and 0 and W
-                  - labels (Int64Tensor[N]): the predicted labels for each image
-                  - scores (Tensor[N]): the scores or each prediction.
+                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                  - labels [N]: the labels for each image
+                  - scores [N]: the scores of each prediction.
         :param fit: `True` if the function is called before fit/training and `False` if the function is called before a
                     predict operation.
         :param no_grad: `True` if no gradients required.
         :return: Preprocessed inputs `(x, y)` as tensors.
         """
         import torch
-        import torchvision
 
         if self.clip_values is not None:
             norm_factor = self.clip_values[1]
         else:
             norm_factor = 1.0
 
-        transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
-
         if self.all_framework_preprocessing:
             if isinstance(x, np.ndarray):
                 # Convert samples into tensor
-                if self.channels_first:
-                    x_tensor = torch.from_numpy(x / norm_factor).to(self.device)
-                else:
-                    x_tensor = torch.stack([transform(x_i / norm_factor).to(self.device) for x_i in x])
+                x_tensor = torch.from_numpy(x / norm_factor).to(self.device)
             else:
-                if self.channels_first:
-                    x_tensor = x.to(self.device)
-                else:
-                    x_tensor = torch.permute(x, (0, 3, 1, 2)).to(self.device)
+                x_tensor = (x / norm_factor).to(self.device)
+
+            if not self.channels_first:
+                x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))
 
             # Convert targets into tensor
             if y is not None and isinstance(y[0]["boxes"], np.ndarray):
                 y_tensor = []
                 for y_i in y:
                     y_t = {
-                        "boxes": torch.from_numpy(y_i["boxes"]).to(device=self.device, dtype=torch.float),
+                        "boxes": torch.from_numpy(y_i["boxes"]).to(device=self.device, dtype=torch.float32),
                         "labels": torch.from_numpy(y_i["labels"]).to(device=self.device, dtype=torch.int64),
                     }
                     if "masks" in y_i:
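The rewritten branch collapses four cases into two steps applied to numpy and tensor inputs alike: scale by `norm_factor`, then permute once if the input is channels-last. Note the tensor path now also applies the normalization that the numpy path already had. A sketch of those two steps in isolation, assuming NHWC input and `clip_values == (0, 255)`:

```python
import numpy as np
import torch

x = np.random.randint(0, 256, size=(4, 416, 416, 3)).astype(np.float32)  # NHWC batch
norm_factor = 255.0  # assuming clip_values == (0, 255)

x_tensor = torch.from_numpy(x / norm_factor)      # scale into [0, 1]
x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))  # NHWC -> NCHW, one permute for all cases

print(x_tensor.shape)  # torch.Size([4, 3, 416, 416])
```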
@@ -332,10 +325,10 @@ def _preprocess_and_convert_inputs(
             x_preprocessed, y_preprocessed = self._apply_preprocessing(x, y=y, fit=fit, no_grad=no_grad)
 
             # Convert samples into tensor
-            if self.channels_first:
-                x_preprocessed = torch.from_numpy(x_preprocessed / norm_factor).to(self.device)
-            else:
-                x_preprocessed = torch.stack([transform(x_i / norm_factor).to(self.device) for x_i in x_preprocessed])
+            x_preprocessed = torch.from_numpy(x_preprocessed / norm_factor).to(self.device)
+
+            if not self.channels_first:
+                x_preprocessed = torch.permute(x_preprocessed, (0, 3, 1, 2))
 
             # Set gradients
             if not no_grad:
@@ -346,7 +339,7 @@ def _preprocess_and_convert_inputs(
                 y_preprocessed_tensor = []
                 for y_i in y_preprocessed:
                     y_preprocessed_t = {
-                        "boxes": torch.from_numpy(y_i["boxes"]).to(device=self.device, dtype=torch.float),
+                        "boxes": torch.from_numpy(y_i["boxes"]).to(device=self.device, dtype=torch.float32),
                         "labels": torch.from_numpy(y_i["labels"]).to(device=self.device, dtype=torch.int64),
                     }
                     if "masks" in y_i:
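`torch.float` is an alias for `torch.float32`, so the dtype edits in this hunk and the earlier one are purely cosmetic; the explicit name just states the intended precision outright:

```python
import numpy as np
import torch

assert torch.float is torch.float32  # alias: the spelling change is cosmetic
boxes = torch.from_numpy(np.zeros((1, 4))).to(dtype=torch.float32)  # float64 -> float32
print(boxes.dtype)  # torch.float32
```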
@@ -371,9 +364,9 @@ def _get_losses(
         :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
                   The fields of the Dict are as follows:
 
-                  - boxes (FloatTensor[N, 4]): the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and
-                    0 <= y1 < y2 <= H.
-                  - labels (Int64Tensor[N]): the labels for each image
+                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                  - labels [N]: the labels for each image
+                  - scores [N]: the scores of each prediction.
         :return: Loss gradients of the same shape as `x`.
         """
         self._model.train()
@@ -407,10 +400,9 @@ def loss_gradient(  # pylint: disable=W0613
         :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
                   The fields of the Dict are as follows:
 
-                  - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                    between 0 and H and 0 and W
-                  - labels (Int64Tensor[N]): the predicted labels for each image
-                  - scores (Tensor[N]): the scores or each prediction.
+                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                  - labels [N]: the labels for each image
+                  - scores [N]: the scores of each prediction.
         :return: Loss gradients of the same shape as `x`.
         """
         import torch
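A hedged usage sketch of `loss_gradient`, assuming an already constructed `PyTorchYolo` estimator named `detector` (hypothetical) and numpy targets in the documented format:

```python
import numpy as np

x = np.zeros((2, 416, 416, 3), dtype=np.float32)  # NHWC samples
y = [
    {
        "boxes": np.array([[10.0, 10.0, 50.0, 80.0]], dtype=np.float32),
        "labels": np.array([1], dtype=np.int64),
    }
    for _ in range(2)
]

grads = detector.loss_gradient(x=x, y=y)  # `detector` assumed constructed elsewhere
assert grads.shape == x.shape             # gradients come back in the shape of x
```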
@@ -461,12 +453,12 @@ def predict(self, x: np.ndarray, batch_size: int = 128, **kwargs) -> List[Dict[s
 
         :param x: Samples of shape NCHW or NHWC.
         :param batch_size: Batch size.
-        :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
-                  The fields of the Dict are as follows:
+        :return: Predictions of format `List[Dict[str, np.ndarray]]`, one for each input image. The fields of the Dict
+                 are as follows:
 
-                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
-                  - labels [N]: the labels for each image
-                  - scores [N]: the scores or each prediction.
+                 - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                 - labels [N]: the labels for each image
+                 - scores [N]: the scores of each prediction.
         """
         import torch
 
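With the docstring now describing the return value rather than a stray `:param y:`, a short hypothetical call for orientation (`detector` and `x` as in the sketch above; the 0.5 threshold is arbitrary):

```python
predictions = detector.predict(x=x, batch_size=16)  # List[Dict[str, np.ndarray]]
for pred in predictions:
    keep = pred["scores"] >= 0.5  # arbitrary confidence cut-off
    print(pred["boxes"][keep], pred["labels"][keep], pred["scores"][keep])
```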
@@ -528,10 +520,9 @@ def fit(  # pylint: disable=W0221
         :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
                   The fields of the Dict are as follows:
 
-                  - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                    between 0 and H and 0 and W
-                  - labels (Int64Tensor[N]): the predicted labels for each image
-                  - scores (Tensor[N]): the scores or each prediction.
+                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                  - labels [N]: the labels for each image
+                  - scores [N]: the scores of each prediction.
         :param batch_size: Size of batches.
         :param nb_epochs: Number of epochs to use for training.
         :param drop_last: Set to ``True`` to drop the last incomplete batch, if the dataset size is not divisible by
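A hypothetical training call matching the documented target format (reusing the `detector`, `x`, and `y` sketched above):

```python
# drop_last=True discards a final batch smaller than batch_size
detector.fit(x, y, batch_size=8, nb_epochs=5, drop_last=True)
```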
@@ -612,10 +603,9 @@ def compute_losses(
         :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
                   The fields of the Dict are as follows:
 
-                  - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                    between 0 and H and 0 and W
-                  - labels (Int64Tensor[N]): the predicted labels for each image
-                  - scores (Tensor[N]): the scores or each prediction.
+                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                  - labels [N]: the labels for each image
+                  - scores [N]: the scores of each prediction.
         :return: Dictionary of loss components.
         """
         loss_components, _ = self._get_losses(x=x, y=y)
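`compute_losses` surfaces the individual terms rather than their sum; the exact key names depend on the wrapped YOLO model's loss dictionary, so treat them as placeholders here:

```python
loss_components = detector.compute_losses(x=x, y=y)  # dict of named loss terms
for name, value in loss_components.items():          # e.g. box/objectness/class terms
    print(name, float(value))
```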
@@ -634,10 +624,9 @@ def compute_loss(  # type: ignore
         :param y: Target values of format `List[Dict[str, Union[np.ndarray, torch.Tensor]]]`, one for each input image.
                   The fields of the Dict are as follows:
 
-                  - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                    between 0 and H and 0 and W
-                  - labels (Int64Tensor[N]): the predicted labels for each image
-                  - scores (Tensor[N]): the scores or each prediction.
+                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
+                  - labels [N]: the labels for each image
+                  - scores [N]: the scores of each prediction.
         :return: Loss.
         """
         import torch
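And the scalar counterpart, e.g. for logging (same hypothetical `detector`, `x`, and `y`):

```python
loss = detector.compute_loss(x=x, y=y)  # single aggregated loss value
print(float(loss))
```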