
Commit 41f23b8

Merge pull request #635 from Trusted-AI/fix_ds_cornercase
Fix deepspeech estimator cornercase
2 parents 4ddb72f + bcd1b3c

3 files changed: +23 −8 lines

art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py

Lines changed: 2 additions & 1 deletion
@@ -145,7 +145,8 @@ def __init__(
         :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                            values are `O0`, `O1`, `O2`, and `O3`.
         :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1.0 due to warp-ctc not supporting
-                           scaling of gradients.
+                           scaling of gradients. If passed as a string, must be a string representing a number,
+                           e.g., "1.0", or the string "dynamic".
         """
         import torch  # lgtm [py/repeated-import]
         from torch.autograd import Variable
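For context, when `use_amp` is enabled the `opt_level` and `loss_scale` arguments are ultimately handed to NVIDIA Apex's AMP initialization. A rough sketch of that call follows; the model, optimizer, and exact wiring are placeholders, not copied from ART:

```python
# Sketch only: how opt_level / loss_scale are typically passed to Apex AMP.
# Assumes NVIDIA Apex is installed and a CUDA device is available.
import torch
from apex import amp

model = torch.nn.Linear(161, 29).cuda()  # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# loss_scale accepts a number (e.g. 1.0) or the string "dynamic".
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale=1.0)
```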

art/estimators/speech_recognition/pytorch_deep_speech.py

Lines changed: 8 additions & 7 deletions
@@ -94,7 +94,8 @@ def __init__(
         :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                            values are `O0`, `O1`, `O2`, and `O3`.
         :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1.0 due to warp-ctc not supporting
-                           scaling of gradients.
+                           scaling of gradients. If passed as a string, must be a string representing a number,
+                           e.g., "1.0", or the string "dynamic".
         :param decoder_type: Decoder type. Either `greedy` or `beam`. This parameter is only used when users want
                              transcription outputs.
         :param lm_path: Path to an (optional) kenlm language model for use with beam search. This parameter is only
@@ -285,7 +286,7 @@ def predict(
         """
         import torch  # lgtm [py/repeated-import]
 
-        x_ = x.copy()
+        x_ = np.array([x_i for x_i in x] + [np.array([0.1]), np.array([0.1, 0.2])])[:-2]
 
         # Put the model in the eval mode
         self._model.eval()
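The replaced `x_ = x.copy()` line is the heart of the corner-case fix: when every sample in `x` happens to have the same length, NumPy stores the batch as a 2-D float array rather than the 1-D object array of per-sample waveforms the estimator expects. Appending two dummy samples of different lengths forces the object dtype, and the dummies are sliced off again. A standalone illustration, not taken from the repository, assuming a pre-1.24 NumPy (as used at the time of this commit) where ragged lists are coerced to `dtype=object` automatically:

```python
import numpy as np

# Two samples of identical length: NumPy builds a rectangular 2-D float array.
x_equal = [np.zeros(100, dtype=np.float32), np.zeros(100, dtype=np.float32)]
print(np.array(x_equal).shape, np.array(x_equal).dtype)   # (2, 100) float32

# The pad-and-trim trick from this commit: the two extra dummy samples have
# different lengths, so NumPy falls back to a 1-D object array, and [:-2]
# removes the dummies again.
x_ = np.array([x_i for x_i in x_equal] + [np.array([0.1]), np.array([0.1, 0.2])])[:-2]
print(x_.shape, x_.dtype)                                  # (2,) object
```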
@@ -370,7 +371,7 @@ def loss_gradient(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
         """
         from warpctc_pytorch import CTCLoss
 
-        x_ = x.copy()
+        x_ = np.array([x_i for x_i in x] + [np.array([0.1]), np.array([0.1, 0.2])])[:-2]
 
         # Put the model in the training mode
         self._model.train()
@@ -432,8 +433,6 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
         """
         import random
 
-        import torch  # lgtm [py/repeated-import]
-
         from warpctc_pytorch import CTCLoss
 
         # Put the model in the training mode
@@ -466,8 +465,10 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
                 )
 
                 # Extract random batch
-                i_batch = x_preprocessed[ind[begin:end]].copy()
-                o_batch = y_preprocessed[ind[begin:end]].copy()
+                i_batch = np.array(
+                    [x_i for x_i in x_preprocessed[ind[begin : end]]] + [np.array([0.1]), np.array([0.1, 0.2])]
+                )[:-2]
+                o_batch = y_preprocessed[ind[begin : end]]
 
                 # Transform data into the model input space
                 inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(
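The same pad-and-trim pattern now appears in `predict`, `loss_gradient`, and `fit`. Purely as an illustration, the pattern could be factored out once; this helper is hypothetical and not part of the commit or of ART:

```python
import numpy as np

def _to_object_array(samples):
    """Hypothetical helper: return a 1-D object array of per-sample arrays,
    even when all samples share the same length."""
    # Two dummy samples of different lengths force dtype=object; trim them off.
    return np.array(list(samples) + [np.array([0.1]), np.array([0.1, 0.2])])[:-2]

# e.g. i_batch = _to_object_array(x_preprocessed[ind[begin:end]])
```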

tests/estimators/speech_recognition/test_pytorch_deep_speech.py

Lines changed: 13 additions & 0 deletions
@@ -183,6 +183,19 @@ def _test_all(self, request, setup_class):
         expected_transcriptions = np.array(["", "", ""])
         assert (expected_transcriptions == transcriptions).all()
 
+        # Test transcription outputs, corner case
+        if request.param is True:
+            transcriptions = self.speech_recognizer_amp.predict(
+                np.array([self.x[0]]), batch_size=2, transcription_output=True
+            )
+        else:
+            transcriptions = self.speech_recognizer.predict(
+                np.array([self.x[0]]), batch_size=2, transcription_output=True
+            )
+
+        expected_transcriptions = np.array([""])
+        assert (expected_transcriptions == transcriptions).all()
+
         # Now test loss gradients
         # Create labels
         y = np.array(["SIX", "HI", "GOOD"])
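The new test exercises exactly that corner case: a one-sample batch, where all waveforms trivially have the same length. A minimal reproduction outside the test harness might look like the sketch below; the pretrained-model name and the availability of the deepspeech.pytorch dependencies are assumptions:

```python
import numpy as np
from art.estimators.speech_recognition import PyTorchDeepSpeech

# Assumes the deepspeech.pytorch dependencies are installed.
speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")

# A single waveform: before this fix, np.array([x]) became a 2-D float array
# and broke the estimator's per-sample preprocessing.
x = np.random.uniform(-1.0, 1.0, size=2000).astype(np.float32)
transcriptions = speech_recognizer.predict(
    np.array([x]), batch_size=1, transcription_output=True
)
print(transcriptions)  # near-silent input typically yields an empty transcription
```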
