@@ -295,7 +295,7 @@ def predict(
295295 x_preprocessed , _ = self ._apply_preprocessing (x_ , y = None , fit = False )
296296
297297 # Transform x into the model input space
298- inputs , targets , input_rates , target_sizes , batch_idx = self .transform_model_input (x = x_preprocessed )
298+ inputs , targets , input_rates , target_sizes , batch_idx = self ._transform_model_input (x = x_preprocessed )
299299
300300 # Compute real input sizes
301301 input_sizes = input_rates .mul_ (inputs .size ()[- 1 ]).int ()
@@ -323,7 +323,8 @@ def predict(
323323
324324 # Aggregate results
325325 result_outputs = np .zeros (
326- (x_preprocessed .shape [0 ], result_output_sizes .max (), results [0 ].shape [- 1 ]), dtype = np .float32
326+ shape = (x_preprocessed .shape [0 ], result_output_sizes .max (), results [0 ].shape [- 1 ]),
327+ dtype = config .ART_NUMPY_DTYPE ,
327328 )
328329
329330 for m in range (num_batch ):
@@ -345,7 +346,7 @@ def predict(
345346 # Check if users want transcription outputs
346347 transcription_output = kwargs .get ("transcription_output" )
347348
348- if transcription_output is False :
349+ if transcription_output is None or transcription_output is False :
349350 return result_outputs , result_output_sizes
350351
351352 # Now users want transcription outputs
@@ -381,7 +382,7 @@ def loss_gradient(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
381382 x_preprocessed , y_preprocessed = self ._apply_preprocessing (x_ , y , fit = False )
382383
383384 # Transform data into the model input space
384- inputs , targets , input_rates , target_sizes , batch_idx = self .transform_model_input (
385+ inputs , targets , input_rates , target_sizes , batch_idx = self ._transform_model_input (
385386 x = x_preprocessed , y = y_preprocessed , compute_gradient = True
386387 )
387388
@@ -446,6 +447,9 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
446447
447448 from warpctc_pytorch import CTCLoss
448449
450+ x_ = np .empty (len (x ), dtype = object )
451+ x_ [:] = list (x )
452+
449453 # Put the model in the training mode
450454 self ._model .train ()
451455
@@ -481,7 +485,7 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
481485 o_batch = y_preprocessed [ind [begin :end ]]
482486
483487 # Transform data into the model input space
484- inputs , targets , input_rates , target_sizes , batch_idx = self .transform_model_input (
488+ inputs , targets , input_rates , target_sizes , batch_idx = self ._transform_model_input (
485489 x = i_batch , y = o_batch , compute_gradient = False
486490 )
487491
@@ -512,7 +516,44 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
512516
513517 self ._optimizer .step ()
514518
515- def transform_model_input (
519+ def preprocess_transform_model_input (
520+ self , x : "torch.Tensor" , y : np .ndarray , real_lengths : np .ndarray ,
521+ ) -> Tuple ["torch.Tensor" , "torch.Tensor" , "torch.Tensor" , "torch.Tensor" , List ]:
522+ """
523+ Apply preprocessing and then transform the user input space into the model input space. This function is used
524+ by the ASR attack when attacking the PytorchDeepSpeech estimator, whose defences are invoked through the
525+ `_apply_preprocessing` function.
526+
527+ :param x: Samples of shape (nb_samples, seq_length).
528+ :param y: Target values of shape (nb_samples). Each sample in `y` is a string, and samples may have different
529+ lengths. A possible example of `y` could be: `y = np.array(['SIXTY ONE', 'HELLO'])`.
530+ :param real_lengths: Real lengths of original sequences.
531+ :return: A tuple of inputs and targets in the model space with the original index
532+ `(inputs, targets, input_percentages, target_sizes, batch_idx)`, where:
533+ - inputs: model inputs of shape (nb_samples, nb_frequencies, seq_length).
534+ - targets: ground truth targets of shape (sum over nb_samples of real seq_lengths).
535+ - input_percentages: ratio of each input's real size to the size of the last dimension of inputs.
536+ - target_sizes: list of real seq_lengths.
537+ - batch_idx: original index of inputs.
538+ """
539+ import torch # lgtm [py/repeated-import]
540+
541+ # Apply preprocessing
542+ x_batch = []
543+ for i in range (len (x )):
544+ preprocessed_x_i , _ = self ._apply_preprocessing (x = x [i ], y = None , no_grad = False )
545+ x_batch .append (preprocessed_x_i )
546+
547+ x = torch .stack (x_batch )
548+
549+ # Transform the input space
550+ inputs , targets , input_rates , target_sizes , batch_idx = self ._transform_model_input (
551+ x = x , y = y , compute_gradient = False , tensor_input = True , real_lengths = real_lengths ,
552+ )
553+
554+ return inputs , targets , input_rates , target_sizes , batch_idx
555+
556+ def _transform_model_input (
516557 self ,
517558 x : Union [np .ndarray , "torch.Tensor" ],
518559 y : Optional [np .ndarray ] = None ,
0 commit comments