Skip to content

Commit 8b65e3e

Browse files
committed
Merge branch 'r2.0.0rc0' of github.com:NVIDIA/NeMo into r2.0.0rc0
2 parents 38fcd5f + d8afaba commit 8b65e3e

File tree

20 files changed

+644
-339
lines changed

20 files changed

+644
-339
lines changed

examples/nlp/language_modeling/conf/megatron_quantization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ export:
3131
decoder_type: llama # gptnext, gpt2, llama
3232
inference_tensor_parallel: 1 # Default using 1 TP for inference
3333
inference_pipeline_parallel: 1 # Default using 1 PP for inference
34-
dtype: 16 # Default precision data type
34+
dtype: bf16 # Default precision data type
3535

3636
model_file: llama2-7b-fp16.nemo # Nemo file path
3737
model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved

nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ def _states_to_device(dec_state, device='cpu'):
5858
return dec_state
5959

6060

61+
_DECODER_LENGTHS_NONE_WARNING = "Passing in decoder_lengths=None for CTC decoding is likely to be an error, since it is unlikely that each element of your batch has exactly the same length. decoder_lengths will default to decoder_output.shape[0]."
62+
63+
6164
class GreedyCTCInfer(Typing, ConfidenceMethodMixin):
6265
"""A greedy CTC decoder.
6366
@@ -148,7 +151,7 @@ def __init__(
148151
def forward(
149152
self,
150153
decoder_output: torch.Tensor,
151-
decoder_lengths: torch.Tensor,
154+
decoder_lengths: Optional[torch.Tensor],
152155
):
153156
"""Returns a list of hypotheses given an input batch of the encoder hidden embedding.
154157
Output token is generated auto-regressively.
@@ -167,6 +170,9 @@ def forward(
167170
mode=logging_mode.ONCE,
168171
)
169172

173+
if decoder_lengths is None:
174+
logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE)
175+
170176
with torch.inference_mode():
171177
hypotheses = []
172178
# Process each sequence independently
@@ -213,7 +219,7 @@ def forward(
213219
return (packed_result,)
214220

215221
@torch.no_grad()
216-
def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor):
222+
def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tensor]):
217223
# x: [T, D]
218224
# out_len: [seq_len]
219225

@@ -243,7 +249,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor):
243249
return hypothesis
244250

245251
@torch.no_grad()
246-
def _greedy_decode_labels(self, x: torch.Tensor, out_len: torch.Tensor):
252+
def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor]):
247253
# x: [T]
248254
# out_len: [seq_len]
249255

@@ -370,7 +376,7 @@ def __init__(
370376
def forward(
371377
self,
372378
decoder_output: torch.Tensor,
373-
decoder_lengths: torch.Tensor,
379+
decoder_lengths: Optional[torch.Tensor],
374380
):
375381
"""Returns a list of hypotheses given an input batch of the encoder hidden embedding.
376382
Output token is generated auto-regressively.
@@ -383,11 +389,18 @@ def forward(
383389
Returns:
384390
packed list containing batch number of sentences (Hypotheses).
385391
"""
392+
393+
input_decoder_lengths = decoder_lengths
394+
395+
if decoder_lengths is None:
396+
logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE)
397+
decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0])
398+
386399
if decoder_output.ndim == 2:
387400
hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths)
388401
else:
389402
hypotheses = self._greedy_decode_logprobs_batched(decoder_output, decoder_lengths)
390-
packed_result = pack_hypotheses(hypotheses, decoder_lengths)
403+
packed_result = pack_hypotheses(hypotheses, input_decoder_lengths)
391404
return (packed_result,)
392405

393406
@torch.no_grad()

nemo/collections/nlp/modules/common/megatron/clip_grads.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,28 +142,30 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, use_fsdp=False):
142142
grad_norm = torch.zeros(1, device='cuda', dtype=torch.float32).squeeze()
143143
# Since we will be summing across data parallel groups,
144144
# we need the pow(norm-type).
145-
total_norm = grad_norm ** norm_type
145+
total_norm = grad_norm**norm_type
146146
if use_fsdp:
147147
if len(sharded_grads_for_norm) > 0:
148148
sharded_grad_norm, _ = multi_tensor_applier(
149149
amp_C.multi_tensor_l2norm, dummy_overflow_buf.fill_(0), [sharded_grads_for_norm], False
150150
)
151151
else:
152152
sharded_grad_norm = torch.zeros(1, device='cuda', dtype=torch.float32).squeeze()
153-
total_sharded_norm = sharded_grad_norm ** norm_type
153+
total_sharded_norm = sharded_grad_norm**norm_type
154154
else:
155155
for grad in grads_for_norm:
156156
grad_norm = torch.norm(grad, norm_type)
157-
total_norm += grad_norm ** norm_type
157+
total_norm += grad_norm**norm_type
158158
if use_fsdp:
159159
for grad in sharded_grads_for_norm:
160160
grad_norm = torch.norm(grad, norm_type)
161-
total_sharded_norm += grad_norm ** norm_type
161+
total_sharded_norm += grad_norm**norm_type
162162

163163
if use_fsdp:
164164
# Sum norm of grad shards across data-parallel GPUs.
165165
torch.distributed.all_reduce(
166-
total_sharded_norm, op=torch.distributed.ReduceOp.SUM, group=parallel_state.get_data_parallel_group(),
166+
total_sharded_norm,
167+
op=torch.distributed.ReduceOp.SUM,
168+
group=parallel_state.get_data_parallel_group(with_context_parallel=True),
167169
)
168170
total_norm += total_sharded_norm.squeeze()
169171

nemo/collections/tts/g2p/models/t5.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -46,17 +46,23 @@ class T5G2PModel(G2PModel, Exportable):
4646

4747
@property
4848
def input_types(self) -> Optional[Dict[str, NeuralType]]:
49-
return {
50-
"input_ids": NeuralType(('B', 'T'), TokenIndex()),
51-
"attention_mask": NeuralType(('B', 'T'), MaskType(), optional=True),
52-
"labels": NeuralType(('B', 'T'), LabelsType()),
53-
}
49+
if self._input_types is None:
50+
return {
51+
"input_ids": NeuralType(('B', 'T'), TokenIndex()),
52+
"attention_mask": NeuralType(('B', 'T'), MaskType(), optional=True),
53+
"labels": NeuralType(('B', 'T'), LabelsType()),
54+
}
55+
return self._input_types
5456

5557
@property
5658
def output_types(self) -> Optional[Dict[str, NeuralType]]:
57-
return {"loss": NeuralType((), LossType())}
59+
if self._output_types is None:
60+
return {"loss": NeuralType((), LossType())}
61+
return self._output_types
5862

5963
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
64+
self._input_types = None
65+
self._output_types = None
6066
self.world_size = 1
6167
if trainer is not None:
6268
self.world_size = trainer.num_nodes * trainer.num_devices
@@ -91,7 +97,11 @@ def forward(self, input_ids, attention_mask, labels):
9197
# ===== Training Functions ===== #
9298
def training_step(self, batch, batch_idx):
9399
input_ids, attention_mask, labels = batch
94-
train_loss = self.forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels,)
100+
train_loss = self.forward(
101+
input_ids=input_ids,
102+
attention_mask=attention_mask,
103+
labels=labels,
104+
)
95105

96106
self.log('train_loss', train_loss)
97107
return train_loss
@@ -126,7 +136,10 @@ def _setup_infer_dataloader(self, cfg) -> 'torch.utils.data.DataLoader':
126136

127137
# Functions for inference
128138
@torch.no_grad()
129-
def _infer(self, config: DictConfig,) -> List[int]:
139+
def _infer(
140+
self,
141+
config: DictConfig,
142+
) -> List[int]:
130143
"""
131144
Runs model inference.
132145
@@ -161,7 +174,11 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0, split="val"):
161174
input_ids, attention_mask, labels = batch
162175

163176
# Get loss from forward step
164-
val_loss = self.forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels,)
177+
val_loss = self.forward(
178+
input_ids=input_ids,
179+
attention_mask=attention_mask,
180+
labels=labels,
181+
)
165182

166183
# Get preds from generate function and calculate PER
167184
labels_str = self._tokenizer.batch_decode(
@@ -287,15 +304,8 @@ def _prepare_for_export(self, **kwargs):
287304
}
288305

289306
def _export_teardown(self):
290-
self._input_types = self._output_types = None
291-
292-
@property
293-
def input_types(self):
294-
return self._input_types
295-
296-
@property
297-
def output_types(self):
298-
return self._output_types
307+
self._input_types = None
308+
self._output_types = None
299309

300310
def input_example(self, max_batch=1, max_dim=44):
301311
"""
@@ -307,7 +317,11 @@ def input_example(self, max_batch=1, max_dim=44):
307317
sentence = "Kupil sem si bicikel in mu zamenjal stol."
308318
input_ids = [sentence]
309319
input_encoding = self._tokenizer(
310-
input_ids, padding='longest', max_length=self.max_source_len, truncation=True, return_tensors='pt',
320+
input_ids,
321+
padding='longest',
322+
max_length=self.max_source_len,
323+
truncation=True,
324+
return_tensors='pt',
311325
)
312326
return (input_encoding.input_ids,)
313327

0 commit comments

Comments
 (0)