Commit aaa5091

Changed [Frame | FeatureBuffer] to [Request]
Signed-off-by: arushid <arushid@nvidia.com>
1 parent d040e6b commit aaa5091
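
For context: the commit narrows the pipeline APIs from the union type `Frame | FeatureBuffer` to a single `Request` type imported from the same `framing/request` module. The repository's actual definition of `Request` is not shown in this diff; the sketch below is one plausible shape, assuming `Request` is a small base class carrying the only two fields the changed code reads (`stream_id` and `is_last`). The payload fields on `Frame` and `FeatureBuffer` are illustrative, not taken from the source.

```python
# Hypothetical sketch only -- not the actual NeMo source for
# nemo/collections/asr/inference/streaming/framing/request.py.
from dataclasses import dataclass, field

import torch


@dataclass
class Request:
    """One unit of streaming work: a chunk belonging to a single stream."""

    stream_id: int  # which stream this chunk belongs to
    is_last: bool  # True on the final chunk of the stream (end of stream)


@dataclass
class Frame(Request):
    """A chunk of raw audio samples."""

    samples: torch.Tensor = field(default_factory=lambda: torch.empty(0))


@dataclass
class FeatureBuffer(Request):
    """A chunk of precomputed acoustic features (e.g. log-mel)."""

    features: torch.Tensor = field(default_factory=lambda: torch.empty(0))
```

With a shape like this, any mixed list of frames and feature buffers is a valid `list[Request]`, which is what lets the annotations in the diffs below collapse to a single type.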

2 files changed: +36, -35 lines changed

nemo/collections/asr/inference/pipelines/cache_aware_ctc_pipeline.py

Lines changed: 19 additions & 17 deletions

```diff
@@ -29,7 +29,7 @@
 from nemo.collections.asr.inference.streaming.decoders.greedy.greedy_ctc_decoder import CTCGreedyDecoder
 from nemo.collections.asr.inference.streaming.endpointing.greedy.greedy_ctc_endpointing import CTCGreedyEndpointing
 from nemo.collections.asr.inference.streaming.framing.multi_stream import ContinuousBatchedRequestStreamer
-from nemo.collections.asr.inference.streaming.framing.request import FeatureBuffer, Frame
+from nemo.collections.asr.inference.streaming.framing.request import FeatureBuffer, Frame, Request
 from nemo.collections.asr.inference.streaming.framing.request_options import ASRRequestOptions
 from nemo.collections.asr.inference.streaming.state.cache_aware_ctc_state import CacheAwareCTCStreamingState
 from nemo.collections.asr.inference.utils.endpointing_utils import millisecond_to_frames
@@ -214,17 +214,19 @@ def preprocess(self, buffers: list[Tensor], right_paddings: list[int] | None = N
         feature_buffers = torch.cat(feature_buffers).to(self.device)
         return feature_buffers, feature_buffer_lens
 
-    def run_greedy_decoder(self, state: CacheAwareCTCStreamingState, frame: Frame | FeatureBuffer, log_probs: Tensor):
+    def run_greedy_decoder(
+        self, state: CacheAwareCTCStreamingState, request: Request, log_probs: Tensor
+    ):
         """
         Run the greedy CTC decoder on the log_probs and update the state
         Args:
             state: (CacheAwareCTCStreamingState) The state of the stream
-            frame: (Frame | FeatureBuffer) The current frame or feature buffer
-            log_probs: (Tensor) The log probabilities of the current frame
+            request: (Request) The current request (frame or feature buffer)
+            log_probs: (Tensor) The log probabilities of the current request
         Returns:
             (bool) Whether EOU is detected.
         """
-        eou_detected = frame.is_last
+        eou_detected = request.is_last
         last_token = state.label_buffer[-1] if len(state.label_buffer) > 0 else self.blank_id
         cur_output = self.greedy_ctc_decoder(log_probs, compute_confidence=True, previous=last_token)
         state.update_label_buffer(cur_output["labels"])
@@ -242,28 +244,28 @@ def run_greedy_decoder(self, state: CacheAwareCTCStreamingState, frame: Frame |
 
     def decode_log_probs(
         self,
-        frames: list[Frame | FeatureBuffer],
+        requests: list[Request],
         log_probs: Tensor,
         tail_log_probs: Tensor | None,
         ready_state_ids: set,
     ) -> None:
         """
         Decode the log probabilities and update the state
         Args:
-            frames: (list[Frame | FeatureBuffer]) List of frames or feature buffers to transcribe.
+            requests: (list[Request]) List of requests (frames or feature buffers) to transcribe.
             log_probs: (Tensor) Log probabilities.
             tail_log_probs: (Tensor | None) Tail log probabilities.
             ready_state_ids: (set) Set of ready state IDs.
         """
 
-        for idx, frame in enumerate(frames):
-            state = self.get_state(frame.stream_id)
-            eou_detected = self.run_greedy_decoder(state, frame, log_probs[idx])
+        for idx, request in enumerate(requests):
+            state = self.get_state(request.stream_id)
+            eou_detected = self.run_greedy_decoder(state, request, log_probs[idx])
 
             if eou_detected:
                 self.bpe_decoder.decode_bpe_tokens(state)
                 state.cleanup_after_eou()
-                ready_state_ids.add(frame.stream_id)
+                ready_state_ids.add(request.stream_id)
 
             if tail_log_probs is not None:
                 last_token = state.label_buffer[-1] if len(state.label_buffer) > 0 else self.blank_id
@@ -274,15 +276,15 @@ def decode_log_probs(
 
     def cache_aware_transcribe_step(
         self,
-        frames: list[Frame | FeatureBuffer],
+        requests: list[Request],
         buffered_features: list[Tensor],
         right_paddings: list[int] | None,
        ready_state_ids: set,
         keep_all_outputs: bool = False,
     ) -> None:
         """
         Cache Aware Transcribe Step
-        It receives a list of frames (Frame or FeatureBuffer) and features and do the following:
+        It receives a list of requests (Frame or FeatureBuffer) and features and do the following:
 
         1. Preprocess the features by stacking them and computing the lengths
         2. Get the context and mapping from the context manager for cache aware streaming
@@ -291,16 +293,16 @@ def cache_aware_transcribe_step(
         5. Decode the log probabilities and update the state
 
         Args:
-            frames: (list[Frame | FeatureBuffer]) List of frames or feature buffers to transcribe.
+            requests: (list[Request]) List of requests (frames or feature buffers) to transcribe.
             buffered_features: (list[Tensor]) List of buffered features.
             right_paddings: (list[int] | None) List of right paddings.
             ready_state_ids: (set) Set of ready state IDs.
             keep_all_outputs: (bool) Whether to keep all outputs or not.
         """
         feature_buffers, feature_buffer_lens = self.preprocess(buffered_features, right_paddings)
 
-        stream_ids = [frame.stream_id for frame in frames]
-        eos_flags = [frame.is_last for frame in frames]
+        stream_ids = [request.stream_id for request in requests]
+        eos_flags = [request.is_last for request in requests]
         context, mapping = self.context_manager.get_context(stream_ids)
 
         drop_extra_pre_encoded = 0 if not self.use_cache else self.asr_model.drop_extra_pre_encoded
@@ -319,7 +321,7 @@ def cache_aware_transcribe_step(
         log_probs = normalize_log_probs(log_probs)
         self.context_manager.update_cache(stream_ids, new_context, mapping)
         self.context_manager.reset_slots(stream_ids, eos_flags)
-        self.decode_log_probs(frames, log_probs, tail_log_probs, ready_state_ids)
+        self.decode_log_probs(requests, log_probs, tail_log_probs, ready_state_ids)
 
     def transcribe_step_for_frames(self, frames: list[Frame]) -> None:
         """
```

nemo/collections/asr/inference/pipelines/cache_aware_rnnt_pipeline.py

Lines changed: 17 additions & 18 deletions

```diff
@@ -29,7 +29,7 @@
 from nemo.collections.asr.inference.streaming.decoders.greedy.greedy_rnnt_decoder import RNNTGreedyDecoder
 from nemo.collections.asr.inference.streaming.endpointing.greedy.greedy_rnnt_endpointing import RNNTGreedyEndpointing
 from nemo.collections.asr.inference.streaming.framing.multi_stream import ContinuousBatchedRequestStreamer
-from nemo.collections.asr.inference.streaming.framing.request import FeatureBuffer, Frame
+from nemo.collections.asr.inference.streaming.framing.request import FeatureBuffer, Frame, Request
 from nemo.collections.asr.inference.streaming.framing.request_options import ASRRequestOptions
 from nemo.collections.asr.inference.streaming.state.cache_aware_rnnt_state import CacheAwareRNNTStreamingState
 from nemo.collections.asr.inference.utils.endpointing_utils import millisecond_to_frames
@@ -231,18 +231,18 @@ def preprocess(self, buffers: list[Tensor], right_paddings: list[int] | None = N
         return feature_buffers, feature_buffer_lens
 
     def run_greedy_decoder(
-        self, state: CacheAwareRNNTStreamingState, frame: Frame | FeatureBuffer, hyp: Hypothesis
+        self, state: CacheAwareRNNTStreamingState, request: Request, hyp: Hypothesis
     ) -> bool:
         """
         Run the greedy RNNT decoder on the hypothesis and update the state
         Args:
             state: (CacheAwareRNNTStreamingState) The state of the stream
-            frame: (Frame | FeatureBuffer) The current frame or feature buffer
-            hyp: (Hypothesis) The hypothesis of the current frame
+            request: (Request) The current request (frame or feature buffer)
+            hyp: (Hypothesis) The hypothesis of the current request
         Returns:
             (bool) Whether EOU is detected.
         """
-        eou_detected = frame.is_last
+        eou_detected = request.is_last
         cur_output, cur_labels, new_offset = self.greedy_rnnt_decoder(
             global_timestamps=hyp.timestamp,
             tokens=hyp.y_sequence,
@@ -266,15 +266,15 @@ def run_greedy_decoder(
 
     def cache_aware_transcribe_step(
         self,
-        frames: list[Frame | FeatureBuffer],
+        requests: list[Request],
         features: list[Tensor],
         right_paddings: list[int],
         ready_state_ids: set,
         keep_all_outputs: bool = False,
     ) -> None:
         """
         Cache Aware Transcribe Step
-        It receives a list of frames (Frame or FeatureBuffer) and features and do the following:
+        It receives a list of requests (Frame or FeatureBuffer) and features and do the following:
 
         1. Preprocess the features by stacking them and computing the lengths
         2. Collecting previous hypotheses for stateful decoding
@@ -285,7 +285,7 @@ def cache_aware_transcribe_step(
         7. Perform greedy RNNT decoding to get the best hypothesis and update the states
         8. Update the ready states to indicate that the state is ready for text post-processing
         Args:
-            frames: (list[Frame | FeatureBuffer]) List of frames or feature buffers to transcribe.
+            requests: (list[Request]) List of requests (frames or feature buffers) to transcribe.
             features: (list[Tensor]) List of feature buffers.
             right_paddings: (list[int] | None) List of right paddings.
             ready_state_ids: (set) Set of ready state IDs.
@@ -294,10 +294,10 @@ def cache_aware_transcribe_step(
 
         feature_buffers, feature_buffer_lens = self.preprocess(features, right_paddings)
         states, stream_ids, eos_flags = [], [], []
-        for frame in frames:
-            states.append(self.get_state(frame.stream_id))
-            stream_ids.append(frame.stream_id)
-            eos_flags.append(frame.is_last)
+        for request in requests:
+            states.append(self.get_state(request.stream_id))
+            stream_ids.append(request.stream_id)
+            eos_flags.append(request.is_last)
 
         previous_hypotheses = [state.get_previous_hypothesis() for state in states]
         context, mapping = self.context_manager.get_context(stream_ids)
@@ -324,20 +324,19 @@ def cache_aware_transcribe_step(
         self.context_manager.reset_slots(stream_ids, eos_flags)
 
         # update the previous hypothesis and reset the previous hypothesis for the streams that has ended
-        for i, (state, hyp, eos) in enumerate(zip(states, best_hyp, eos_flags)):
-            hyp_len = len(hyp.y_sequence) if hyp is not None and hasattr(hyp, 'y_sequence') else 0
+        for state, hyp, eos in zip(states, best_hyp, eos_flags):
             if eos:
                 state.reset_previous_hypothesis()
             else:
                 state.set_previous_hypothesis(hyp)
 
-        # run greedy decoder for each frame-state-hypothesis tuple
-        for frame, state, hyp in zip(frames, states, best_hyp):
-            eou_detected = self.run_greedy_decoder(state, frame, hyp)
+        # run greedy decoder for each request-state-hypothesis tuple
+        for request, state, hyp in zip(requests, states, best_hyp):
+            eou_detected = self.run_greedy_decoder(state, request, hyp)
             if eou_detected:
                 self.bpe_decoder.decode_bpe_tokens(state)
                 state.cleanup_after_eou()
-                ready_state_ids.add(frame.stream_id)
+                ready_state_ids.add(request.stream_id)
 
     def transcribe_step_for_feature_buffers(self, fbuffers: list[FeatureBuffer]) -> None:
         """
```
