Add enable_frame_num='sequence' mode to video readers. (#6237)

JanuszL · web-flow · commit 86c416d4b53b · 2026-03-06T07:38:26.000+01:00
- extends the `enable_frame_num` argument in both the legacy
  (`fn.readers.video`) and experimental (`fn.experimental.readers.video`)
  video reader operators from a boolean to a string enum, following the
  same convention as `out_of_bounds_policy`:
  * ``"none"``/`False` (default) - no frame number output (previous `False`)
  * ``"scalar"``/`True` - returns the index of the first frame in the decoded
    sequence as a scalar output with shape `(1,)` (previous `True`)
  * ``"sequence"`` - returns the frame index of each decoded frame as an
    additional output with shape `(F,)`; padded frames get index `-1`
- the `FrameNumPolicy` enum and `ParseFrameNumPolicy` helper are added to
  `video_utils.h` and shared by both readers.
- tests are added for the `sequence` mode covering basic stride behavior,
  constant-padding (``-1`` sentinel), and consistency between `"scalar"`
  and `"sequence"` outputs.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
diff --git a/dali/operators/video/legacy/reader/nvdecoder/sequencewrapper.h b/dali/operators/video/legacy/reader/nvdecoder/sequencewrapper.h
@@ -48,6 +48,7 @@ struct SequenceWrapper {
 
     timestamps.clear();
     timestamps.reserve(max_count);
+    frame_idxs.clear();
 
     if (!event_) {
       event_ = CUDAEvent::CreateWithFlags(cudaEventBlockingSync | cudaEventDisableTiming);
@@ -83,6 +84,7 @@ struct SequenceWrapper {
   int channels = -1;
   int label = -1;
   vector<double> timestamps;
+  vector<int> frame_idxs;
   int first_frame_idx = -1;
   DALIDataType dtype = DALI_NO_TYPE;
   std::function<void(void)> read_sample_f;
diff --git a/dali/operators/video/legacy/reader/video_reader_op.cc b/dali/operators/video/legacy/reader/video_reader_op.cc
@@ -108,9 +108,14 @@ sequence and a warning. This option is mutually exclusive with `filenames`
 and `file_root`.)code",
       std::string())
   .AddOptionalArg("enable_frame_num",
-      R"code(If the `file_list` or `filenames` argument is passed, returns the frame number
-output.)code",
-      false)
+      R"code(Determines what frame number information is returned as an additional output.
+Only available when `file_list` or `filenames` with `labels` is passed.
+
+* ``None`` or ``False`` (default): No frame number output.
+* ``"scalar"`` or ``True``: Returns the index of the first frame in the decoded sequence, shape ``(1,)``.
+* ``"sequence"``: Returns the frame index of each decoded frame, shape ``(F,)``. For padded
+  frames, the index is ``-1``.)code",
+      std::string("none"))
   .AddOptionalArg("enable_timestamps",
       R"code(If the `file_list` or `filenames` argument is passed, returns the timestamps
 output. )code",
diff --git a/dali/operators/video/legacy/reader/video_reader_op.h b/dali/operators/video/legacy/reader/video_reader_op.h
@@ -30,14 +30,15 @@ inline int VideoReaderOutputFn(const OpSpec &spec) {
   std::vector<std::string> file_names = spec.GetRepeatedArgument<std::string>("filenames");
   std::vector<int> labels;
   bool has_labels_arg = spec.TryGetRepeatedArgument(labels, "labels");
-  bool enable_frame_num = spec.GetArgument<bool>("enable_frame_num");
+  FrameNumPolicy frame_num_policy =
+      ParseFrameNumPolicy(spec.GetArgument<std::string>("enable_frame_num"));
   bool enable_timestamps = spec.GetArgument<bool>("enable_timestamps");
   int num_outputs = 1;
   if ((!file_names.empty() && has_labels_arg) || !file_root.empty() || !file_list.empty()) {
     ++num_outputs;
   }
   if (!file_list.empty() || !file_names.empty()) {
-    if (enable_frame_num) num_outputs++;
+    if (frame_num_policy != FrameNumPolicy::None) num_outputs++;
     if (enable_timestamps) num_outputs++;
   }
   return num_outputs;
@@ -51,9 +52,10 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper, SequenceWrapp
         filenames_(spec.GetRepeatedArgument<std::string>("filenames")),
         file_root_(spec.GetArgument<std::string>("file_root")),
         file_list_(spec.GetArgument<std::string>("file_list")),
-        enable_frame_num_(spec.GetArgument<bool>("enable_frame_num")),
+        frame_num_policy_(ParseFrameNumPolicy(spec.GetArgument<std::string>("enable_frame_num"))),
         enable_timestamps_(spec.GetArgument<bool>("enable_timestamps")),
         count_(spec.GetArgument<int>("sequence_length")),
+        stride_(spec.GetArgument<int>("stride")),
         channels_(spec.GetArgument<int>("channels")),
         dtype_(spec.GetArgument<DALIDataType>("dtype")) {
     DALIImageType image_type(spec.GetArgument<DALIImageType>("image_type"));
@@ -75,7 +77,7 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper, SequenceWrapp
 
     can_use_frames_timestamps_ = !file_list_.empty() || (!filenames_.empty() && has_labels_arg);
 
-    DALI_ENFORCE(can_use_frames_timestamps_ || !enable_frame_num_,
+    DALI_ENFORCE(can_use_frames_timestamps_ || frame_num_policy_ == FrameNumPolicy::None,
                  "frame numbers can be enabled only when "
                  "`file_list`, or `filenames` with `labels` argument are passed");
     DALI_ENFORCE(can_use_frames_timestamps_ || !enable_timestamps_,
@@ -99,7 +101,10 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper, SequenceWrapp
     label_shape_ = uniform_list_shape(max_batch_size_, {1});
 
     if (can_use_frames_timestamps_) {
-      if (enable_frame_num_) frame_num_shape_ = label_shape_;
+      if (frame_num_policy_ == FrameNumPolicy::Scalar)
+        frame_num_shape_ = label_shape_;
+      else if (frame_num_policy_ == FrameNumPolicy::Sequence)
+        frame_num_shape_ = uniform_list_shape(max_batch_size_, {count_});
       if (enable_timestamps_) timestamp_shape_ = uniform_list_shape(max_batch_size_, {count_});
     }
 
@@ -134,7 +139,7 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper, SequenceWrapp
       label_output_ = &ws.Output<GPUBackend>(output_index++);
       label_output_->Resize(label_shape_, DALI_INT32);
       if (can_use_frames_timestamps_) {
-        if (enable_frame_num_) {
+        if (frame_num_policy_ != FrameNumPolicy::None) {
           frame_num_output_ = &ws.Output<GPUBackend>(output_index++);
           frame_num_output_->Resize(frame_num_shape_, DALI_INT32);
         }
@@ -163,10 +168,28 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper, SequenceWrapp
       CUDA_CALL(
           cudaMemcpyAsync(label, &prefetched_video.label, sizeof(int), cudaMemcpyDefault, stream));
       if (can_use_frames_timestamps_) {
-        if (enable_frame_num_) {
+        if (frame_num_policy_ == FrameNumPolicy::Scalar) {
           auto *frame_num = frame_num_output_->mutable_tensor<int>(data_idx);
           CUDA_CALL(cudaMemcpyAsync(frame_num, &prefetched_video.first_frame_idx, sizeof(int),
                                     cudaMemcpyDefault, stream));
+        } else if (frame_num_policy_ == FrameNumPolicy::Sequence) {
+          // Compute per-frame frame indices from first_frame_idx and stride.
+          // Frames beyond the actual decoded count (padded frames) get index -1.
+          auto &idxs = prefetched_video.frame_idxs;
+          idxs.resize(count_);
+          for (int i = 0; i < count_; ++i) {
+            idxs[i] = (i < prefetched_video.count)
+                          ? (prefetched_video.first_frame_idx + i * stride_)
+                          : -1;
+          }
+          auto *frame_num_data = frame_num_output_->mutable_tensor<int>(data_idx);
+          frame_num_output_->type_info().Copy<GPUBackend, CPUBackend>(
+              frame_num_data,
+              std::nullopt,
+              idxs.data(),
+              std::nullopt,
+              idxs.size(),
+              stream);
         }
         if (enable_timestamps_) {
           auto *timestamp = timestamp_output_->mutable_tensor<double>(data_idx);
@@ -212,9 +235,10 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper, SequenceWrapp
   std::vector<int> labels_;
   std::string file_root_;
   std::string file_list_;
-  bool enable_frame_num_;
+  FrameNumPolicy frame_num_policy_;
   bool enable_timestamps_;
   int count_;
+  int stride_;
   int channels_;
 
   TensorListShape<> label_shape_;
diff --git a/dali/operators/video/legacy/reader/video_reader_op_test.cc b/dali/operators/video/legacy/reader/video_reader_op_test.cc
@@ -469,7 +469,7 @@ TEST_F(VIDEO_READER_TEST_CLASS, FrameLabels) {
                        .AddArg("device", "gpu")
                        .AddArg("random_shuffle", false)
                        .AddArg("sequence_length", sequence_length)
-                       .AddArg("enable_frame_num", true)
+                       .AddArg("enable_frame_num", "scalar")
                        .AddArg("image_type", DALI_YCbCr)
                        .AddArg("file_list", file_list_path)
                        .AddOutput("frames", StorageDevice::GPU)
@@ -510,7 +510,7 @@ TEST_F(VIDEO_READER_TEST_CLASS, FrameLabelsFilenames) {
           .AddArg("device", "gpu")
           .AddArg("random_shuffle", false)
           .AddArg("sequence_length", sequence_length)
-          .AddArg("enable_frame_num", true)
+          .AddArg("enable_frame_num", "scalar")
           .AddArg("image_type", DALI_YCbCr)
           .AddArg("filenames", std::vector<std::string>{testing::dali_extra_path() +
                                                         "/db/video/frame_num_timestamp/test.mp4"})
@@ -558,7 +558,7 @@ TEST_F(VIDEO_READER_TEST_CLASS, LabelsFilenames) {
           .AddArg("device", "gpu")
           .AddArg("random_shuffle", false)
           .AddArg("sequence_length", sequence_length)
-          .AddArg("enable_frame_num", true)
+          .AddArg("enable_frame_num", "scalar")
           .AddArg("image_type", DALI_YCbCr)
           .AddArg("filenames", std::vector<std::string>{testing::dali_extra_path() +
                                                         "/db/video/frame_num_timestamp/test.mp4"})
@@ -621,7 +621,7 @@ TEST_F(VIDEO_READER_TEST_CLASS, FrameLabelsWithFileListFrameNum) {
                        .AddArg("device", "gpu")
                        .AddArg("random_shuffle", false)
                        .AddArg("sequence_length", sequence_length)
-                       .AddArg("enable_frame_num", true)
+                       .AddArg("enable_frame_num", "scalar")
                        .AddArg("enable_timestamps", true)
                        .AddArg("file_list_frame_num", true)
                        .AddArg("file_list_format", "frames")  // equivalent to file_list_frame_num in the old decoder
@@ -702,7 +702,7 @@ TEST_F(VIDEO_READER_TEST_CLASS, TimestampLabels) {
                        .AddArg("random_shuffle", false)
                        .AddArg("sequence_length", sequence_length)
                        .AddArg("enable_timestamps", true)
-                       .AddArg("enable_frame_num", true)
+                       .AddArg("enable_frame_num", "scalar")
                        .AddArg("image_type", DALI_YCbCr)
                        .AddArg("file_list", file_list_path)
                        .AddOutput("frames", StorageDevice::GPU)
diff --git a/dali/operators/video/reader/video_reader_decoder_op.cc b/dali/operators/video/reader/video_reader_decoder_op.cc
@@ -98,7 +98,7 @@ struct VideoSample : public VideoSampleDesc {
   // to be filled by Prefetch
   Tensor<Backend> data_;
   std::vector<double> timestamps_;
-  std::vector<int64_t> frame_idx_;
+  std::vector<int32_t> frame_idx_;
 };
 
 enum class FileListFormat {
@@ -394,7 +394,7 @@ class VideoReaderDecoder
 
   explicit VideoReaderDecoder(const OpSpec &spec)
       : Base(spec),
-        has_frame_idx_(spec.GetArgument<bool>("enable_frame_num")),
+        frame_num_policy_(ParseFrameNumPolicy(spec.GetArgument<std::string>("enable_frame_num"))),
         has_timestamps_(spec.GetArgument<bool>("enable_timestamps")),
         boundary_type_(GetBoundaryType(spec)),
         image_type_(spec.GetArgument<DALIImageType>("image_type")) {
@@ -465,9 +465,16 @@ class VideoReaderDecoder
       output_desc.push_back({label_shape, DALI_INT32});
     }
 
-    if (has_frame_idx_) {
+    if (frame_num_policy_ == FrameNumPolicy::Scalar) {
       TensorListShape<1> frame_idx_shape = uniform_list_shape<1>(batch_size, {1});
       output_desc.push_back({frame_idx_shape, DALI_INT32});
+    } else if (frame_num_policy_ == FrameNumPolicy::Sequence) {
+      TensorListShape<1> frame_idx_shape(batch_size);
+      for (int sample_id = 0; sample_id < batch_size; ++sample_id) {
+        auto num_frames = GetSample(sample_id).data_.shape()[0];
+        frame_idx_shape.set_tensor_shape(sample_id, {num_frames});
+      }
+      output_desc.push_back({frame_idx_shape, DALI_INT32});
     }
 
     if (has_timestamps_) {
@@ -526,10 +533,14 @@ class VideoReaderDecoder
         return make_cspan(&s.video_file_meta_->label, 1);
       });
     }
-    if (has_frame_idx_) {
+    if (frame_num_policy_ == FrameNumPolicy::Scalar) {
       OutputMetadata<int32_t>(ws, out_index++, [](auto &s) {
         return make_cspan(&s.start_, 1);
       });
+    } else if (frame_num_policy_ == FrameNumPolicy::Sequence) {
+      OutputMetadata<int32_t>(ws, out_index++, [](auto &s) {
+        return make_cspan(s.frame_idx_);
+      });
     }
     if (has_timestamps_) {
       OutputMetadata<double>(ws, out_index++, [](auto &s) {
@@ -601,6 +612,17 @@ class VideoReaderDecoder
                << ", boundary_type=" << to_string(boundary_type_) << std::endl;
       int roi_start = sample->video_file_meta_->start_frame;
       int roi_end = sample->video_file_meta_->end_frame;
+      if (frame_num_policy_ == FrameNumPolicy::Sequence) {
+        sample->frame_idx_.resize(num_frames);
+        for (int64_t i = 0; i < num_frames; ++i) {
+          sample->frame_idx_[i] = static_cast<int32_t>(decoder_->HandleBoundary(
+              boundary_type_,
+              static_cast<int>(sample->start_ + i * sample->stride_),
+              roi_start, roi_end));
+        }
+      } else {
+        sample->frame_idx_.clear();
+      }
       if (roi_start != 0 || roi_end != decoder_->NumFrames()) {
         frame_idxs_.clear();
         for (int frame_idx = sample->start_; frame_idx < sample->end_;
@@ -626,7 +648,7 @@ class VideoReaderDecoder
   }
 
  private:
-  bool has_frame_idx_;
+  FrameNumPolicy frame_num_policy_;
   bool has_timestamps_;
   boundary::BoundaryType boundary_type_;
   DALIImageType image_type_;
@@ -658,22 +680,28 @@ The following codecs are supported by the GPU backend only:
 * AV1
 * MPEG-4
 
-The outputs of the operator are: video, [labels], [frame_idx], [timestamp].
+The outputs of the operator are: video, [labels], [frame_num], [timestamps].
 
 * ``video``: A sequence of frames with shape ``(F, H, W, C)`` where ``F`` is the number of frames in the sequence
   (can vary between samples), ``H`` is the frame height in pixels, ``W`` is the frame width in pixels, and ``C`` is
   the number of color channels.
 * ``labels``: Label associated with the sample. Only available when using ``labels`` with ``filenames``, or when
   using ``file_list`` or ``file_root``.
-* ``frame_idx``: Index of first frame in sequence. Only available when ``enable_frame_num=True``.
+* ``frame_num``: Frame number information. Shape and content depend on ``enable_frame_num``:
+
+  * ``"scalar"`` or ``True``: Index of the first frame in the decoded sequence, shape ``(1,)``.
+  * ``"sequence"``: Frame index of each decoded frame, shape ``(F,)``. Padded frames (e.g. when
+    using ``pad_mode='constant'``) have index ``-1``.
 * ``timestamps``: Time in seconds of each frame in the sequence. Only available when ``enable_timestamps=True``.
 )code")
     .NumInput(0)
     .OutputFn([](const OpSpec &spec) {
       bool has_labels = spec.HasArgument("labels") || spec.HasArgument("file_list") ||
                         spec.HasArgument("file_root");
-      return 1 + has_labels + spec.GetArgument<bool>("enable_frame_num") +
-             spec.GetArgument<bool>("enable_timestamps");
+      bool has_frame_num =
+          ParseFrameNumPolicy(spec.GetArgument<std::string>("enable_frame_num")) !=
+          FrameNumPolicy::None;
+      return 1 + has_labels + has_frame_num + spec.GetArgument<bool>("enable_timestamps");
     })
     .AddOptionalArg("filenames",
                     R"code(Absolute paths to the video files to load.
@@ -705,7 +733,7 @@ Default: ``timestamps``.)code",
         R"code(How to handle non-exact frame matches:
 
 * ``start_down_end_up`` (default): Round start down and end up
-* ``start_up_end_down``: Round start up and end down 
+* ``start_up_end_down``: Round start up and end down
 * ``all_up``: Round both up
 * ``all_down``: Round both down)code",
         "start_down_end_up")
@@ -717,9 +745,13 @@ Default: ``timestamps``.)code",
                                  nullptr)
     .AddArg("sequence_length", R"code(Frames to load per sequence.)code", DALI_INT32)
     .AddOptionalArg("enable_frame_num",
-                    R"code(If set, returns the index of the first frame in the decoded sequence
-as an additional output.)code",
-                    false)
+                    R"code(Determines what frame number information is returned as an additional output.
+
+* ``"none"`` or ``False`` (default): No frame number output.
+* ``"scalar"`` or ``True``: Returns the index of the first frame in the decoded sequence, shape ``(1,)``.
+* ``"sequence"``: Returns the frame index of each decoded frame, shape ``(F,)``. For padded
+  frames (e.g. when using ``pad_mode='constant'``), the index is ``-1``.)code",
+                    std::string("none"))
     .AddOptionalArg("enable_timestamps",
                     R"code(If set, returns the timestamp of the frames in the decoded sequence
 as an additional output.)code",
@@ -736,7 +768,7 @@ When the value is less than 0, `step` is set to `sequence_length`.)code",
         R"code(How to handle videos with insufficient frames when using start_frame/sequence_length/stride:
 
 * ``'none'``: Return shorter sequences if not enough frames: ABC -> ABC
-* ``'constant'``: Pad with a fixed value (specified by ``pad_value``): ABC -> ABCPPP  
+* ``'constant'``: Pad with a fixed value (specified by ``pad_value``): ABC -> ABCPPP
 * ``'edge'`` or ``'repeat'``: Repeat the last valid frame: ABC -> ABCCCC
 * ``'reflect_1001'`` or ``'symmetric'``: Reflect padding, including the last element: ABC -> ABCCBA
 * ``'reflect_101'`` or ``'reflect'``: Reflect padding, not including the last element: ABC -> ABCBA
@@ -747,7 +779,7 @@ Not relevant when using ``frames`` argument.)code",
                     R"code(Value(s) used to pad missing frames when ``pad_mode='constant'``'.
 
 Each value must be in range [0, 255].
-If a single value is provided, it will be used for all channels. 
+If a single value is provided, it will be used for all channels.
 Otherwise, the number of values must match the number of channels in the video.)code",
                     std::vector<int>{
                         0,
diff --git a/dali/operators/video/reader/video_reader_decoder_op_test.cc b/dali/operators/video/reader/video_reader_decoder_op_test.cc
@@ -137,7 +137,7 @@ class VideoReaderDecoderBaseTest : public VideoTestBase {
       .AddArg("device", backend)
       .AddArg("sequence_length", sequence_length)
       .AddArg("random_shuffle", true)
-      .AddArg("enable_frame_num", true)
+      .AddArg("enable_frame_num", "scalar")
       .AddArg("initial_fill", cfr_videos_[0].NumFrames())
       .AddArg(
         "filenames",
diff --git a/dali/operators/video/video_utils.h b/dali/operators/video/video_utils.h
@@ -59,6 +59,22 @@ std::vector<VideoFileMeta> GetVideoFiles(const std::string& file_root,
                                          const std::vector<int>& labels,
                                          const std::string& file_list);
 
+enum class FrameNumPolicy {
+  None,      // no frame number output
+  Scalar,    // first frame index as a scalar with shape (1,)
+  Sequence   // per-frame indices with shape (F,); padded frames get -1
+};
+
+inline FrameNumPolicy ParseFrameNumPolicy(const std::string &s) {
+  // "True"/"False" are the Python str(bool) representations, kept for backward compatibility
+  // with code that passes enable_frame_num=True/False (Python bools).
+  if (s == "none" || s == "False")   return FrameNumPolicy::None;
+  if (s == "scalar" || s == "True")  return FrameNumPolicy::Scalar;
+  if (s == "sequence")               return FrameNumPolicy::Sequence;
+  DALI_FAIL(make_string("Invalid enable_frame_num value: '", s,
+                        "'. Valid values are: 'none', 'scalar', 'sequence'."));
+}
+
 inline boundary::BoundaryType GetBoundaryType(const OpSpec &spec) {
   auto pad_mode_str = spec.template GetArgument<std::string>("pad_mode");
   boundary::BoundaryType boundary_type = boundary::BoundaryType::ISOLATED;
diff --git a/dali/test/python/decoder/test_video.py b/dali/test/python/decoder/test_video.py