pytorch · NicolasHug · Jul 31, 2025 · Jun 7, 2024 · Jun 7, 2024 · Jun 7, 2024
diff --git a/examples/cpp/run_model.cpp b/examples/cpp/run_model.cpp
@@ -21,7 +21,7 @@ int main(int argc, const char* argv[]) {
     // Deserialize the ScriptModule from a file using torch::jit::load().
     model = torch::jit::load(argv[1]);
     std::cout << "Model loaded\n";
-  } catch (const torch::Error& e) {
+  } catch (const torch::Error&) {
     std::cout << "error loading the model.\n";
     return -1;
   } catch (const std::exception& e) {

diff --git a/ios/VisionTestApp/VisionTestApp/AppDelegate.h b/ios/VisionTestApp/VisionTestApp/AppDelegate.h
@@ -1,7 +1,7 @@
 #import <UIKit/UIKit.h>
 
-@interface AppDelegate : UIResponder <UIApplicationDelegate>
+@interface AppDelegate : UIResponder<UIApplicationDelegate>
 
-@property(strong, nonatomic) UIWindow *window;
+@property(strong, nonatomic) UIWindow* window;
 
 @end
diff --git a/ios/VisionTestApp/VisionTestApp/ModelRunner.h b/ios/VisionTestApp/VisionTestApp/ModelRunner.h
@@ -5,7 +5,7 @@ NS_ASSUME_NONNULL_BEGIN
 
 @interface ModelRunner : NSObject
 
-+ (NSString* )run;
++ (NSString*)run;
 + (BOOL)setUp;
 
 @end

diff --git a/ios/VisionTestApp/VisionTestApp/ViewController.h b/ios/VisionTestApp/VisionTestApp/ViewController.h
@@ -3,5 +3,4 @@
 
 @interface ViewController : UIViewController
 
-
 @end
diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import List, Union
 
-import numpy as np
+import numpy.typing as npt
 import torch
 import torch.distributed as dist
 import torchvision.models.optical_flow
@@ -33,7 +33,7 @@ def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_ch
     return flow
 
 
-def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> np.ndarray:
+def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> npt.NDArray:
     """Helper function to return a learning rate scheduler for CRE-stereo"""
     if args.decay_after_steps < args.warmup_steps:
         raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}")

diff --git a/references/depth/stereo/visualization.py b/references/depth/stereo/visualization.py
@@ -2,6 +2,7 @@
 from typing import List
 
 import numpy as np
+import numpy.typing as npt
 import torch
 from torch import Tensor
 from torchvision.utils import make_grid
@@ -64,7 +65,7 @@ def make_training_sample_grid(
     disparities: Tensor,
     masks: Tensor,
     predictions: List[Tensor],
-) -> np.ndarray:
+) -> npt.NDArray:
     # detach images and renormalize to [0, 1]
     images_left = left_images.detach().cpu() * 0.5 + 0.5
     images_right = right_images.detach().cpu() * 0.5 + 0.5
@@ -84,7 +85,7 @@ def make_training_sample_grid(
 
 
 @torch.no_grad()
-def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> np.ndarray:
+def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> npt.NDArray:
     # right most we will be adding the ground truth
     seq_len = len(predictions) + 1
     predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities]))

diff --git a/test/cpp/test_custom_operators.cpp b/test/cpp/test_custom_operators.cpp
@@ -7,7 +7,8 @@
 
 TEST(test_custom_operators, nms) {
   // make sure that the torchvision ops are visible to the jit interpreter
-  auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::nms"));
+  auto& ops = torch::jit::getAllOperatorsFor(
+      torch::jit::Symbol::fromQualString("torchvision::nms"));
   ASSERT_EQ(ops.size(), 1);
 
   auto& op = ops.front();
@@ -24,29 +25,35 @@ TEST(test_custom_operators, nms) {
 
   at::Tensor output = vision::ops::nms(boxes, scores, thresh);
   ASSERT_TRUE(output_jit.allclose(output));
-
 }
 
 TEST(test_custom_operators, roi_align_visible) {
-  // make sure that the torchvision ops are visible to the jit interpreter even if
-  // not explicitly included
-  auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::roi_align"));
+  // make sure that the torchvision ops are visible to the jit interpreter even
+  // if not explicitly included
+  auto& ops = torch::jit::getAllOperatorsFor(
+      torch::jit::Symbol::fromQualString("torchvision::roi_align"));
   ASSERT_EQ(ops.size(), 1);
 
   auto& op = ops.front();
   ASSERT_EQ(op->schema().name(), "torchvision::roi_align");
 
   torch::jit::Stack stack;
-  float roi_data[] = {
-    0., 0., 0., 5., 5.,
-    0., 5., 5., 10., 10.
-  };
-  at::Tensor input = at::rand({1, 2, 10, 10}), rois = at::from_blob(roi_data, {2, 5});
+  float roi_data[] = {0., 0., 0., 5., 5., 0., 5., 5., 10., 10.};
+  at::Tensor input = at::rand({1, 2, 10, 10}),
+             rois = at::from_blob(roi_data, {2, 5});
   double spatial_scale = 1.0;
   int64_t pooled_height = 3, pooled_width = 3, sampling_ratio = -1;
   bool aligned = true;
 
-  torch::jit::push(stack, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned);
+  torch::jit::push(
+      stack,
+      input,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio,
+      aligned);
   op->getOperation()(stack);
   at::Tensor output_jit;
   torch::jit::pop(stack, output_jit);

diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp
@@ -1,6 +1,5 @@
 #include "audio_stream.h"
 #include <c10/util/Logging.h>
-#include <limits>
 #include "util.h"
 
 namespace ffmpeg {

diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
@@ -285,6 +285,8 @@ bool Decoder::init(
       return false;
     }
 
+    avioCtx_->max_packet_size = params.maxEncodedBufferSize;
+
     inputCtx_->pb = avioCtx_;
     inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO;
   }
@@ -382,7 +384,30 @@ bool Decoder::init(
     av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD);
   }
 
+  for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
+    if (
+#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
+        inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO
+#else // FFMPEG 4.0+
+        inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO
+#endif
+        && inputCtx_->streams[i]->duration > 0) {
+      // There are at least two 1/r_frame_rate intervals between the frame
+      // before the last one and the end of the video. Prefer to place the
+      // duration after the frame before the last one, but as early as possible.
+      double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den /
+              (double)inputCtx_->streams[i]->r_frame_rate.num -
+          1 / (double)AV_TIME_BASE;
+      videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration *
+              inputCtx_->streams[i]->time_base.num /
+              (double)inputCtx_->streams[i]->time_base.den -
+          1000 * correction;
+      break;
+    }
+  }
+
   VLOG(1) << "Decoder initialized, log level: " << params_.logLevel;
+  VLOG(1) << "Video duration: " << videoDurationMs_;
   return true;
 }
 
@@ -590,13 +615,30 @@ int Decoder::getFrame(size_t workingTimeInMs) {
     result = 0;
 
     av_packet_unref(avPacket);
+
+    if (params_.uniformSampling > 1) {
+      if (doSeek_) {
+        double duration =
+            videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
+        double step =
+            (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
+        avformat_seek_file(
+            inputCtx_,
+            -1,
+            static_cast<int64_t>(step * kFramesDecoded_) + 1,
+            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
+            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
+            0);
+        ++kFramesDecoded_;
+        doSeek_ = false;
+      }
+    }
   }
 
   av_packet_free(&avPacket);
-  VLOG(2) << "Interrupted loop"
-          << ", interrupted_ " << interrupted_ << ", inRange_.any() "
-          << inRange_.any() << ", decodedFrame " << decodedFrame << ", result "
-          << result;
+  VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_
+          << ", inRange_.any() " << inRange_.any() << ", decodedFrame "
+          << decodedFrame << ", result " << result;
 
   // loop can be terminated, either by:
   // 1. explicitly interrupted
@@ -660,13 +702,35 @@ int Decoder::processPacket(
       startCondition = msg.header.pts >= params_.startOffset;
     }
     if (endInRange && startCondition) {
-      *hasMsg = true;
-      push(std::move(msg));
+      *hasMsg = pushMsg(std::move(msg));
     }
   }
   return result;
 }
 
+bool Decoder::pushMsg(DecoderOutputMessage&& msg) { // true if msg was pushed
+  pastDecodedPTS_ = currentDecodedPTS_; // keep the previously seen PTS
+  currentDecodedPTS_ = msg.header.pts;
+
+  if (params_.uniformSampling <= 1) { // <= 1: forward every message
+    push(std::move(msg));
+    return true;
+  }
+
+  double duration = // use expectedDuration unless videoDurationMs_ > 0
+      videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
+  double step = // sample spacing; AV_TIME_BASE units if duration is in ms
+      (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
+  if (pastDecodedPTS_ < step * kFramesDecoded_ && // target in (past, current]
+      step * kFramesDecoded_ <= currentDecodedPTS_) {
+    push(std::move(msg));
+    doSeek_ = true; // getFrame() then seeks ahead and bumps kFramesDecoded_
+    return true;
+  }
+
+  return false; // msg was not pushed
+}
+
 void Decoder::flushStreams() {
   VLOG(1) << "Flushing streams...";
   for (auto& stream : streams_) {
@@ -678,7 +742,7 @@ void Decoder::flushStreams() {
           params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
       inRange_.set(stream.second->getIndex(), endInRange);
       if (endInRange && msg.header.pts >= params_.startOffset) {
-        push(std::move(msg));
+        pushMsg(std::move(msg));
       } else {
         msg.payload.reset();
       }

diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
@@ -56,6 +56,7 @@ class Decoder : public MediaDecoder {
   int* getPrintPrefix() {
     return &printPrefix;
   }
+  double videoDurationMs_ = -1;
 
  private:
   // mark below function for a proper invocation
@@ -76,6 +77,8 @@ class Decoder : public MediaDecoder {
       bool fastSeek = false);
   void flushStreams();
   void cleanUp();
+  bool pushMsg(DecoderOutputMessage&&
+                   msg); // returns whether frame is passed to downstream
 
  protected:
   DecoderParameters params_;
@@ -89,5 +92,9 @@ class Decoder : public MediaDecoder {
   AVIOContext* avioCtx_{nullptr};
   std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
   std::bitset<64> inRange_;
+  int kFramesDecoded_{0};
+  int64_t pastDecodedPTS_{-1};
+  int64_t currentDecodedPTS_{-1};
+  bool doSeek_{false};
 };
 } // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h
@@ -219,6 +219,17 @@ struct DecoderParameters {
   // it is dispersed into the stream, but will increase latency. Must be an
   // integer not lesser than 32. It is 5000000 by default.
   int64_t probeSize{5000000};
+
+  // Expected duration of the video to be decoded, mainly used with uniform
+  // sampling
+  float expectedDuration{0.0f};
+
+  // Sample N key-frames from the video roughly uniformly across the timeline
+  int uniformSampling{0};
+
+  // With 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames.
+  // Override this with a bigger buffer size if needed.
+  int64_t maxEncodedBufferSize{0};
 };
 
 struct DecoderHeader {

diff --git a/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp b/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp
@@ -46,8 +46,8 @@ void GPUDecoder::seek(double timestamp, bool keyframes_only) {
   demuxer.seek(timestamp, flag);
 }
 
-c10::Dict<std::string, c10::Dict<std::string, double>> GPUDecoder::
-    get_metadata() const {
+c10::Dict<std::string, c10::Dict<std::string, double>>
+GPUDecoder::get_metadata() const {
   c10::Dict<std::string, c10::Dict<std::string, double>> metadata;
   c10::Dict<std::string, double> video_metadata;
   video_metadata.insert("duration", demuxer.get_duration());

diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp
@@ -54,7 +54,7 @@ DecoderInCallback MemoryBuffer::getCallback(
   MemoryBuffer object(buffer, size);
   return
       [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-          -> int {
+      -> int {
         if (out) { // see defs.h file
           // read mode
           return object.read(out, size);

diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp
@@ -1,6 +1,5 @@
 #include "stream.h"
 #include <c10/util/Logging.h>
-#include <stdio.h>
 #include <string.h>
 #include "util.h"
 

diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp
@@ -1,6 +1,5 @@
 #include "subtitle_stream.h"
 #include <c10/util/Logging.h>
-#include <limits>
 #include "util.h"
 
 namespace ffmpeg {

diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp
@@ -158,8 +158,7 @@ void runDecoder(SyncDecoder& decoder) {
       AVSubtitle sub;
       memset(&sub, 0, sizeof(sub));
       EXPECT_TRUE(Util::deserialize(*out.payload, &sub));
-      LOG(INFO) << "Found subtitles"
-                << ", num rects: " << sub.num_rects;
+      LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects;
       for (int i = 0; i < sub.num_rects; ++i) {
         std::string text = "picture";
         if (sub.rects[i]->type == SUBTITLE_TEXT) {
@@ -210,9 +209,9 @@ TEST(SyncDecoder, TestSyncDecoderPerformance) {
   auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8);
   auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8);
   auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4);
-  LOG(INFO) << "Clip decoding (us)"
-            << ", new(4x2): " << new4x2 << ", new(8x8): " << new8x8
-            << ", new(16x8): " << new16x8 << ", new(32x4): " << new32x4;
+  LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2
+            << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8
+            << ", new(32x4): " << new32x4;
 }
 
 TEST(SyncDecoder, Test) {
@@ -361,7 +360,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) {
   CHECK(decoder.init(
       params,
       [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-          -> int {
+      -> int {
         if (out) { // see defs.h file
           // read mode
           return object.read(out, size);
@@ -401,7 +400,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) {
   CHECK(!decoder.init(
       params,
       [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
-          -> int {
+      -> int {
         if (out) { // see defs.h file
           // read mode
           return object.read(out, size);
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,5 +3,4 @@

		@interface ViewController : UIViewController


		@end