pytorch
diff --git a/examples/cpp/run_model.cpp b/examples/cpp/run_model.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/ios/VisionTestApp/VisionTestApp/AppDelegate.h b/ios/VisionTestApp/VisionTestApp/AppDelegate.h
Lines changed: 2 additions & 2 deletions
diff --git a/ios/VisionTestApp/VisionTestApp/ModelRunner.h b/ios/VisionTestApp/VisionTestApp/ModelRunner.h
Lines changed: 1 addition & 1 deletion
diff --git a/ios/VisionTestApp/VisionTestApp/ViewController.h b/ios/VisionTestApp/VisionTestApp/ViewController.h
Lines changed: 0 additions & 1 deletion
diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py
Lines changed: 2 additions & 2 deletions
diff --git a/references/depth/stereo/visualization.py b/references/depth/stereo/visualization.py
Lines changed: 3 additions & 2 deletions
diff --git a/test/cpp/test_custom_operators.cpp b/test/cpp/test_custom_operators.cpp
Lines changed: 18 additions & 11 deletions
diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp
Lines changed: 0 additions & 1 deletion
diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp
Lines changed: 71 additions & 7 deletions
diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
Lines changed: 7 additions & 0 deletions
@@ -21,7 +21,7 @@ int main(int argc, const char* argv[]) {
     // Deserialize the ScriptModule from a file using torch::jit::load().
     model = torch::jit::load(argv[1]);
     std::cout << "Model loaded\n";
-  } catch (const torch::Error& e) {
+  } catch (const torch::Error&) {
     std::cout << "error loading the model.\n";
     return -1;
   } catch (const std::exception& e) {
 
@@ -1,7 +1,7 @@
 #import <UIKit/UIKit.h>
 
-@interface AppDelegate : UIResponder <UIApplicationDelegate>
+@interface AppDelegate : UIResponder<UIApplicationDelegate>
 
-@property(strong, nonatomic) UIWindow *window;
+@property(strong, nonatomic) UIWindow* window;
 
 @end
@@ -5,7 +5,7 @@ NS_ASSUME_NONNULL_BEGIN
 
 @interface ModelRunner : NSObject
 
-+ (NSString* )run;
++ (NSString*)run;
 + (BOOL)setUp;
 
 @end
 
@@ -3,5 +3,4 @@
 
 @interface ViewController : UIViewController
 
-
 @end
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import List, Union
 
-import numpy as np
+import numpy.typing as npt
 import torch
 import torch.distributed as dist
 import torchvision.models.optical_flow
@@ -33,7 +33,7 @@ def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_ch
     return flow
 
 
-def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> np.ndarray:
+def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> npt.NDArray:
     """Helper function to return a learning rate scheduler for CRE-stereo"""
     if args.decay_after_steps < args.warmup_steps:
         raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}")
 
@@ -2,6 +2,7 @@
 from typing import List
 
 import numpy as np
+import numpy.typing as npt
 import torch
 from torch import Tensor
 from torchvision.utils import make_grid
@@ -64,7 +65,7 @@ def make_training_sample_grid(
     disparities: Tensor,
     masks: Tensor,
     predictions: List[Tensor],
-) -> np.ndarray:
+) -> npt.NDArray:
     # detach images and renormalize to [0, 1]
     images_left = left_images.detach().cpu() * 0.5 + 0.5
     images_right = right_images.detach().cpu() * 0.5 + 0.5
@@ -84,7 +85,7 @@ def make_training_sample_grid(
 
 
 @torch.no_grad()
-def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> np.ndarray:
+def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> npt.NDArray:
     # right most we will be adding the ground truth
     seq_len = len(predictions) + 1
     predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities]))
 
@@ -7,7 +7,8 @@
 
 TEST(test_custom_operators, nms) {
   // make sure that the torchvision ops are visible to the jit interpreter
-  auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::nms"));
+  auto& ops = torch::jit::getAllOperatorsFor(
+      torch::jit::Symbol::fromQualString("torchvision::nms"));
   ASSERT_EQ(ops.size(), 1);
 
   auto& op = ops.front();
@@ -24,29 +25,35 @@ TEST(test_custom_operators, nms) {
 
   at::Tensor output = vision::ops::nms(boxes, scores, thresh);
   ASSERT_TRUE(output_jit.allclose(output));
-
 }
 
 TEST(test_custom_operators, roi_align_visible) {
-  // make sure that the torchvision ops are visible to the jit interpreter even if
-  // not explicitly included
-  auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::roi_align"));
+  // make sure that the torchvision ops are visible to the jit interpreter even
+  // if not explicitly included
+  auto& ops = torch::jit::getAllOperatorsFor(
+      torch::jit::Symbol::fromQualString("torchvision::roi_align"));
   ASSERT_EQ(ops.size(), 1);
 
   auto& op = ops.front();
   ASSERT_EQ(op->schema().name(), "torchvision::roi_align");
 
   torch::jit::Stack stack;
-  float roi_data[] = {
-    0., 0., 0., 5., 5.,
-    0., 5., 5., 10., 10.
-  };
-  at::Tensor input = at::rand({1, 2, 10, 10}), rois = at::from_blob(roi_data, {2, 5});
+  float roi_data[] = {0., 0., 0., 5., 5., 0., 5., 5., 10., 10.};
+  at::Tensor input = at::rand({1, 2, 10, 10}),
+             rois = at::from_blob(roi_data, {2, 5});
   double spatial_scale = 1.0;
   int64_t pooled_height = 3, pooled_width = 3, sampling_ratio = -1;
   bool aligned = true;
 
-  torch::jit::push(stack, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned);
+  torch::jit::push(
+      stack,
+      input,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio,
+      aligned);
   op->getOperation()(stack);
   at::Tensor output_jit;
   torch::jit::pop(stack, output_jit);
 
@@ -1,6 +1,5 @@
 #include "audio_stream.h"
 #include <c10/util/Logging.h>
-#include <limits>
 #include "util.h"
 
 namespace ffmpeg {
 
@@ -285,6 +285,8 @@ bool Decoder::init(
       return false;
     }
 
+    avioCtx_->max_packet_size = params.maxEncodedBufferSize;
+
     inputCtx_->pb = avioCtx_;
     inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO;
   }
@@ -382,7 +384,30 @@ bool Decoder::init(
     av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD);
   }
 
+  for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
+    if (
+#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
+        inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO
+#else // FFMPEG 4.0+
+        inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO
+#endif
+        && inputCtx_->streams[i]->duration > 0) {
+      // There are at least two 1/r_frame_rate intervals between the frame
+      // before the last one and the video duration; prefer to set the duration
+      // after the frame before the last one, but as early as possible
+      double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den /
+              (double)inputCtx_->streams[i]->r_frame_rate.num -
+          1 / (double)AV_TIME_BASE;
+      videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration *
+              inputCtx_->streams[i]->time_base.num /
+              (double)inputCtx_->streams[i]->time_base.den -
+          1000 * correction;
+      break;
+    }
+  }
+
   VLOG(1) << "Decoder initialized, log level: " << params_.logLevel;
+  VLOG(1) << "Video duration: " << videoDurationMs_;
   return true;
 }
 
@@ -590,13 +615,30 @@ int Decoder::getFrame(size_t workingTimeInMs) {
     result = 0;
 
     av_packet_unref(avPacket);
+
+    if (params_.uniformSampling > 1) {
+      if (doSeek_) {
+        double duration =
+            videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
+        double step =
+            (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
+        avformat_seek_file(
+            inputCtx_,
+            -1,
+            static_cast<int64_t>(step * kFramesDecoded_) + 1,
+            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
+            static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
+            0);
+        ++kFramesDecoded_;
+        doSeek_ = false;
+      }
+    }
   }
 
   av_packet_free(&avPacket);
-  VLOG(2) << "Interrupted loop"
-          << ", interrupted_ " << interrupted_ << ", inRange_.any() "
-          << inRange_.any() << ", decodedFrame " << decodedFrame << ", result "
-          << result;
+  VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_
+          << ", inRange_.any() " << inRange_.any() << ", decodedFrame "
+          << decodedFrame << ", result " << result;
 
   // loop can be terminated, either by:
   // 1. explicitly interrupted
@@ -660,13 +702,35 @@ int Decoder::processPacket(
       startCondition = msg.header.pts >= params_.startOffset;
     }
     if (endInRange && startCondition) {
-      *hasMsg = true;
-      push(std::move(msg));
+      *hasMsg = pushMsg(std::move(msg));
     }
   }
   return result;
 }
 
+bool Decoder::pushMsg(DecoderOutputMessage&& msg) { // true iff msg reached push()
+  pastDecodedPTS_ = currentDecodedPTS_; // keep the previous message's pts
+  currentDecodedPTS_ = msg.header.pts;
+
+  if (params_.uniformSampling <= 1) { // no uniform sampling: forward everything
+    push(std::move(msg));
+    return true;
+  }
+
+  double duration = // videoDurationMs_ when positive, else the expected value
+      videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
+  double step = // duration * AV_TIME_BASE / 1000 over (uniformSampling - 1) gaps
+      (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
+  if (pastDecodedPTS_ < step * kFramesDecoded_ && // forward only when the target
+      step * kFramesDecoded_ <= currentDecodedPTS_) { // is in (past, current] pts
+    push(std::move(msg));
+    doSeek_ = true; // getFrame() seeks ahead and bumps kFramesDecoded_ when set
+    return true;
+  }
+
+  return false; // msg was not forwarded
+}
+
 void Decoder::flushStreams() {
   VLOG(1) << "Flushing streams...";
   for (auto& stream : streams_) {
@@ -678,7 +742,7 @@ void Decoder::flushStreams() {
           params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
       inRange_.set(stream.second->getIndex(), endInRange);
       if (endInRange && msg.header.pts >= params_.startOffset) {
-        push(std::move(msg));
+        pushMsg(std::move(msg));
       } else {
         msg.payload.reset();
       }
 
@@ -56,6 +56,7 @@ class Decoder : public MediaDecoder {
   int* getPrintPrefix() {
     return &printPrefix;
   }
+  double videoDurationMs_ = -1;
 
  private:
   // mark below function for a proper invocation
@@ -76,6 +77,8 @@ class Decoder : public MediaDecoder {
       bool fastSeek = false);
   void flushStreams();
   void cleanUp();
+  bool pushMsg(DecoderOutputMessage&&
+                   msg); // returns whether frame is passed to downstream
 
  protected:
   DecoderParameters params_;
@@ -89,5 +92,9 @@ class Decoder : public MediaDecoder {
   AVIOContext* avioCtx_{nullptr};
   std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
   std::bitset<64> inRange_;
+  int kFramesDecoded_{0};
+  int64_t pastDecodedPTS_{-1};
+  int64_t currentDecodedPTS_{-1};
+  bool doSeek_{false};
 };
 } // namespace ffmpeg
Original file line number	Diff line number	Diff line change
`@@ -3,5 +3,4 @@`
`3`	`3`
`4`	`4`	`@interface ViewController : UIViewController`
`5`	`5`
`6`		`-`
`7`	`6`	`@end`