Skip to content

Commit c01e115

Browse files
authored
[FBcode->GH] Import fbcode changes (#9159)
1 parent f52c4f1 commit c01e115

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+278
-176
lines changed

examples/cpp/run_model.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ int main(int argc, const char* argv[]) {
2121
// Deserialize the ScriptModule from a file using torch::jit::load().
2222
model = torch::jit::load(argv[1]);
2323
std::cout << "Model loaded\n";
24-
} catch (const torch::Error& e) {
24+
} catch (const torch::Error&) {
2525
std::cout << "error loading the model.\n";
2626
return -1;
2727
} catch (const std::exception& e) {
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#import <UIKit/UIKit.h>
22

3-
@interface AppDelegate : UIResponder <UIApplicationDelegate>
3+
@interface AppDelegate : UIResponder<UIApplicationDelegate>
44

5-
@property(strong, nonatomic) UIWindow *window;
5+
@property(strong, nonatomic) UIWindow* window;
66

77
@end

ios/VisionTestApp/VisionTestApp/ModelRunner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ NS_ASSUME_NONNULL_BEGIN
55

66
@interface ModelRunner : NSObject
77

8-
+ (NSString* )run;
8+
+ (NSString*)run;
99
+ (BOOL)setUp;
1010

1111
@end

ios/VisionTestApp/VisionTestApp/ViewController.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@
33

44
@interface ViewController : UIViewController
55

6-
76
@end

references/depth/stereo/train.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pathlib import Path
55
from typing import List, Union
66

7-
import numpy as np
7+
import numpy.typing as npt
88
import torch
99
import torch.distributed as dist
1010
import torchvision.models.optical_flow
@@ -33,7 +33,7 @@ def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_ch
3333
return flow
3434

3535

36-
def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> np.ndarray:
36+
def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> npt.NDArray:
3737
"""Helper function to return a learning rate scheduler for CRE-stereo"""
3838
if args.decay_after_steps < args.warmup_steps:
3939
raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}")

references/depth/stereo/visualization.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import List
33

44
import numpy as np
5+
import numpy.typing as npt
56
import torch
67
from torch import Tensor
78
from torchvision.utils import make_grid
@@ -64,7 +65,7 @@ def make_training_sample_grid(
6465
disparities: Tensor,
6566
masks: Tensor,
6667
predictions: List[Tensor],
67-
) -> np.ndarray:
68+
) -> npt.NDArray:
6869
# detach images and renormalize to [0, 1]
6970
images_left = left_images.detach().cpu() * 0.5 + 0.5
7071
images_right = right_images.detach().cpu() * 0.5 + 0.5
@@ -84,7 +85,7 @@ def make_training_sample_grid(
8485

8586

8687
@torch.no_grad()
87-
def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> np.ndarray:
88+
def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> npt.NDArray:
8889
# the ground truth will be added as the right-most entry
8990
seq_len = len(predictions) + 1
9091
predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities]))

test/cpp/test_custom_operators.cpp

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88
TEST(test_custom_operators, nms) {
99
// make sure that the torchvision ops are visible to the jit interpreter
10-
auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::nms"));
10+
auto& ops = torch::jit::getAllOperatorsFor(
11+
torch::jit::Symbol::fromQualString("torchvision::nms"));
1112
ASSERT_EQ(ops.size(), 1);
1213

1314
auto& op = ops.front();
@@ -24,29 +25,35 @@ TEST(test_custom_operators, nms) {
2425

2526
at::Tensor output = vision::ops::nms(boxes, scores, thresh);
2627
ASSERT_TRUE(output_jit.allclose(output));
27-
2828
}
2929

3030
TEST(test_custom_operators, roi_align_visible) {
31-
// make sure that the torchvision ops are visible to the jit interpreter even if
32-
// not explicitly included
33-
auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::roi_align"));
31+
// make sure that the torchvision ops are visible to the jit interpreter even
32+
// if not explicitly included
33+
auto& ops = torch::jit::getAllOperatorsFor(
34+
torch::jit::Symbol::fromQualString("torchvision::roi_align"));
3435
ASSERT_EQ(ops.size(), 1);
3536

3637
auto& op = ops.front();
3738
ASSERT_EQ(op->schema().name(), "torchvision::roi_align");
3839

3940
torch::jit::Stack stack;
40-
float roi_data[] = {
41-
0., 0., 0., 5., 5.,
42-
0., 5., 5., 10., 10.
43-
};
44-
at::Tensor input = at::rand({1, 2, 10, 10}), rois = at::from_blob(roi_data, {2, 5});
41+
float roi_data[] = {0., 0., 0., 5., 5., 0., 5., 5., 10., 10.};
42+
at::Tensor input = at::rand({1, 2, 10, 10}),
43+
rois = at::from_blob(roi_data, {2, 5});
4544
double spatial_scale = 1.0;
4645
int64_t pooled_height = 3, pooled_width = 3, sampling_ratio = -1;
4746
bool aligned = true;
4847

49-
torch::jit::push(stack, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned);
48+
torch::jit::push(
49+
stack,
50+
input,
51+
rois,
52+
spatial_scale,
53+
pooled_height,
54+
pooled_width,
55+
sampling_ratio,
56+
aligned);
5057
op->getOperation()(stack);
5158
at::Tensor output_jit;
5259
torch::jit::pop(stack, output_jit);

torchvision/csrc/io/decoder/audio_stream.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "audio_stream.h"
22
#include <c10/util/Logging.h>
3-
#include <limits>
43
#include "util.h"
54

65
namespace ffmpeg {

torchvision/csrc/io/decoder/decoder.cpp

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,8 @@ bool Decoder::init(
285285
return false;
286286
}
287287

288+
avioCtx_->max_packet_size = params.maxEncodedBufferSize;
289+
288290
inputCtx_->pb = avioCtx_;
289291
inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO;
290292
}
@@ -382,7 +384,30 @@ bool Decoder::init(
382384
av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD);
383385
}
384386

387+
for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
388+
if (
389+
#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0
390+
inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO
391+
#else // FFMPEG 4.0+
392+
inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO
393+
#endif
394+
&& inputCtx_->streams[i]->duration > 0) {
395+
// There are at least two 1/r_frame_rates from the frame before the last
396+
// one until the video duration, let's prefer to set duration after the
397+
// frame before the last one, but as early as possible
398+
double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den /
399+
(double)inputCtx_->streams[i]->r_frame_rate.num -
400+
1 / (double)AV_TIME_BASE;
401+
videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration *
402+
inputCtx_->streams[i]->time_base.num /
403+
(double)inputCtx_->streams[i]->time_base.den -
404+
1000 * correction;
405+
break;
406+
}
407+
}
408+
385409
VLOG(1) << "Decoder initialized, log level: " << params_.logLevel;
410+
VLOG(1) << "Video duration: " << videoDurationMs_;
386411
return true;
387412
}
388413

@@ -590,13 +615,30 @@ int Decoder::getFrame(size_t workingTimeInMs) {
590615
result = 0;
591616

592617
av_packet_unref(avPacket);
618+
619+
if (params_.uniformSampling > 1) {
620+
if (doSeek_) {
621+
double duration =
622+
videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
623+
double step =
624+
(duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
625+
avformat_seek_file(
626+
inputCtx_,
627+
-1,
628+
static_cast<int64_t>(step * kFramesDecoded_) + 1,
629+
static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
630+
static_cast<int64_t>(step * (kFramesDecoded_ + 1)),
631+
0);
632+
++kFramesDecoded_;
633+
doSeek_ = false;
634+
}
635+
}
593636
}
594637

595638
av_packet_free(&avPacket);
596-
VLOG(2) << "Interrupted loop"
597-
<< ", interrupted_ " << interrupted_ << ", inRange_.any() "
598-
<< inRange_.any() << ", decodedFrame " << decodedFrame << ", result "
599-
<< result;
639+
VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_
640+
<< ", inRange_.any() " << inRange_.any() << ", decodedFrame "
641+
<< decodedFrame << ", result " << result;
600642

601643
// loop can be terminated, either by:
602644
// 1. explicitly interrupted
@@ -660,13 +702,35 @@ int Decoder::processPacket(
660702
startCondition = msg.header.pts >= params_.startOffset;
661703
}
662704
if (endInRange && startCondition) {
663-
*hasMsg = true;
664-
push(std::move(msg));
705+
*hasMsg = pushMsg(std::move(msg));
665706
}
666707
}
667708
return result;
668709
}
669710

711+
bool Decoder::pushMsg(DecoderOutputMessage&& msg) {
712+
pastDecodedPTS_ = currentDecodedPTS_;
713+
currentDecodedPTS_ = msg.header.pts;
714+
715+
if (params_.uniformSampling <= 1) {
716+
push(std::move(msg));
717+
return true;
718+
}
719+
720+
double duration =
721+
videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration;
722+
double step =
723+
(duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
724+
if (pastDecodedPTS_ < step * kFramesDecoded_ &&
725+
step * kFramesDecoded_ <= currentDecodedPTS_) {
726+
push(std::move(msg));
727+
doSeek_ = true;
728+
return true;
729+
}
730+
731+
return false;
732+
}
733+
670734
void Decoder::flushStreams() {
671735
VLOG(1) << "Flushing streams...";
672736
for (auto& stream : streams_) {
@@ -678,7 +742,7 @@ void Decoder::flushStreams() {
678742
params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
679743
inRange_.set(stream.second->getIndex(), endInRange);
680744
if (endInRange && msg.header.pts >= params_.startOffset) {
681-
push(std::move(msg));
745+
pushMsg(std::move(msg));
682746
} else {
683747
msg.payload.reset();
684748
}

torchvision/csrc/io/decoder/decoder.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class Decoder : public MediaDecoder {
5656
int* getPrintPrefix() {
5757
return &printPrefix;
5858
}
59+
double videoDurationMs_ = -1;
5960

6061
private:
6162
// mark below function for a proper invocation
@@ -76,6 +77,8 @@ class Decoder : public MediaDecoder {
7677
bool fastSeek = false);
7778
void flushStreams();
7879
void cleanUp();
80+
bool pushMsg(DecoderOutputMessage&&
81+
msg); // returns whether frame is passed to downstream
7982

8083
protected:
8184
DecoderParameters params_;
@@ -89,5 +92,9 @@ class Decoder : public MediaDecoder {
8992
AVIOContext* avioCtx_{nullptr};
9093
std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
9194
std::bitset<64> inRange_;
95+
int kFramesDecoded_{0};
96+
int64_t pastDecodedPTS_{-1};
97+
int64_t currentDecodedPTS_{-1};
98+
bool doSeek_{false};
9299
};
93100
} // namespace ffmpeg

0 commit comments

Comments
 (0)