Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions benchmarks/decoders/gpu_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,17 @@ def decode_full_video(video_path, decode_device_string, resize_device_string):
num_threads = None
if "cuda" in decode_device_string:
num_threads = 1
width = None
height = None

resize_spec = ""
if "native" in resize_device_string:
width = RESIZED_WIDTH
height = RESIZED_HEIGHT
resize_spec = f"resize, {RESIZED_HEIGHT}, {RESIZED_WIDTH}"

torchcodec._core._add_video_stream(
decoder,
stream_index=-1,
device=decode_device_string,
num_threads=num_threads,
width=width,
height=height,
transform_specs=resize_spec,
)

start_time = time.time()
Expand Down
96 changes: 71 additions & 25 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
m.def(
"_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
m.def(
"_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
"_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
m.def(
"add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
"add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
m.def(
"add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
Expand Down Expand Up @@ -183,6 +183,69 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
}
}

// Converts str to an int and validates that it is strictly positive.
// Raises (via TORCH_CHECK) with a descriptive message when the string is
// not parseable as an integer, does not fit in an int, or is not > 0.
// Used to validate numeric parameters inside transform specs.
int checkedToPositiveInt(const std::string& str) {
  int ret = 0;
  try {
    // Note: std::stoi skips leading whitespace, so tokens like " 100"
    // produced by splitting "resize, 100, 100" on ',' parse cleanly.
    ret = std::stoi(str);
  } catch (const std::invalid_argument&) {
    TORCH_CHECK(false, "String cannot be converted to an int: " + str);
  } catch (const std::out_of_range&) {
    TORCH_CHECK(false, "String would become integer out of range: " + str);
  }
  TORCH_CHECK(ret > 0, "String must be a positive integer: " + str);
  return ret;
}

// Resize transform specs take the form:
//
// "resize, <height>, <width>"
//
// Where "resize" is the string literal and <height> and <width> are positive
// integers.
// Resize transform specs take the form:
//
//   "resize, <height>, <width>"
//
// where "resize" is a string literal and <height> and <width> are positive
// integers. The spec arrives pre-split on ',', so element 0 is the name and
// elements 1 and 2 are the dimensions. Returns a heap-allocated transform;
// ownership passes to the caller.
Transform* makeResizeTransform(
    const std::vector<std::string>& resizeTransformSpec) {
  TORCH_CHECK(
      resizeTransformSpec.size() == 3,
      "resizeTransformSpec must have 3 elements including its name");
  const int outputHeight = checkedToPositiveInt(resizeTransformSpec[1]);
  const int outputWidth = checkedToPositiveInt(resizeTransformSpec[2]);
  return new ResizeTransform(FrameDims(outputHeight, outputWidth));
}

// Splits str on delimiter. Matches std::getline tokenization semantics:
// an empty input yields an empty vector, interior empty fields are kept
// ("a;;b" -> {"a", "", "b"}), and a trailing delimiter does NOT produce a
// trailing empty token ("a;" -> {"a"}, ";" -> {""}).
std::vector<std::string> split(const std::string& str, char delimiter) {
  std::vector<std::string> pieces;
  std::size_t begin = 0;
  for (std::size_t end = str.find(delimiter); end != std::string::npos;
       end = str.find(delimiter, begin)) {
    pieces.push_back(str.substr(begin, end - begin));
    begin = end + 1;
  }
  // Only emit the final segment when it is non-empty; this drops the
  // phantom token after a trailing delimiter, exactly like getline.
  if (begin < str.size()) {
    pieces.push_back(str.substr(begin));
  }
  return pieces;
}

// The transformSpecsRaw string is always in the format:
//
// "name1, param1, param2, ...; name2, param1, param2, ...; ..."
//
// Where "nameX" is the name of the transform, and "paramX" are the parameters.
// Parses transformSpecsRaw into a vector of heap-allocated Transforms.
// The raw string is always in the format:
//
//   "name1, param1, param2, ...; name2, param1, param2, ...; ..."
//
// where "nameX" is the name of the transform and "paramX" are its
// parameters. An empty string yields an empty vector. Ownership of the
// returned pointers passes to the caller.
std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
  std::vector<Transform*> transforms;
  std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
  for (const std::string& transformSpecRaw : transformSpecs) {
    std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
    TORCH_CHECK(
        !transformSpec.empty(),
        "Invalid transform spec: " + transformSpecRaw);

    // Bind by reference: copying the name string gains nothing.
    const std::string& name = transformSpec[0];
    if (name == "resize") {
      transforms.push_back(makeResizeTransform(transformSpec));
    } else {
      TORCH_CHECK(false, "Invalid transform name: " + name);
    }
  }
  return transforms;
}

} // namespace

// ==============================
Expand Down Expand Up @@ -252,36 +315,18 @@ at::Tensor _create_from_file_like(

void _add_video_stream(
at::Tensor& decoder,
std::optional<int64_t> width = std::nullopt,
std::optional<int64_t> height = std::nullopt,
std::optional<int64_t> num_threads = std::nullopt,
std::optional<std::string_view> dimension_order = std::nullopt,
std::optional<int64_t> stream_index = std::nullopt,
std::string_view device = "cpu",
std::string_view device_variant = "default",
std::string_view transform_specs = "",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that we're using an empty spec, "", as the default rather than making it optional. I find this makes the code simpler and easier to reason about.

std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
custom_frame_mappings = std::nullopt,
std::optional<std::string_view> color_conversion_library = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.ffmpegThreadCount = num_threads;

// TODO: Eliminate this temporary bridge code. This exists because we have
// not yet exposed the transforms API on the Python side. We also want
// to remove the `width` and `height` arguments from the Python API.
//
// TEMPORARY BRIDGE CODE START
TORCH_CHECK(
width.has_value() == height.has_value(),
"width and height must both be set or unset.");
std::vector<Transform*> transforms;
if (width.has_value()) {
transforms.push_back(
new ResizeTransform(FrameDims(height.value(), width.value())));
width.reset();
height.reset();
}
// TEMPORARY BRIDGE CODE END

if (dimension_order.has_value()) {
std::string stdDimensionOrder{dimension_order.value()};
TORCH_CHECK(stdDimensionOrder == "NHWC" || stdDimensionOrder == "NCHW");
Expand Down Expand Up @@ -309,6 +354,9 @@ void _add_video_stream(
videoStreamOptions.device = torch::Device(std::string(device));
videoStreamOptions.deviceVariant = device_variant;

std::vector<Transform*> transforms =
makeTransforms(std::string(transform_specs));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An example of how using a default empty spec makes things simpler than using an optional: we always call this function. If we have an empty spec, we just get back an empty vector.


std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
custom_frame_mappings.has_value()
? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
Expand All @@ -324,24 +372,22 @@ void _add_video_stream(
// Add a new video stream at `stream_index` using the provided options.
void add_video_stream(
at::Tensor& decoder,
std::optional<int64_t> width = std::nullopt,
std::optional<int64_t> height = std::nullopt,
std::optional<int64_t> num_threads = std::nullopt,
std::optional<std::string_view> dimension_order = std::nullopt,
std::optional<int64_t> stream_index = std::nullopt,
std::string_view device = "cpu",
std::string_view device_variant = "default",
std::string_view transform_specs = "",
const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
custom_frame_mappings = std::nullopt) {
_add_video_stream(
decoder,
width,
height,
num_threads,
dimension_order,
stream_index,
device,
device_variant,
transform_specs,
custom_frame_mappings);
}

Expand Down
6 changes: 2 additions & 4 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,13 +299,12 @@ def create_from_tensor_abstract(
def _add_video_stream_abstract(
decoder: torch.Tensor,
*,
width: Optional[int] = None,
height: Optional[int] = None,
num_threads: Optional[int] = None,
dimension_order: Optional[str] = None,
stream_index: Optional[int] = None,
device: str = "cpu",
device_variant: str = "default",
transform_specs: str = "",
custom_frame_mappings: Optional[
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
] = None,
Expand All @@ -318,13 +317,12 @@ def _add_video_stream_abstract(
def add_video_stream_abstract(
decoder: torch.Tensor,
*,
width: Optional[int] = None,
height: Optional[int] = None,
num_threads: Optional[int] = None,
dimension_order: Optional[str] = None,
stream_index: Optional[int] = None,
device: str = "cpu",
device_variant: str = "default",
transform_specs: str = "",
custom_frame_mappings: Optional[
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
] = None,
Expand Down
3 changes: 1 addition & 2 deletions src/torchcodec/_samplers/video_clip_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,7 @@ def forward(self, video_data: Tensor) -> Union[List[Any]]:
scan_all_streams_to_update_metadata(video_decoder)
add_video_stream(
video_decoder,
width=target_width,
height=target_height,
transform_specs=f"resize, {target_height}, {target_width}",
num_threads=self.decoder_args.num_threads,
)

Expand Down
60 changes: 51 additions & 9 deletions test/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,17 +631,15 @@ def test_color_conversion_library_with_scaling(
filtergraph_decoder = create_from_file(str(input_video.path))
_add_video_stream(
filtergraph_decoder,
width=target_width,
height=target_height,
transform_specs=f"resize, {target_height}, {target_width}",
color_conversion_library="filtergraph",
)
filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)

swscale_decoder = create_from_file(str(input_video.path))
_add_video_stream(
swscale_decoder,
width=target_width,
height=target_height,
transform_specs=f"resize, {target_height}, {target_width}",
color_conversion_library="swscale",
)
swscale_frame0, _, _ = get_next_frame(swscale_decoder)
Expand All @@ -655,7 +653,53 @@ def test_scaling_on_cuda_fails(self):
RuntimeError,
match="Transforms are only supported for CPU devices.",
):
add_video_stream(decoder, device="cuda", width=100, height=100)
add_video_stream(decoder, device="cuda", transform_specs="resize, 100, 100")

def test_transform_fails(self):
    # Each entry pairs an invalid transform_specs string with the error
    # message pattern the core stream setup is expected to raise.
    decoder = create_from_file(str(NASA_VIDEO.path))
    failing_cases = [
        # A lone ";" splits into empty specs, which are rejected outright.
        (";", "Invalid transform spec"),
        # A well-formed spec with an unknown transform name.
        ("invalid, 1, 2", "Invalid transform name"),
    ]
    for spec, error_pattern in failing_cases:
        with pytest.raises(RuntimeError, match=error_pattern):
            add_video_stream(decoder, transform_specs=spec)

def test_resize_transform_fails(self):
    # Each entry pairs a malformed resize spec with the error message
    # pattern expected from the core resize-spec validation.
    decoder = create_from_file(str(NASA_VIDEO.path))
    failing_cases = [
        # Too many parameters: resize takes exactly name + 2 dims.
        ("resize, 100, 100, 100", "must have 3 elements"),
        # Dimensions must be strictly positive.
        ("resize, -10, 100", "must be a positive integer"),
        ("resize, 100, 0", "must be a positive integer"),
        # Non-numeric dimension.
        ("resize, blah, 100", "cannot be converted to an int"),
        # Numeric but too large to fit in an int.
        ("resize, 100, 1000000000000", "out of range"),
    ]
    for spec, error_pattern in failing_cases:
        with pytest.raises(RuntimeError, match=error_pattern):
            add_video_stream(decoder, transform_specs=spec)

@pytest.mark.parametrize("dimension_order", ("NHWC", "NCHW"))
@pytest.mark.parametrize("color_conversion_library", ("filtergraph", "swscale"))
Expand Down Expand Up @@ -763,17 +807,15 @@ def test_color_conversion_library_with_generated_videos(
filtergraph_decoder = create_from_file(str(video_path))
_add_video_stream(
filtergraph_decoder,
width=target_width,
height=target_height,
transform_specs=f"resize, {target_height}, {target_width}",
color_conversion_library="filtergraph",
)
filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)

auto_decoder = create_from_file(str(video_path))
add_video_stream(
auto_decoder,
width=target_width,
height=target_height,
transform_specs=f"resize, {target_height}, {target_width}",
)
auto_frame0, _, _ = get_next_frame(auto_decoder)
assert_frames_equal(filtergraph_frame0, auto_frame0)
Expand Down
Loading