Skip to content

Commit 89ada0a

Browse files
[video] make NVDec cache size adjustable (meta-pytorch#1246)
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>
1 parent 900979e commit 89ada0a

File tree

12 files changed

+311
-22
lines changed

12 files changed

+311
-22
lines changed

docs/source/api_ref_decoders.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_decoding_a
2525
:template: function.rst
2626

2727
set_cuda_backend
28+
set_nvdec_cache_capacity
29+
get_nvdec_cache_capacity
2830

2931
.. autosummary::
3032
:toctree: generated/

src/torchcodec/_core/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ function(make_torchcodec_libraries
137137
Transform.cpp
138138
Metadata.cpp
139139
SwScale.cpp
140+
NVDECCacheConfig.cpp
140141
)
141142

142143
if(ENABLE_CUDA)
@@ -163,9 +164,10 @@ function(make_torchcodec_libraries
163164
)
164165

165166
if(ENABLE_CUDA)
166-
# We have to define USE_CUDA because we rely on some APIs like
167-
# aoti_torch_get_current_cuda_stream, which are only exposed in torch
168-
# headers if is defined!
167+
# We define USE_CUDA to guard CUDA-specific code paths (e.g.
168+
# NVDECCache usage in NVDECCacheConfig.cpp) and because some torch
169+
# APIs like aoti_torch_get_current_cuda_stream are only exposed when
170+
# USE_CUDA is defined.
169171
# https://github.com/pytorch/pytorch/blob/98e36864e640023a716e058d894ea2d20e76e5f7/torch/csrc/inductor/aoti_torch/c/shim.h#L573-L602
170172
target_compile_definitions(${core_library_name} PRIVATE USE_CUDA)
171173
endif()

src/torchcodec/_core/NVDECCache.cpp

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "CUDACommon.h"
1010
#include "FFMPEGCommon.h"
1111
#include "NVDECCache.h"
12+
#include "NVDECCacheConfig.h"
1213

1314
#include <cuda_runtime.h> // For cudaGetDevice
1415

@@ -19,9 +20,13 @@ extern "C" {
1920

2021
namespace facebook::torchcodec {
2122

22-
NVDECCache& NVDECCache::getCache(const StableDevice& device) {
23+
NVDECCache* NVDECCache::getCacheInstances() {
2324
static NVDECCache cacheInstances[MAX_CUDA_GPUS];
24-
return cacheInstances[getDeviceIndex(device)];
25+
return cacheInstances;
26+
}
27+
28+
NVDECCache& NVDECCache::getCache(const StableDevice& device) {
29+
return getCacheInstances()[getDeviceIndex(device)];
2530
}
2631

2732
UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
@@ -39,6 +44,21 @@ UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
3944
return nullptr;
4045
}
4146

47+
// Evicts the least-recently-used entry from cache_.
48+
// Caller must hold cacheLock_ when calling this function.
49+
void NVDECCache::evictLRUEntry() {
50+
if (cache_.empty()) {
51+
return;
52+
}
53+
auto victim = cache_.begin();
54+
for (auto it = cache_.begin(); it != cache_.end(); ++it) {
55+
if (it->second.lastUsed < victim->second.lastUsed) {
56+
victim = it;
57+
}
58+
}
59+
cache_.erase(victim);
60+
}
61+
4262
void NVDECCache::returnDecoder(
4363
CUVIDEOFORMAT* videoFormat,
4464
UniqueCUvideodecoder decoder) {
@@ -47,25 +67,40 @@ void NVDECCache::returnDecoder(
4767
CacheKey key(videoFormat);
4868
std::lock_guard<std::mutex> lock(cacheLock_);
4969

50-
// Evict least recently used entry if at capacity.
51-
// This search is O(MAX_CACHE_SIZE) but MAX_CACHE_SIZE is always small, so
52-
// this isn't significant.
53-
if (cache_.size() >= MAX_CACHE_SIZE) {
54-
auto victim = cache_.begin();
55-
for (auto it = cache_.begin(); it != cache_.end(); ++it) {
56-
if (it->second.lastUsed < victim->second.lastUsed) {
57-
victim = it;
58-
}
59-
}
60-
cache_.erase(victim);
70+
int capacity = getNVDECCacheCapacity();
71+
if (capacity <= 0) {
72+
return;
73+
}
74+
75+
// Evict least recently used entries until under capacity.
76+
// This search is O(capacity); capacity is expected to be small,
77+
// so the linear-vs-constant lookup overhead is negligible.
78+
while (cache_.size() >= static_cast<size_t>(capacity)) {
79+
evictLRUEntry();
6180
}
6281

6382
// Add the decoder back to cache
6483
cache_.emplace(key, CacheEntry(std::move(decoder), lastUsedCounter_++));
6584

6685
STD_TORCH_CHECK(
67-
cache_.size() <= MAX_CACHE_SIZE,
68-
"Cache size exceeded maximum limit, please report a bug");
86+
cache_.size() <= static_cast<size_t>(capacity),
87+
"Cache size exceeded capacity, please report a bug");
88+
}
89+
90+
void NVDECCache::evictExcessEntriesAcrossDevices(int capacity) {
91+
NVDECCache* instances = getCacheInstances();
92+
for (int i = 0; i < MAX_CUDA_GPUS; ++i) {
93+
std::lock_guard<std::mutex> lock(instances[i].cacheLock_);
94+
while (instances[i].cache_.size() > static_cast<size_t>(capacity)) {
95+
instances[i].evictLRUEntry();
96+
}
97+
}
98+
}
99+
100+
int NVDECCache::getCacheSizeForDevice(int device_index) {
101+
NVDECCache* instances = getCacheInstances();
102+
std::lock_guard<std::mutex> lock(instances[device_index].cacheLock_);
103+
return static_cast<int>(instances[device_index].cache_.size());
69104
}
70105

71106
} // namespace facebook::torchcodec

src/torchcodec/_core/NVDECCache.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <cuda.h>
1414

1515
#include "NVCUVIDRuntimeLoader.h"
16+
#include "NVDECCacheConfig.h"
1617
#include "StableABICompat.h"
1718
#include "nvcuvid_include/cuviddec.h"
1819
#include "nvcuvid_include/nvcuvid.h"
@@ -56,6 +57,13 @@ class NVDECCache {
5657
// Return decoder to cache using LRU eviction.
5758
void returnDecoder(CUVIDEOFORMAT* videoFormat, UniqueCUvideodecoder decoder);
5859

60+
// Iterates over all per-device cache instances and evicts LRU entries until each
61+
// cache's size is at most capacity. Called from setNVDECCacheCapacity().
62+
static void evictExcessEntriesAcrossDevices(int capacity);
63+
64+
// Returns the number of entries in the cache for a given device index.
65+
static int getCacheSizeForDevice(int device_index);
66+
5967
private:
6068
// Cache key struct: a decoder can be reused and taken from the cache only if
6169
// all these parameters match.
@@ -103,12 +111,13 @@ class NVDECCache {
103111
NVDECCache() = default;
104112
~NVDECCache() = default;
105113

114+
void evictLRUEntry();
115+
116+
static NVDECCache* getCacheInstances();
117+
106118
std::multimap<CacheKey, CacheEntry> cache_;
107119
std::mutex cacheLock_;
108120
uint64_t lastUsedCounter_ = 0;
109-
110-
// Max number of cached decoders, per device
111-
static constexpr int MAX_CACHE_SIZE = 20;
112121
};
113122

114123
} // namespace facebook::torchcodec
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
//
4+
// This source code is licensed under the BSD-style license found in the
5+
// LICENSE file in the root directory of this source tree.
6+
7+
#include "NVDECCacheConfig.h"
8+
9+
#include <atomic>
10+
#include <mutex>
11+
12+
#include "c10/util/Exception.h"
13+
14+
#ifdef USE_CUDA
15+
#include "CUDACommon.h"
16+
#include "NVDECCache.h"
17+
#endif
18+
19+
namespace facebook::torchcodec {
20+
21+
static std::atomic<int> g_nvdecCacheCapacity{DEFAULT_NVDEC_CACHE_CAPACITY};
22+
// This mutex serializes setNVDECCacheCapacity() calls so that the atomic store
23+
// and the subsequent cache eviction happen as one unit. getNVDECCacheCapacity()
24+
// intentionally reads the atomic without this mutex: callers like
25+
// returnDecoder() may briefly see a stale value during an ongoing
26+
// setNVDECCacheCapacity(), which is acceptable because the worst case is a
27+
// single decoder being added back to the cache after eviction. That entry will
28+
// be consumed by a subsequent getDecoder() call or evicted by a future
29+
// returnDecoder() or setNVDECCacheCapacity() call.
30+
static std::mutex g_nvdecCacheCapacityMutex;
31+
32+
void setNVDECCacheCapacity(int capacity) {
33+
TORCH_CHECK(
34+
capacity >= 0,
35+
"NVDEC cache capacity must be non-negative, got ",
36+
capacity);
37+
std::lock_guard<std::mutex> lock(g_nvdecCacheCapacityMutex);
38+
g_nvdecCacheCapacity.store(capacity);
39+
#ifdef USE_CUDA
40+
NVDECCache::evictExcessEntriesAcrossDevices(capacity);
41+
#endif
42+
}
43+
44+
int getNVDECCacheCapacity() {
45+
return g_nvdecCacheCapacity.load();
46+
}
47+
48+
int getNVDECCacheSize([[maybe_unused]] int device_index) {
49+
#ifdef USE_CUDA
50+
TORCH_CHECK(
51+
device_index >= 0 && device_index < MAX_CUDA_GPUS,
52+
"device_index must be between 0 and ",
53+
MAX_CUDA_GPUS - 1,
54+
", got ",
55+
device_index);
56+
return NVDECCache::getCacheSizeForDevice(device_index);
57+
#else
58+
return 0;
59+
#endif
60+
}
61+
62+
} // namespace facebook::torchcodec
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
//
4+
// This source code is licensed under the BSD-style license found in the
5+
// LICENSE file in the root directory of this source tree.
6+
7+
#pragma once
8+
9+
// This header is intentionally CUDA-free so it can be included from
10+
// custom_ops.cpp which is compiled without CUDA headers.
11+
12+
namespace facebook::torchcodec {
13+
14+
// Default capacity of the per-device NVDEC decoder cache.
15+
// capacity == maximum number of cached instances allowed.
16+
constexpr int DEFAULT_NVDEC_CACHE_CAPACITY = 20;
17+
18+
// Set the capacity of the per-device NVDEC decoder cache.
19+
// capacity must be non-negative.
20+
void setNVDECCacheCapacity(int capacity);
21+
22+
// Get the current capacity of the per-device NVDEC decoder cache.
23+
int getNVDECCacheCapacity();
24+
25+
// Get the current number of entries in the NVDEC decoder cache for a device.
26+
// This is currently only used for tests, and not publicly exposed.
27+
// TODO: consider exposing this in the public API.
28+
int getNVDECCacheSize(int device_index);
29+
30+
} // namespace facebook::torchcodec

src/torchcodec/_core/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
_add_video_stream,
1818
_get_backend_details,
1919
_get_key_frame_indices,
20+
_get_nvdec_cache_size,
2021
_test_frame_pts_equality,
2122
add_audio_stream,
2223
add_video_stream,
@@ -42,6 +43,8 @@
4243
get_frames_in_range,
4344
get_json_metadata,
4445
get_next_frame,
46+
get_nvdec_cache_capacity,
4547
scan_all_streams_to_update_metadata,
4648
seek_to_pts,
49+
set_nvdec_cache_capacity,
4750
)

src/torchcodec/_core/custom_ops.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "AVIOFileLikeContext.h"
1313
#include "AVIOTensorContext.h"
1414
#include "Encoder.h"
15+
#include "NVDECCacheConfig.h"
1516
#include "SingleStreamDecoder.h"
1617
#include "StableABICompat.h"
1718
#include "ValidationUtils.h"
@@ -76,6 +77,9 @@ STABLE_TORCH_LIBRARY(torchcodec_ns, m) {
7677
m.def(
7778
"_test_frame_pts_equality(Tensor(a!) decoder, *, int frame_index, float pts_seconds_to_test) -> bool");
7879
m.def("scan_all_streams_to_update_metadata(Tensor(a!) decoder) -> ()");
80+
m.def("set_nvdec_cache_capacity(int capacity) -> ()");
81+
m.def("get_nvdec_cache_capacity() -> int");
82+
m.def("_get_nvdec_cache_size(int device_index) -> int");
7983
}
8084

8185
namespace {
@@ -1085,6 +1089,28 @@ void scan_all_streams_to_update_metadata(torch::stable::Tensor& decoder) {
10851089
videoDecoder->scanFileAndUpdateMetadataAndIndex();
10861090
}
10871091

1092+
void set_nvdec_cache_capacity(int64_t capacity) {
1093+
int capacityInt = validateInt64ToInt(capacity, "capacity");
1094+
STD_TORCH_CHECK(
1095+
capacityInt >= 0,
1096+
"NVDEC cache capacity must be non-negative, got ",
1097+
capacityInt);
1098+
setNVDECCacheCapacity(capacityInt);
1099+
}
1100+
1101+
int64_t get_nvdec_cache_capacity() {
1102+
return static_cast<int64_t>(getNVDECCacheCapacity());
1103+
}
1104+
1105+
int64_t _get_nvdec_cache_size(int64_t device_index) {
1106+
int deviceIndexInt = validateInt64ToInt(device_index, "device_index");
1107+
STD_TORCH_CHECK(
1108+
deviceIndexInt >= 0,
1109+
"device_index must be non-negative, got ",
1110+
deviceIndexInt);
1111+
return static_cast<int64_t>(getNVDECCacheSize(deviceIndexInt));
1112+
}
1113+
10881114
STABLE_TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
10891115
m.impl("create_from_file", TORCH_BOX(&create_from_file));
10901116
m.impl("create_from_tensor", TORCH_BOX(&create_from_tensor));
@@ -1095,6 +1121,9 @@ STABLE_TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
10951121
m.impl("encode_video_to_file", TORCH_BOX(&encode_video_to_file));
10961122
m.impl("encode_video_to_tensor", TORCH_BOX(&encode_video_to_tensor));
10971123
m.impl("_encode_video_to_file_like", TORCH_BOX(&_encode_video_to_file_like));
1124+
m.impl("set_nvdec_cache_capacity", TORCH_BOX(&set_nvdec_cache_capacity));
1125+
m.impl("get_nvdec_cache_capacity", TORCH_BOX(&get_nvdec_cache_capacity));
1126+
m.impl("_get_nvdec_cache_size", TORCH_BOX(&_get_nvdec_cache_size));
10981127
}
10991128

11001129
STABLE_TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {

src/torchcodec/_core/ops.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ def add_video_stream(
136136
torch.ops.torchcodec_ns._get_json_ffmpeg_library_versions.default
137137
)
138138
_get_backend_details = torch.ops.torchcodec_ns._get_backend_details.default
139+
set_nvdec_cache_capacity = torch.ops.torchcodec_ns.set_nvdec_cache_capacity.default
140+
get_nvdec_cache_capacity = torch.ops.torchcodec_ns.get_nvdec_cache_capacity.default
141+
_get_nvdec_cache_size = torch.ops.torchcodec_ns._get_nvdec_cache_size.default
139142

140143

141144
# =============================
@@ -572,3 +575,18 @@ def get_ffmpeg_library_versions():
572575
@register_fake("torchcodec_ns::_get_backend_details")
573576
def _get_backend_details_abstract(decoder: torch.Tensor) -> str:
574577
return ""
578+
579+
580+
@register_fake("torchcodec_ns::set_nvdec_cache_capacity")
581+
def set_nvdec_cache_capacity_abstract(capacity: int) -> None:
582+
return
583+
584+
585+
@register_fake("torchcodec_ns::get_nvdec_cache_capacity")
586+
def get_nvdec_cache_capacity_abstract() -> int:
587+
return 0
588+
589+
590+
@register_fake("torchcodec_ns::_get_nvdec_cache_size")
591+
def _get_nvdec_cache_size_abstract(device_index: int) -> int:
592+
return 0

src/torchcodec/decoders/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66

77
from .._core import AudioStreamMetadata, VideoStreamMetadata
88
from ._audio_decoder import AudioDecoder # noqa
9-
from ._decoder_utils import set_cuda_backend # noqa
9+
from ._decoder_utils import ( # noqa
10+
get_nvdec_cache_capacity,
11+
set_cuda_backend,
12+
set_nvdec_cache_capacity,
13+
)
1014
from ._video_decoder import CpuFallbackStatus, VideoDecoder # noqa
1115

1216
SimpleVideoDecoder = VideoDecoder

0 commit comments

Comments
 (0)