Skip to content

Commit 53cb762

Browse files
authored
[None][feat] New KVCacheManagerV2 APIs for Transceiver (#11003)
Signed-off-by: Yao Yao <lowsfer@users.noreply.github.com>
1 parent 5ff244c commit 53cb762

File tree

18 files changed

+677
-199
lines changed

18 files changed

+677
-199
lines changed

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,3 @@ mistral-common==1.8.6
8080
torchao>=0.14.1
8181
cuda-core
8282
llist
83-
dynamic_path_manager

scripts/build_wheel.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -439,13 +439,14 @@ def build_kv_cache_manager_v2(project_dir, venv_python, use_mypyc=False):
439439
so_file.unlink()
440440

441441
# Build rawref
442-
print("-- Building kv_cache_manager_v2 rawref extension...")
442+
print("-- Building kv_cache_manager_v2 rawref extension...", end=" ")
443443
rawref_dir = kv_cache_mgr_dir / "rawref"
444444
build_run(f'"{venv_python}" setup.py build_ext --inplace', cwd=rawref_dir)
445+
print("Done")
445446

446447
if use_mypyc:
447448
# Build mypyc
448-
print("-- Building kv_cache_manager_v2 mypyc extensions...")
449+
print("-- Building kv_cache_manager_v2 mypyc extensions...", end=" ")
449450
# setup_mypyc.py is in kv_cache_manager_v2 but executed from runtime dir
450451
setup_mypyc = kv_cache_mgr_dir / "setup_mypyc.py"
451452
build_run(f'"{venv_python}" "{setup_mypyc}" build_ext --inplace',
@@ -456,6 +457,8 @@ def build_kv_cache_manager_v2(project_dir, venv_python, use_mypyc=False):
456457
raise RuntimeError(
457458
"Failed to build kv_cache_manager_v2: no shared library generated."
458459
)
460+
print("Done")
461+
print("-- Done building kv_cache_manager_v2.")
459462

460463

461464
def main(*,

tensorrt_llm/runtime/__init__.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,31 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
import os
16+
import sys
17+
from contextlib import contextmanager
18+
from typing import Iterator
19+
20+
21+
# Duplicated from kv_cache_manager_v2._utils. We need this both inside and outside of
22+
# kv_cache_manager_v2 due to restriction of mypyc build process.
23+
@contextmanager
24+
def temporary_sys_path(path: str) -> Iterator[None]:
25+
already_in_path = path in sys.path
26+
if not already_in_path:
27+
sys.path.insert(0, path)
28+
try:
29+
yield
30+
finally:
31+
if not already_in_path:
32+
sys.path.remove(path)
1633

17-
from dynamic_path_manager import DynamicPathManager
1834

1935
# Add current directory to sys.path so kv_cache_manager_v2 can be imported as top-level package.
2036
# This is required because when kv_cache_manager_v2 is compiled with mypyc, it is compiled as
2137
# a top-level package (to avoid complex build paths), but at runtime it is used as a submodule.
2238
# The compiled extension might try to import its submodules using absolute imports based on its
2339
# compiled name.
24-
with DynamicPathManager(os.path.dirname(os.path.abspath(__file__)),
25-
clear_cache=False):
40+
with temporary_sys_path(os.path.dirname(os.path.abspath(__file__))):
2641
import kv_cache_manager_v2
2742

2843
from .enc_dec_model_runner import EncDecModelRunner

tensorrt_llm/runtime/kv_cache_manager_v2/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,16 @@
3636
HostCacheTierConfig,
3737
KVCacheManagerConfig,
3838
)
39-
from ._core import BeamIndex, KVCacheManager, _KVCache
39+
from ._core import (
40+
DEFAULT_BEAM_INDEX,
41+
AggregatedPageDesc,
42+
BeamIndex,
43+
BufferSlice,
44+
KVCacheManager,
45+
_KVCache,
46+
)
4047
from ._life_cycle_registry import LayerGroupId, LifeCycleId
48+
from ._storage import BufferId
4149

4250
__all__ = [
4351
"LifeCycleId",
@@ -47,6 +55,7 @@
4755
"KVCacheManager",
4856
"_KVCache",
4957
"BeamIndex",
58+
"DEFAULT_BEAM_INDEX",
5059
"LayerId",
5160
"Priority",
5261
"CacheLevel",
@@ -64,4 +73,7 @@
6473
"CacheTierConfig",
6574
"gen_multi_modal_tokens",
6675
"rawref",
76+
"BufferSlice",
77+
"AggregatedPageDesc",
78+
"BufferId",
6779
]

tensorrt_llm/runtime/kv_cache_manager_v2/__init__.pyi

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ from typing import (
2323
Final,
2424
Iterable,
2525
Iterator,
26+
NamedTuple,
2627
NewType,
2728
Protocol,
2829
Sequence,
@@ -33,6 +34,7 @@ from typing import (
3334

3435
# From _common.py
3536
NDEBUG: Final[int]
37+
DEFAULT_BEAM_INDEX: Final[BeamIndex]
3638

3739
class CacheTier(enum.IntEnum):
3840
GPU_MEM = 0
@@ -49,6 +51,7 @@ CudaStream = NewType("CudaStream", int)
4951
BeamIndex = NewType("BeamIndex", int)
5052
MemAddress = NewType("MemAddress", int)
5153
Priority = NewType("Priority", int)
54+
PoolGroupIndex = NewType("PoolGroupIndex", int)
5255

5356
# From _config.py
5457
DataRole = NewType("DataRole", str)
@@ -154,9 +157,12 @@ class _KVCache:
154157
@beam_width.setter
155158
def beam_width(self, beam_width: BeamIndex) -> None: ...
156159
def get_page_indices(self, layer_group_id: int, beam_id: BeamIndex = ...) -> IndexSeq: ...
157-
def get_all_page_indices(
158-
self, beam_id: BeamIndex, buf_ids: Iterable[tuple[LayerId, DataRole]]
159-
) -> Iterator[IndexSeq]: ...
160+
def get_aggregated_page_indices(
161+
self,
162+
layer_group_id: LayerGroupId,
163+
beam_id: BeamIndex = DEFAULT_BEAM_INDEX,
164+
valid_only: bool = False,
165+
) -> Iterator[int]: ...
160166
def resize(self, capacity: int | None, history_length: int | None = None) -> bool: ...
161167
@property
162168
def capacity(self) -> int: ...
@@ -183,6 +189,39 @@ class _KVCache:
183189
@property
184190
def tokens_per_block(self) -> int: ...
185191

192+
@dataclass(slots=True, frozen=True)
193+
class MemoryPoolDesc:
194+
base: MemAddress
195+
page_size: int
196+
197+
@dataclass(slots=True, frozen=True)
198+
class MemoryPoolGroupDesc:
199+
num_pages: int
200+
pools: Sequence[MemoryPoolDesc]
201+
202+
class BufferId(NamedTuple):
203+
layer_id: LayerId
204+
role: DataRole
205+
206+
@dataclass(slots=True, frozen=True)
207+
class BufferSlice:
208+
buffer_id: BufferId
209+
num_slices: int = 1
210+
slice_index: int = 1
211+
212+
@dataclass(slots=True, frozen=True)
213+
class AggregatedPageDesc:
214+
"""The data you need is located in the following byte ranges:
215+
216+
(base + stride * i + Range(0, size) for i in aggregated_page_indices)
217+
"""
218+
219+
base: MemAddress
220+
size: int
221+
stride: int
222+
layer_group_id: LayerGroupId
223+
buffers: Sequence[BufferSlice]
224+
186225
# From _core/_kv_cache_manager.py
187226
class KVCacheManager:
188227
def __init__(self, config: KVCacheManagerConfig) -> None: ...
@@ -200,14 +239,23 @@ class KVCacheManager:
200239
def resize(self, cache_level: CacheLevel, quota: int, best_efforts: bool = False) -> bool: ...
201240
def get_quota(self, cache_level: CacheLevel) -> int: ...
202241
@property
203-
def cache_tier_list(self) -> tuple[CacheTier, ...]: ...
242+
def cache_tier_list(self) -> Sequence[CacheTier]: ...
204243
@property
205244
def tokens_per_block(self) -> int: ...
206245
@property
207246
def allow_seq_rebasing(self) -> bool: ...
208247
@property
209248
def enable_partial_match(self) -> bool: ...
210-
def get_layer_group_id(self, layer_id: LayerId) -> int: ...
211249
@property
212-
def layer_grouping(self) -> tuple[tuple[LayerId, ...], ...]: ...
250+
def num_layers(self) -> int: ...
251+
@property
252+
def layer_ids(self) -> Iterator[LayerId]: ...
253+
def get_layer_group_id(self, layer_id: LayerId) -> LayerGroupId: ...
254+
@property
255+
def layer_grouping(self) -> Sequence[Sequence[LayerId]]: ...
256+
@property
257+
def all_buffer_ids(self) -> Iterator[BufferId]: ...
258+
def get_aggregated_pages(
259+
self, buffers: Iterable[BufferSlice]
260+
) -> Iterator[AggregatedPageDesc]: ...
213261
def clamp_max_seq_len_for_mem(self, batch_size: int, model_max_seq_len: int) -> int: ...

tensorrt_llm/runtime/kv_cache_manager_v2/_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class CacheTier(enum.IntEnum):
5656
CudaStream = NewType("CudaStream", int)
5757

5858
BeamIndex = NewType("BeamIndex", int)
59+
DEFAULT_BEAM_INDEX: Final[BeamIndex] = BeamIndex(0)
5960

6061
UserId = NewType("UserId", int)
6162

tensorrt_llm/runtime/kv_cache_manager_v2/_copy_engine.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,17 @@
2626
from typing import ClassVar, NamedTuple, Sequence, cast
2727

2828
import cuda.bindings.driver as drv
29-
from dynamic_path_manager import DynamicPathManager
3029

3130
from ._common import Address, CacheTier, CudaStream, MemAddress
32-
from ._utils import CachedCudaEvent, HomoTuple, HostMem, _unwrap, div_up, stream_wait_events
31+
from ._utils import (
32+
CachedCudaEvent,
33+
HomoTuple,
34+
HostMem,
35+
_unwrap,
36+
div_up,
37+
stream_wait_events,
38+
temporary_sys_path,
39+
)
3340

3441
if "tensorrt_llm" in sys.modules:
3542
from tensorrt_llm.bindings.internal.batch_manager.kv_cache_manager_v2_utils import ( # noqa # type: ignore
@@ -50,7 +57,7 @@
5057
# fast path for dev, avoids importing the whole tensorrt_llm module
5158
spec = find_spec("kv_cache_manager_v2")
5259
assert spec is not None and spec.origin is not None
53-
with DynamicPathManager(str(Path(spec.origin).parent.parent.parent), clear_cache=False):
60+
with temporary_sys_path(str(Path(spec.origin).parent.parent.parent)):
5461
from bindings.internal.batch_manager.kv_cache_manager_v2_utils import ( # noqa
5562
DiskAddress,
5663
DiskToDiskTask,

tensorrt_llm/runtime/kv_cache_manager_v2/_core/__init__.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,15 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
from .._common import BeamIndex
16+
from .._common import DEFAULT_BEAM_INDEX, BeamIndex
1717
from ._kv_cache import _KVCache
18-
from ._kv_cache_manager import KVCacheManager
18+
from ._kv_cache_manager import AggregatedPageDesc, BufferSlice, KVCacheManager
1919

20-
__all__ = ["KVCacheManager", "_KVCache", "BeamIndex"]
20+
__all__ = [
21+
"KVCacheManager",
22+
"_KVCache",
23+
"BeamIndex",
24+
"DEFAULT_BEAM_INDEX",
25+
"BufferSlice",
26+
"AggregatedPageDesc",
27+
]

tensorrt_llm/runtime/kv_cache_manager_v2/_core/_kv_cache.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from .._block_radix_tree import Block, RootBlock, UselessBlockError
2626
from .._common import (
2727
BAD_PAGE_INDEX,
28+
DEFAULT_BEAM_INDEX,
2829
GPU_LEVEL,
2930
NDEBUG,
3031
BeamIndex,
@@ -49,7 +50,6 @@
4950
_SharedPageLock,
5051
batched_lock_to_gpu,
5152
)
52-
from .._storage._config import BufferId
5353
from .._storage_manager import StorageManager
5454
from .._utils import (
5555
CachedCudaEvent,
@@ -312,7 +312,7 @@ def beam_width(self, beam_width: BeamIndex) -> None:
312312
# Due to constraints of the current kernels, K/V data blocks and the correspondding quant scale blocks
313313
# share the same indices, so the output for DataRole.KEY_DATA and DataRole.KEY_BLOCK_SCALE are the same.
314314
def get_page_indices(
315-
self, layer_group_id: LayerGroupId, beam_id: BeamIndex = BeamIndex(0)
315+
self, layer_group_id: LayerGroupId, beam_id: BeamIndex = DEFAULT_BEAM_INDEX
316316
) -> IndexSeq:
317317
indices = self._page_indices[beam_id][layer_group_id]
318318
assert NDEBUG or all(
@@ -321,13 +321,31 @@ def get_page_indices(
321321
)
322322
return indices
323323

324-
def get_all_page_indices(
325-
self, beam_id: BeamIndex, buf_ids: Iterable[BufferId]
326-
) -> Iterator[IndexSeq]:
327-
layer_to_lc_ids = self.manager._storage._layer_to_life_cycle_ids
328-
for layer_id, _ in buf_ids:
329-
lc = layer_to_lc_ids[layer_id]
330-
yield self._page_indices[beam_id][lc]
324+
def get_aggregated_page_indices(
325+
self,
326+
layer_group_id: LayerGroupId,
327+
beam_id: BeamIndex = DEFAULT_BEAM_INDEX,
328+
valid_only: bool = False,
329+
) -> Iterator[int]:
330+
"""
331+
Get the internal slot indices for the given layer group and beam.
332+
Each slot is a group of coalesced buffers in one memory pool group.
333+
This API exposes internal slot indices, mainly for efficient data transfer.
334+
For computation, use get_page_indices() instead.
335+
336+
Args:
337+
layer_group_id: Layer group to inspect.
338+
beam_id: Beam index to read. Defaults to DEFAULT_BEAM_INDEX.
valid_only: If True, skip invalid blocks instead of yielding BAD_PAGE_INDEX for them.
339+
340+
Yields:
341+
Aggregated page index for each block, or BAD_PAGE_INDEX for invalid blocks (invalid blocks are skipped entirely when valid_only is True).
342+
"""
343+
for b in self._blocks:
344+
if (holder := b.pages[beam_id][layer_group_id]) is None:
345+
if not valid_only:
346+
yield BAD_PAGE_INDEX
347+
else:
348+
yield holder.page.slot_id
331349

332350
# reserve space for next inference. Request new blocks from KVCacheManager if necessary.
333351
# if capacity is increased and beam_width > 1, blocks containing new tokens should be allocated for each beam.
@@ -608,7 +626,7 @@ def _commit_block(self, ordinal: BlockOrdinal, is_last: bool) -> None:
608626
)
609627
seq_block = self._blocks[ordinal]
610628
assert typed_len(seq_block.pages) == 1, "Must have 1 beam only"
611-
beam_idx = BeamIndex(0)
629+
beam_idx = DEFAULT_BEAM_INDEX
612630
beam_block = seq_block.pages[beam_idx]
613631
tokens_per_block = self.tokens_per_block
614632
start = ordinal * tokens_per_block
@@ -756,7 +774,7 @@ def _get_tree_block(self, ordinal: BlockOrdinal) -> Block:
756774
assert self._blocks[ordinal].is_committed
757775
ret = unwrap_optional(self._blocks[ordinal].tree_block)
758776
if not NDEBUG:
759-
for b in self._block(ordinal, BeamIndex(0)):
777+
for b in self._block(ordinal, DEFAULT_BEAM_INDEX):
760778
assert b is None or (isinstance(b.page, CommittedPage) and b.page.block() is ret)
761779
return ret
762780

@@ -925,7 +943,7 @@ def check_no_page_stale(b: tuple[Block, int]):
925943
],
926944
)
927945

928-
beam_idx = BeamIndex(0)
946+
beam_idx = DEFAULT_BEAM_INDEX
929947
for lc_idx, lc in life_cycles.items():
930948
stale_start, stale_end = _KVCache._get_stale_range(
931949
tokens_per_block, get_num_matched_tokens(matched), lc
@@ -1011,7 +1029,7 @@ def _update_page_index(
10111029
return old
10121030

10131031
def _get_page_indices_ref(
1014-
self, lc: LifeCycleId, beam_id: BeamIndex = BeamIndex(0)
1032+
self, lc: LifeCycleId, beam_id: BeamIndex = DEFAULT_BEAM_INDEX
10151033
) -> Iterator[int | None]:
10161034
assert beam_id < self.beam_width
10171035
assert self.is_active

0 commit comments

Comments
 (0)