v1: Introduce an offloading component

orozery · orozery · commit e7b1232f6a7b · 2025-08-10T17:24:08.000+03:00
This commit adds a new offloading component, composed of:
1. A scheduler side OffloadingManager (abstract) which kicks off KV data transfers and keeps track of offloaded data.
2. A worker side OffloadingQueueManager which asynchronously manages KV transfers.

Signed-off-by: Or Ozeri <oro@il.ibm.com>
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -262,6 +262,7 @@ steps:
     - pytest -v -s v1/core
     - pytest -v -s v1/engine
     - pytest -v -s v1/entrypoints
+    - pytest -v -s v1/offloading
     - pytest -v -s v1/sample
     - pytest -v -s v1/worker
     - pytest -v -s v1/structured_output
diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/offloading/test_worker.py
@@ -0,0 +1,140 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import threading
+
+from vllm.v1.offloading.abstract import LoadStoreSpec
+from vllm.v1.offloading.worker.worker import (OffloadingQueueManager,
+                                              TransferSpec)
+
+
+class LoadStoreSpec1(LoadStoreSpec):
+
+    def __init__(self, success: bool = True, exception: bool = False):
+        self.called_event = threading.Event()
+        self.finished_event = threading.Event()
+        self.success = success
+        self.exception = exception
+
+    @staticmethod
+    def medium() -> str:
+        return "1"
+
+
+class LoadStoreSpec2(LoadStoreSpec):
+
+    @staticmethod
+    def medium() -> str:
+        return "2"
+
+
+def transfer_function_1_to_2(transfer_spec: TransferSpec) -> bool:
+    srcs, dsts = transfer_spec
+    assert len(srcs) == 1
+    assert len(dsts) == 1
+
+    src, dst = srcs[0], dsts[0]
+    assert isinstance(src, LoadStoreSpec1)
+    assert isinstance(dst, LoadStoreSpec2)
+
+    src.called_event.set()
+    src.finished_event.wait()
+    if src.exception:
+        raise Exception("An expected exception. Don't worry!")
+    return src.success
+
+
+def transfer_function_2_to_1(transfer_spec: TransferSpec) -> bool:
+    srcs, dsts = transfer_spec
+    assert len(srcs) == 1
+    assert len(dsts) == 1
+
+    src, dst = srcs[0], dsts[0]
+    assert isinstance(src, LoadStoreSpec2)
+    assert isinstance(dst, LoadStoreSpec1)
+
+    dst.called_event.set()
+    dst.finished_event.wait()
+    if dst.exception:
+        raise Exception()
+    return dst.success
+
+
+def test_offloading_queue_manager():
+    """
+    Tests OffloadingQueueManager with 2 workers.
+    One worker performs 1->2 transfers, and the other handles 2->1.
+    """
+    offloading_queue_manager = OffloadingQueueManager()
+    offloading_queue_manager.register_worker(LoadStoreSpec1, LoadStoreSpec2,
+                                             transfer_function_1_to_2)
+    offloading_queue_manager.register_worker(LoadStoreSpec2, LoadStoreSpec1,
+                                             transfer_function_2_to_1)
+
+    # 1st transfer 1->2 (exception)
+    src1 = LoadStoreSpec1(exception=True)
+    dst1 = LoadStoreSpec2()
+    offloading_queue_manager.transfer_async(1, ([src1], [dst1]))
+
+    # 2nd transfer 1->2 (failure)
+    src2 = LoadStoreSpec1(success=False)
+    dst2 = LoadStoreSpec2()
+    offloading_queue_manager.transfer_async(2, ([src2], [dst2]))
+
+    # 3rd transfer 1->2 (success)
+    src3 = LoadStoreSpec1()
+    dst3 = LoadStoreSpec2()
+    offloading_queue_manager.transfer_async(3, ([src3], [dst3]))
+
+    # 4th transfer 2->1
+    src4 = LoadStoreSpec2()
+    dst4 = LoadStoreSpec1()
+    offloading_queue_manager.transfer_async(4, ([src4], [dst4]))
+
+    # 1st transfer started
+    assert src1.called_event.wait(timeout=1)
+
+    # 4th transfer started
+    assert dst4.called_event.wait(timeout=1)
+
+    # 2nd transfer has not started (blocked by 1st)
+    assert not src2.called_event.is_set()
+
+    # no transfer completed yet
+    assert offloading_queue_manager.get_finished() == []
+
+    # complete 1st transfer
+    src1.finished_event.set()
+
+    # 2nd transfer started (wait() returns False on timeout, so assert it)
+    assert src2.called_event.wait(timeout=1)
+
+    # 1st transfer finished with failure (exception)
+    assert offloading_queue_manager.get_finished() == [(1, False)]
+
+    # complete 2nd, 3rd and 4th transfers
+    src2.finished_event.set()
+    src3.finished_event.set()
+    dst4.finished_event.set()
+
+    # 5th transfer 1->2
+    src5 = LoadStoreSpec1()
+    dst5 = LoadStoreSpec2()
+    offloading_queue_manager.transfer_async(5, ([src5], [dst5]))
+
+    # 6th transfer 2->1
+    src6 = LoadStoreSpec2()
+    dst6 = LoadStoreSpec1()
+    offloading_queue_manager.transfer_async(6, ([src6], [dst6]))
+
+    # 5th and 6th transfers started
+    assert src5.called_event.wait(timeout=1)
+    assert dst6.called_event.wait(timeout=1)
+
+    # verify result of 2nd, 3rd and 4th transfers
+    assert (sorted(offloading_queue_manager.get_finished()) == [(2, False),
+                                                                (3, True),
+                                                                (4, True)])
+
+    # complete 5th and 6th transfers
+    src5.finished_event.set()
+    dst6.finished_event.set()
diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/offloading/abstract.py
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+OffloadingManager class for managing KV data offloading in vLLM v1
+
+This class runs in the scheduler, tracks which blocks are offloaded
+and their address.
+
+The class provides the following primitives:
+    lookup() - find the length of the maximal series of blocks,
+        starting from the first one, that are all offloaded.
+    prepare_load() - prepare given blocks to be read.
+        The given blocks will be protected from eviction.
+        This function returns a list of LoadStoreSpec which encapsulate
+        information required for performing the load.
+    touch() - marks the given blocks as recently used. Can be used
+        to track block's LRU. This function is separated from the
+        prepare_load function to allow setting block recency even
+        for blocks which do not need reading from the cache, such as
+        blocks that are cached by the GPU prefix cache.
+    complete_load() - mark blocks which were previously prepared to be
+        loaded as done loading. This is to re-allow their eviction.
+    prepare_store() - prepare the given blocks to be written.
+        Returns a PrepareStoreOutput encapsulating offloading information,
+        including the list of blocks that were evicted as a result.
+    complete_store() - marks a previous store as completed.
+        Following this call, the given blocks will become loadable.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+
+
+class LoadStoreSpec(ABC):
+    """
+    Abstract metadata that encapsulates information allowing a worker
+    to load, and optionally also to store, a block of KV data.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def medium() -> str:
+        """
+        Returns a string representation of the medium type
+        this store/load targets.
+        """
+        pass
+
+
+@dataclass
+class PrepareStoreOutput:
+    block_hashes_to_store: list[int]
+    store_specs: list[LoadStoreSpec]
+    block_hashes_evicted: list[int]
+
+
+class OffloadingManager(ABC):
+
+    @abstractmethod
+    def lookup(self, block_hashes: list[int]) -> int:
+        """
+        Finds the length of the maximal series of blocks, starting from the
+        first one, that are all offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks to lookup.
+
+        Returns:
+            An integer representing the maximal number of blocks that
+            are currently offloaded.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_load(self, block_hashes: list[int]) -> list[LoadStoreSpec]:
+        """
+        Prepare the given blocks to be read.
+        The given blocks will be protected from eviction until
+        complete_load is called.
+        It assumes all given blocks are offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A list of LoadStoreSpec, one per each block, that can be used by
+            a worker to locate and load the actual offloaded KV data.
+        """
+        pass
+
+    @abstractmethod
+    def touch(self, block_hashes: list[int]):
+        """
+        Mark the given blocks as recently used.
+        This could in practice mean moving them to the end of an LRU list.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        pass
+
+    @abstractmethod
+    def complete_load(self, block_hashes: list[int]):
+        """
+        Marks previous blocks that were prepared to load as done loading.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_store(self,
+                      block_hashes: list[int]) -> Optional[PrepareStoreOutput]:
+        """
+        Prepare the given blocks to be offloaded.
+        The given blocks will be protected from eviction until
+        complete_store is called.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A PrepareStoreOutput indicating which blocks need storing,
+            where to store them (LoadStoreSpec), and list of blocks that
+            were evicted as a result.
+            None is returned if the blocks cannot be stored.
+        """
+        pass
+
+    @abstractmethod
+    def complete_store(self, block_hashes: list[int], success: bool = True):
+        """
+        Marks blocks which were previously prepared to be stored, as stored.
+        Following this call, the blocks become loadable.
+        If success is False, blocks that were not marked as stored will be
+        removed.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+            success: whether the blocks were stored successfully.
+        """
+        pass
diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/offloading/mediums.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC
+
+from vllm.v1.offloading.abstract import LoadStoreSpec
+
+
+class BlockIDLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing a KV block from a given block number.
+    """
+
+    def __init__(self, block_id: int):
+        self.block_id = block_id
+
+    def __repr__(self) -> str:
+        return str(self.block_id)
+
+
+class GPULoadStoreSpec(BlockIDLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "GPU"
+
+
+class CPULoadStoreSpec(BlockIDLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "CPU"
diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/offloading/worker/worker.py