v1: Offloading connector #22595

Open: wants to merge 4 commits into base: main

1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -263,6 +263,7 @@ steps:
- pytest -v -s v1/core
- pytest -v -s v1/engine
- pytest -v -s v1/entrypoints
- pytest -v -s v1/offloading
- pytest -v -s v1/sample
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
2 changes: 2 additions & 0 deletions examples/online_serving/kv_events_subscriber.py
@@ -27,10 +27,12 @@ class BlockStored(KVCacheEvent):
token_ids: list[int]
block_size: int
lora_id: Optional[int]
medium: Optional[str]


class BlockRemoved(KVCacheEvent):
block_hashes: list[int]
medium: Optional[str]


class AllBlocksCleared(KVCacheEvent):
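The new optional medium field on BlockStored and BlockRemoved lets an event subscriber tell which storage tier a KV block was written to or evicted from, which matters once blocks can be offloaded out of the GPU cache. Below is a minimal sketch of how a subscriber might branch on it; the handler and the treatment of None as the default device cache are assumptions for illustration, not part of the PR:

def handle_kv_event(event) -> None:
    # `medium` is Optional[str]; None is assumed here to mean the default cache.
    if isinstance(event, BlockStored):
        where = event.medium or "default"
        print(f"stored {len(event.token_ids)} tokens "
              f"(block_size={event.block_size}) on medium={where}")
    elif isinstance(event, BlockRemoved):
        where = event.medium or "default"
        print(f"removed {len(event.block_hashes)} blocks from medium={where}")
    elif isinstance(event, AllBlocksCleared):
        print("all blocks cleared")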
22 changes: 15 additions & 7 deletions tests/v1/core/test_async_scheduler.py
@@ -7,6 +7,7 @@
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import RequestStatus
from vllm.v1.utils import ConstantList

from .utils import create_requests, create_scheduler

@@ -140,7 +141,8 @@ def test_prefix_caching_for_prefill_dedup():
requests = create_requests(num_requests=5,
num_tokens=num_prompt_tokens,
max_tokens=3,
same_prompt=True)
same_prompt=True,
block_size=BLOCK_SIZE)
requests_copy = requests.copy()

# Two requests with the same prompt.
@@ -188,7 +190,8 @@ def test_prefix_caching_for_multi_turn():
block_size=BLOCK_SIZE)
requests = create_requests(num_requests=5,
num_tokens=num_prompt_tokens,
max_tokens=num_output_tokens)
max_tokens=num_output_tokens,
block_size=BLOCK_SIZE)

for req in requests:
scheduler.add_request(req)
@@ -208,14 +211,19 @@

# Create next-turn requests whose prompts are the full output of the
# previous turn.
next_turn_requests = create_requests(
num_requests=5,
num_tokens=num_prompt_tokens + num_output_tokens,
max_tokens=num_output_tokens,
)
next_turn_requests = create_requests(num_requests=5,
num_tokens=num_prompt_tokens +
num_output_tokens,
max_tokens=num_output_tokens,
block_size=BLOCK_SIZE)
for i, req in enumerate(next_turn_requests):
req.prompt_token_ids = (requests[i].prompt_token_ids +
list(requests[i].output_token_ids))
req._all_token_ids = req.prompt_token_ids.copy()
req.all_token_ids = ConstantList(req._all_token_ids)
req.block_hashes = []
req.block_hashes = req.get_hash_new_full_blocks()

# Schedule the next-turn requests.
for req in next_turn_requests:
scheduler.add_request(req)
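Because the multi-turn test now mutates prompt_token_ids after the requests were created, it also has to resynchronize the request's combined token list and its incrementally computed block hashes, which the hunk above does inline. The same test-only pattern, written as a helper for readability; the names mirror the test and this is not a public API:

def resync_request_tokens(req, new_prompt_token_ids):
    # Test-only: rebuild the derived fields a Request normally keeps in sync itself.
    req.prompt_token_ids = new_prompt_token_ids
    req._all_token_ids = new_prompt_token_ids.copy()
    req.all_token_ids = ConstantList(req._all_token_ids)
    req.block_hashes = []                               # drop hashes for the old prompt
    req.block_hashes = req.get_hash_new_full_blocks()   # re-hash all full blocks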
48 changes: 26 additions & 22 deletions tests/v1/core/test_kv_cache_utils.py
@@ -16,7 +16,7 @@
FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
estimate_max_model_len, generate_block_hash_extra_keys,
get_kv_cache_config, get_max_concurrency_for_kv_cache_config,
hash_block_tokens, hash_request_tokens, init_none_hash,
get_request_block_hasher, hash_block_tokens, init_none_hash,
is_kv_cache_type_uniform, unify_kv_cache_configs)
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor,
@@ -29,6 +29,8 @@

def make_request(request_id,
prompt_token_ids,
block_size=3,
hash_fn=hash,
mm_positions=None,
mm_hashes=None,
cache_salt=None):
@@ -37,18 +39,17 @@ def make_request(request_id,
else:
multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)

return Request(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
sampling_params=SamplingParams(max_tokens=17),
pooling_params=None,
eos_token_id=100,
lora_request=None,
cache_salt=cache_salt,
)
return Request(request_id=request_id,
prompt_token_ids=prompt_token_ids,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
sampling_params=SamplingParams(max_tokens=17),
pooling_params=None,
eos_token_id=100,
lora_request=None,
cache_salt=cache_salt,
block_hasher=get_request_block_hasher(block_size, hash_fn))


def new_kv_cache_spec(block_size=16,
@@ -416,22 +417,22 @@ def test_hash_block_tokens(hash_fn):


@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
def test_hash_request_tokens(hash_fn):
def test_request_block_hasher(hash_fn):
import vllm.v1.core.kv_cache_utils
init_none_hash(hash_fn)
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
block_size=3,
hash_fn=hash_fn,
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash1", "hash2"],
)

block_size = 3
block_hashes = hash_request_tokens(hash_fn, block_size, request)

block_hashes = request.block_hashes
assert len(block_hashes) == 2
assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash)
assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash)
@@ -452,6 +453,8 @@ def test_hash_tokens_different_mm_input(hash_fn):
request1 = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
block_size=3,
hash_fn=hash_fn,
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
@@ -467,9 +470,8 @@
],
mm_hashes=["hash3", "hash2"],
)
block_size = 3
block_hashes1 = hash_request_tokens(hash_fn, block_size, request1)
block_hashes2 = hash_request_tokens(hash_fn, block_size, request2)
block_hashes1 = request1.block_hashes
block_hashes2 = request2.block_hashes
assert block_hashes1[0] != block_hashes2[0]
assert block_hashes1[1] != block_hashes2[1]

@@ -481,12 +483,13 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn):
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
block_size=3,
hash_fn=hash_fn,
mm_positions=None,
mm_hashes=None,
)

block_size = 3
block_hashes = hash_request_tokens(hash_fn, block_size, request)
block_hashes = request.block_hashes

assert len(block_hashes) == 2
assert block_hashes[0].token_ids == (0, 1, 2)
@@ -846,6 +849,7 @@ def test_allocate_with_lookahead():
request = make_request(
request_id=0,
prompt_token_ids=[],
block_size=block_size,
mm_positions=None,
mm_hashes=None,
)
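Across these tests the pattern is the same: block hashes are no longer produced by a standalone hash_request_tokens call but by a per-request hasher installed at construction time via block_hasher=get_request_block_hasher(block_size, hash_fn), and then read back from request.block_hashes. Below is a conceptual sketch of what such a hasher could do, assuming chained per-block hashing; it is an illustration, not vLLM's implementation:

def make_block_hasher(block_size, hash_fn):
    # Return a callable that hashes any newly completed blocks of a request,
    # chaining in the previous block's hash so identical prefixes hash identically.
    def hash_new_full_blocks(request):
        existing = list(request.block_hashes)
        parent = existing[-1] if existing else None
        tokens = request.all_token_ids
        start = len(existing) * block_size
        new_hashes = []
        while start + block_size <= len(tokens):
            block = tuple(tokens[start:start + block_size])
            parent = hash_fn((parent, block))
            new_hashes.append(parent)
            start += block_size
        return new_hashes
    return hash_new_full_blocks

With a hasher like this installed at construction time, the updated assertions follow directly: a 6-token prompt with block_size=3 yields two entries in request.block_hashes, with no explicit hashing call left in the test.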