Commit 81ede99

[Core] Deprecating block manager v1 and make block manager v2 default (#8704)
Remove block manager v1. This is the first piece of the prefix-caching-centric design: to get there, we need to simplify the code path so that only the v2 block manager (which has much higher prefix-caching performance) is used.
1 parent: 5eda21e · commit: 81ede99
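As a quick illustration of the user-facing effect (a minimal sketch, not part of this commit's diff): callers that previously opted in with use_v2_block_manager can simply drop that argument, as the docs and benchmarks below do. The model, prompt, and sampling settings here are placeholders.

from vllm import LLM, SamplingParams

# Sketch only: the v2 block manager is now the default code path, so the old
# `use_v2_block_manager=True` opt-in (removed from the docs and benchmarks in
# this commit) is simply omitted. Model and prompt are illustrative.
llm = LLM(
    model="facebook/opt-125m",
    enable_prefix_caching=True,  # prefix caching is served by the v2 block manager
    # use_v2_block_manager=True,  # no longer needed; v2 is the default
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0, max_tokens=32))
for out in outputs:
    print(out.outputs[0].text)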

45 files changed: +206, -2109 lines

.buildkite/test-pipeline.yaml

Lines changed: 6 additions & 12 deletions
@@ -77,8 +77,8 @@ steps:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill
   commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

 - label: Core Test # 10min
   mirror_hardwares: [amd]
@@ -88,11 +88,7 @@ steps:
   - vllm/distributed
   - tests/core
   commands:
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
-  - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
+  - pytest -v -s core

 - label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
@@ -192,8 +188,7 @@ steps:
   - vllm/
   - tests/prefix_caching
   commands:
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
-  - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
+  - pytest -v -s prefix_caching

 - label: Samplers Test # 36min
   source_file_dependencies:
@@ -217,8 +212,7 @@ steps:
   - tests/spec_decode
   commands:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
@@ -405,7 +399,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus

benchmarks/benchmark_latency.py

Lines changed: 0 additions & 4 deletions
@@ -38,7 +38,6 @@ def main(args: argparse.Namespace):
         quantization_param_path=args.quantization_param_path,
         device=args.device,
         ray_workers_use_nsight=args.ray_workers_use_nsight,
-        use_v2_block_manager=args.use_v2_block_manager,
         enable_chunked_prefill=args.enable_chunked_prefill,
         download_dir=args.download_dir,
         block_size=args.block_size,
@@ -221,9 +220,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument("--enable-prefix-caching",
                         action='store_true',
                         help="Enable automatic prefix caching")
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        default=EngineArgs.use_v2_block_manager)
     parser.add_argument(
         "--ray-workers-use-nsight",
         action='store_true',

benchmarks/benchmark_prefix_caching.py

Lines changed: 0 additions & 6 deletions
@@ -33,7 +33,6 @@
 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser

 try:
@@ -134,7 +133,6 @@ def main(args):
         tokenizer_mode='auto',
         trust_remote_code=True,
         enforce_eager=True,
-        use_v2_block_manager=args.use_v2_block_manager,
         tensor_parallel_size=args.tensor_parallel_size,
         enable_prefix_caching=args.enable_prefix_caching)

@@ -176,10 +174,6 @@ def main(args):
     parser.add_argument('--enable-prefix-caching',
                         action='store_true',
                         help='enable prefix caching')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        default=EngineArgs.use_v2_block_manager,
-                        help='Use BlockSpaceMangerV2')
     parser.add_argument('--num-prompts',
                         type=int,
                         default=1,

benchmarks/benchmark_throughput.py

Lines changed: 1 addition & 10 deletions
@@ -86,7 +86,6 @@ def run_vllm(
     distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
     disable_async_output_proc: bool = False,
@@ -113,7 +112,6 @@ def run_vllm(
         distributed_executor_backend=distributed_executor_backend,
         load_format=load_format,
         num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
     )

@@ -176,7 +174,6 @@ async def run_vllm_async(
     distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     num_scheduler_steps: int = 1,
-    use_v2_block_manager: bool = False,
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
     disable_async_output_proc: bool = False,
@@ -204,7 +201,6 @@ async def run_vllm_async(
         distributed_executor_backend=distributed_executor_backend,
         load_format=load_format,
         num_scheduler_steps=num_scheduler_steps,
-        use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
         worker_use_ray=False,
         disable_log_requests=True,
@@ -341,8 +337,7 @@ def main(args: argparse.Namespace):
         args.enable_prefix_caching, args.enable_chunked_prefill,
         args.max_num_batched_tokens, args.distributed_executor_backend,
         args.gpu_memory_utilization, args.num_scheduler_steps,
-        args.use_v2_block_manager, args.download_dir, args.load_format,
-        args.disable_async_output_proc
+        args.download_dir, args.load_format, args.disable_async_output_proc
     ]

     if args.async_engine:
@@ -471,10 +466,6 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1,
                         help="Maximum number of forward steps per scheduler call.")
-    parser.add_argument("--use-v2-block-manager",
-                        action='store_true',
-                        default=EngineArgs.use_v2_block_manager,
-                        help="Enable block manager v2.")
     parser.add_argument(
         "--enable-prefix-caching",
         action='store_true',

benchmarks/overheads/benchmark_hashing.py

Lines changed: 0 additions & 4 deletions
@@ -16,7 +16,6 @@ def main(args):
         enforce_eager=True,
         enable_prefix_caching=True,
         tensor_parallel_size=args.tensor_parallel_size,
-        use_v2_block_manager=args.use_v2_block_manager,
     )

     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -56,8 +55,5 @@ def main(args):
     parser.add_argument('--enable-prefix-caching',
                         action='store_true',
                         help='enable prefix caching')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='Use BlockSpaceMangerV2')
     args = parser.parse_args()
     main(args)

docs/source/models/spec_decode.rst

Lines changed: 0 additions & 3 deletions
@@ -30,7 +30,6 @@ The following code configures vLLM in an offline mode to use speculative decodin
     tensor_parallel_size=1,
     speculative_model="facebook/opt-125m",
     num_speculative_tokens=5,
-    use_v2_block_manager=True,
 )
 outputs = llm.generate(prompts, sampling_params)

@@ -104,7 +103,6 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
     speculative_model="[ngram]",
     num_speculative_tokens=5,
     ngram_prompt_lookup_max=4,
-    use_v2_block_manager=True,
 )
 outputs = llm.generate(prompts, sampling_params)

@@ -135,7 +133,6 @@ For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-
     tensor_parallel_size=4,
     speculative_model="ibm-fms/llama3-70b-accelerator",
     speculative_draft_tensor_parallel_size=1,
-    use_v2_block_manager=True,
 )
 outputs = llm.generate(prompts, sampling_params)

examples/offline_inference_mlpspeculator.py

Lines changed: 0 additions & 2 deletions
@@ -50,8 +50,6 @@ def time_generation(llm: LLM, prompts: List[str],
     llm = LLM(
         model="meta-llama/Llama-2-13b-chat-hf",
         speculative_model="ibm-fms/llama-13b-accelerator",
-        # These are currently required for MLPSpeculator decoding
-        use_v2_block_manager=True,
     )

     print("With speculation")

tests/basic_correctness/test_chunked_prefill.py

Lines changed: 1 addition & 10 deletions
@@ -12,20 +12,14 @@
 import pytest

 from ..models.utils import check_logprobs_close, check_outputs_equal
-from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
+from ..utils import multi_gpu_test

 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]


-@pytest.fixture(scope="module", autouse=True)
-def check_deprecated_block_manager():
-    check_deprecated_block_manager_usage(
-        'tests/basic_correctness/test_chunked_prefill.py')
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
@@ -197,7 +191,6 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("use_v2_block_manager", [False, True])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
@@ -206,7 +199,6 @@ def test_with_prefix_caching(
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,
-    use_v2_block_manager: bool,
     tensor_parallel_size: int,
 ) -> None:
     """
@@ -234,7 +226,6 @@ def test_with_prefix_caching(
         enable_chunked_prefill=True,
         enable_prefix_caching=enable,
         tensor_parallel_size=tensor_parallel_size,
-        use_v2_block_manager=use_v2_block_manager,
         enforce_eager=enforce_eager,
         max_num_seqs=max_num_seqs,
     ) as vllm_model:
