Commit a73e183

Authored by t-sibiraj, DarkLight1337, and aarnphm
[Misc] Replace os environ to monkeypatch in test suite (#14516)
Signed-off-by: sibi <[email protected]>
Signed-off-by: Aaron Pham <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Co-authored-by: Aaron Pham <[email protected]>
Parent: 1e799b7

Note: this is a large commit, so only a subset of the changed files is shown below.

43 files changed: +1997 −1755 lines
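
The recurring change in this commit is mechanical: instead of writing to os.environ directly, which leaks the setting into whatever test runs next, each test takes pytest's built-in monkeypatch fixture and sets environment variables inside a monkeypatch.context() block, so the override is undone when the block exits. A minimal before/after sketch of the pattern, using a hypothetical VLLM_EXAMPLE_FLAG variable rather than any name from the diff:

import os

import pytest


# Before: the assignment outlives the test and can leak into later tests.
def test_flag_old_style():
    os.environ["VLLM_EXAMPLE_FLAG"] = "1"  # hypothetical variable name


# After: the built-in monkeypatch fixture scopes the override; it is undone
# when the context block exits (and at test teardown in any case).
def test_flag_new_style(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_EXAMPLE_FLAG", "1")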

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -522,7 +522,7 @@ steps:
   # TODO: investigate and fix
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"

tests/basic_correctness/test_basic_correctness.py

Lines changed: 63 additions & 52 deletions
@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -63,31 +64,33 @@ def test_models(
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")
 
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)
 
-    # 5042 tokens for gemma2
-    # gemma2 has alternating sliding window size of 4096
-    # we need a prompt with more than 4096 tokens to test the sliding window
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
+        # 5042 tokens for gemma2
+        # gemma2 has alternating sliding window size of 4096
+        # we need a prompt with more than 4096 tokens to test the sliding window
+        prompt = "The following numbers of the sequence " + ", ".join(
+            str(i) for i in range(1024)) + " are:"
+        example_prompts = [prompt]
 
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    with VllmRunner(model,
-                    max_model_len=8192,
-                    dtype=dtype,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
 
 
 @multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
     ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
 
-    if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-
-    dtype = "half"
-    max_tokens = 5
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as monkeypatch_context:
+        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+            # test Ray Compiled Graph
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+        if attention_backend:
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
+
+        dtype = "half"
+        max_tokens = 5
+
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
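
Per the commit title, the files hidden above apply the same substitution across the rest of the test suite. As an illustration only (not part of this commit), a self-contained test like the one below can be used to confirm the scoping behavior the updated tests rely on; the variable name VLLM_DEMO_ATTENTION_BACKEND and the value are made up for this sketch:

import os

import pytest


def test_env_override_is_restored(monkeypatch: pytest.MonkeyPatch):
    assert "VLLM_DEMO_ATTENTION_BACKEND" not in os.environ

    with monkeypatch.context() as m:
        m.setenv("VLLM_DEMO_ATTENTION_BACKEND", "FLASH_ATTN")
        # Inside the block, code under test sees the override.
        assert os.environ["VLLM_DEMO_ATTENTION_BACKEND"] == "FLASH_ATTN"

    # Once the context exits, the environment is back to its previous state.
    assert "VLLM_DEMO_ATTENTION_BACKEND" not in os.environ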
