Skip to content

Commit 20e7310

Browse files
yenuo26, wangyu31577, and hsliuustc0106
authored
[Test]Delete skip mark for amd ci test and fix CI failure (vllm-project#927)
Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
Signed-off-by: Hongsheng Liu <liuhongsheng4@huawei.com>
Co-authored-by: wangyu31577 <wangyu31577@hundsun.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
1 parent 741f7e2 commit 20e7310

File tree

6 files changed

+78
-62
lines changed

6 files changed

+78
-62
lines changed

tests/conftest.py

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
if "VLLM_TARGET_DEVICE" not in os.environ:
1111
os.environ["VLLM_TARGET_DEVICE"] = "cpu"
1212

13+
import gc
1314
import socket
1415
import subprocess
1516
import sys
@@ -59,13 +60,12 @@ def clean_gpu_memory_between_tests():
5960
_run_post_test_cleanup()
6061

6162

62-
def _run_pre_test_cleanup():
63-
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
63+
def _run_pre_test_cleanup(enable_force=False):
64+
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force:
6465
print("GPU cleanup disabled")
6566
return
6667

6768
print("Pre-test GPU status:")
68-
_print_simple_gpu_status()
6969

7070
num_gpus = torch.cuda.device_count()
7171
if num_gpus > 0:
@@ -74,44 +74,25 @@ def _run_pre_test_cleanup():
7474

7575
wait_for_gpu_memory_to_clear(
7676
devices=list(range(num_gpus)),
77-
threshold_ratio=0.1,
77+
threshold_ratio=0.05,
7878
)
7979
except Exception as e:
8080
print(f"Pre-test cleanup note: {e}")
8181

8282

83-
def _run_post_test_cleanup():
84-
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
83+
def _run_post_test_cleanup(enable_force=False):
84+
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force:
85+
print("GPU cleanup disabled")
8586
return
8687

87-
import gc
88-
8988
if torch.cuda.is_available():
9089
gc.collect()
9190
torch.cuda.empty_cache()
9291

9392
print("Post-test GPU status:")
94-
_print_simple_gpu_status()
9593
_print_gpu_processes()
9694

9795

98-
def _print_simple_gpu_status():
99-
"""Print simple GPU memory status"""
100-
if not torch.cuda.is_available():
101-
print(" CUDA not available")
102-
return
103-
104-
num_devices = torch.cuda.device_count()
105-
for device_id in range(num_devices):
106-
try:
107-
torch.cuda.set_device(device_id)
108-
allocated = torch.cuda.memory_allocated(device_id) / (1024**2)
109-
reserved = torch.cuda.memory_reserved(device_id) / (1024**2)
110-
print(f" GPU {device_id}: Allocated: {allocated:.1f}MB, Reserved: {reserved:.1f}MB")
111-
except Exception:
112-
print(f" GPU {device_id}: Error reading status")
113-
114-
11596
def _print_gpu_processes():
11697
"""Print GPU information including nvidia-smi and system processes"""
11798

@@ -871,6 +852,9 @@ def __init__(
871852
*,
872853
env_dict: dict[str, str] | None = None,
873854
) -> None:
855+
_run_pre_test_cleanup(enable_force=True)
856+
_run_post_test_cleanup(enable_force=True)
857+
cleanup_dist_env_and_memory()
874858
self.model = model
875859
self.serve_args = serve_args
876860
self.env_dict = env_dict
@@ -986,5 +970,6 @@ def __enter__(self):
986970
def __exit__(self, exc_type, exc_val, exc_tb):
987971
if self.proc:
988972
self._kill_process_tree(self.proc.pid)
989-
_run_post_test_cleanup()
973+
_run_pre_test_cleanup(enable_force=True)
974+
_run_post_test_cleanup(enable_force=True)
990975
cleanup_dist_env_and_memory()

tests/e2e/offline_inference/conftest.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from vllm import TextPrompt
1111
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
1212

13-
from tests.conftest import _run_post_test_cleanup
13+
from tests.conftest import _run_post_test_cleanup, _run_pre_test_cleanup
1414
from vllm_omni.entrypoints.omni import Omni
1515
from vllm_omni.inputs.data import OmniSamplingParams
1616
from vllm_omni.outputs import OmniRequestOutput
@@ -51,6 +51,9 @@ def __init__(
5151
stage_configs_path: Optional path to YAML stage config file
5252
**kwargs: Additional arguments passed to Omni
5353
"""
54+
cleanup_dist_env_and_memory()
55+
_run_pre_test_cleanup(enable_force=True)
56+
_run_post_test_cleanup(enable_force=True)
5457
self.model_name = model_name
5558
self.seed = seed
5659

@@ -337,7 +340,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
337340
self.close()
338341
del self.omni
339342
cleanup_dist_env_and_memory()
340-
_run_post_test_cleanup()
343+
_run_post_test_cleanup(enable_force=True)
341344

342345
def close(self):
343346
"""Close and cleanup the Omni instance."""

tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
# Stage 0: Thinker (multimodal understanding + text generation)
33
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
44
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
5-
65
# The following config has been verified on 2x H100-80G GPUs.
76
stage_args:
87
- stage_id: 0
@@ -22,7 +21,6 @@ stage_args:
2221
enable_prefix_caching: false
2322
hf_config_name: thinker_config
2423
tensor_parallel_size: 2
25-
load_format: dummy
2624
final_output: true
2725
final_output_type: text
2826
is_comprehension: true
@@ -52,15 +50,14 @@ stage_args:
5250
enable_prefix_caching: false
5351
distributed_executor_backend: "mp"
5452
hf_config_name: talker_config
55-
load_format: dummy
5653
engine_input_source: [0]
5754
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
5855
# final_output: true
5956
# final_output_type: text
6057
default_sampling_params:
6158
temperature: 0.9
6259
top_k: 50
63-
max_tokens: 100
60+
max_tokens: 1000
6461
seed: 42
6562
detokenize: False
6663
repetition_penalty: 1.05
@@ -83,7 +80,6 @@ stage_args:
8380
distributed_executor_backend: "mp"
8481
max_num_batched_tokens: 1000000
8582
hf_config_name: thinker_config
86-
load_format: dummy
8783
async_scheduling: false
8884
engine_input_source: [1]
8985
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
@@ -93,7 +89,7 @@ stage_args:
9389
temperature: 0.0
9490
top_p: 1.0
9591
top_k: -1
96-
max_tokens: 200
92+
max_tokens: 2000
9793
seed: 42
9894
detokenize: True
9995
repetition_penalty: 1.1

tests/e2e/online_serving/test_qwen3_omni.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def dummy_messages_from_video_data(
133133

134134
def get_prompt(prompt_type="text_only"):
135135
prompts = {
136-
"text_only": "What is the capital of China?",
136+
"text_only": "What is the capital of China? Answer in 20 words.",
137137
"mix": "What is recited in the audio? What is in this image? Describe the video briefly.",
138138
}
139139
return prompts.get(prompt_type, prompts["text_only"])
@@ -144,9 +144,6 @@ def get_max_batch_size(size_type="few"):
144144
return batch_sizes.get(size_type, 5)
145145

146146

147-
@pytest.mark.skipif(
148-
current_omni_platform.is_rocm(), reason="Test skipped on AMD environment due to known output issues"
149-
)
150147
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
151148
def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> None:
152149
"""
@@ -226,9 +223,6 @@ def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> N
226223
assert similarity > 0.9, "The audio content is not same as the text"
227224

228225

229-
@pytest.mark.skipif(
230-
current_omni_platform.is_rocm(), reason="Test skipped on AMD environment due to known output issues"
231-
)
232226
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
233227
def test_text_to_text_audio_001(client: openai.OpenAI, omni_server) -> None:
234228
"""

tests/e2e/stage_configs/qwen3_omni_ci.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ stage_args:
2020
engine_output_type: latent # Output hidden states for talker
2121
distributed_executor_backend: "mp"
2222
max_num_batched_tokens: 32768
23+
max_model_len: 32768
2324
enable_prefix_caching: false
2425
hf_config_name: thinker_config
2526
tensor_parallel_size: 2
@@ -51,6 +52,7 @@ stage_args:
5152
engine_output_type: latent # Output codec codes for code2wav
5253
enable_prefix_caching: false
5354
max_num_batched_tokens: 32768
55+
max_model_len: 32768
5456
distributed_executor_backend: "mp"
5557
hf_config_name: talker_config
5658
engine_input_source: [0]

tests/utils.py

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,36 @@ def wait_for_gpu_memory_to_clear(
7777
threshold_ratio: float | None = None,
7878
timeout_s: float = 120,
7979
) -> None:
80+
import gc
81+
8082
assert threshold_bytes is not None or threshold_ratio is not None
8183
# Use nvml instead of pytorch to reduce measurement error from torch cuda
8284
# context.
8385
devices = get_physical_device_indices(devices)
8486
start_time = time.time()
87+
88+
# Print waiting start information
89+
device_list = ", ".join(str(d) for d in devices)
90+
if threshold_bytes is not None:
91+
threshold_str = f"{threshold_bytes / 2**30:.2f} GiB"
92+
condition_str = f"Memory usage ≤ {threshold_str}"
93+
else:
94+
threshold_percent = threshold_ratio * 100
95+
threshold_str = f"{threshold_percent:.1f}%"
96+
condition_str = f"Memory usage ratio ≤ {threshold_str}"
97+
98+
print(f"[GPU Memory Monitor] Waiting for GPU {device_list} to free memory, Condition: {condition_str}")
99+
100+
# Define the is_free function based on threshold type
101+
if threshold_bytes is not None:
102+
103+
def is_free(used, total):
104+
return used <= threshold_bytes / 2**30
105+
else:
106+
107+
def is_free(used, total):
108+
return used / total <= threshold_ratio
109+
85110
while True:
86111
output: dict[int, str] = {}
87112
output_raw: dict[int, tuple[float, float]] = {}
@@ -97,33 +122,44 @@ def wait_for_gpu_memory_to_clear(
97122
gb_used = mem_info.used / 2**30
98123
gb_total = mem_info.total / 2**30
99124
output_raw[device] = (gb_used, gb_total)
100-
output[device] = f"{gb_used:.02f}/{gb_total:.02f}"
101-
102-
print("gpu memory used/total (GiB): ", end="")
103-
for k, v in output.items():
104-
print(f"{k}={v}; ", end="")
105-
print("")
106-
107-
if threshold_bytes is not None:
108-
109-
def is_free(used, total):
110-
return used <= threshold_bytes / 2**30 # noqa E731
125+
# Format to more readable form
126+
usage_percent = (gb_used / gb_total) * 100 if gb_total > 0 else 0
127+
output[device] = f"{gb_used:.1f}GiB/{gb_total:.1f}GiB ({usage_percent:.1f}%)"
111128

112-
threshold = f"{threshold_bytes / 2**30} GiB"
113-
else:
114-
115-
def is_free(used, total):
116-
return used / total <= threshold_ratio # noqa E731
117-
118-
threshold = f"{threshold_ratio:.2f}"
129+
# Optimized GPU memory status print
130+
print("[GPU Memory Status] Current usage:")
131+
for device_id, mem_info in output.items():
132+
print(f" GPU {device_id}: {mem_info}")
119133

134+
# Calculate waiting duration
120135
dur_s = time.time() - start_time
136+
elapsed_minutes = dur_s / 60
137+
138+
# Check if all devices meet the condition
121139
if all(is_free(used, total) for used, total in output_raw.values()):
122-
print(f"Done waiting for free GPU memory on devices {devices=} ({threshold=}) {dur_s=:.02f}")
140+
# Optimized completion message
141+
print(f"[GPU Memory Freed] Devices {device_list} meet memory condition")
142+
print(f" Condition: {condition_str}")
143+
print(f" Wait time: {dur_s:.1f} seconds ({elapsed_minutes:.1f} minutes)")
144+
print(" Final status:")
145+
for device_id, mem_info in output.items():
146+
print(f" GPU {device_id}: {mem_info}")
123147
break
124148

149+
# Check timeout
125150
if dur_s >= timeout_s:
126-
raise ValueError(f"Memory of devices {devices=} not free after {dur_s=:.02f} ({threshold=})")
151+
raise ValueError(
152+
f"[GPU Memory Timeout] Devices {device_list} still don't meet memory condition after {dur_s:.1f} seconds\n"
153+
f"Condition: {condition_str}\n"
154+
f"Current status:\n" + "\n".join(f" GPU {device}: {output[device]}" for device in devices)
155+
)
156+
157+
# Add waiting hint (optional)
158+
if dur_s > 10 and int(dur_s) % 10 == 0: # Show hint every 10 seconds
159+
print(f"Waiting... Already waited {dur_s:.1f} seconds ({elapsed_minutes:.1f} minutes)")
160+
161+
gc.collect()
162+
torch.cuda.empty_cache()
127163

128164
time.sleep(5)
129165

0 commit comments

Comments (0)