Skip to content

Commit 20e7310

Browse files
yenuo26, wangyu31577, and hsliuustc0106
authored
[Test]Delete skip mark for amd ci test and fix CI failure (vllm-project#927)
Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
Signed-off-by: Hongsheng Liu <liuhongsheng4@huawei.com>
Co-authored-by: wangyu31577 <wangyu31577@hundsun.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
1 parent 741f7e2 commit 20e7310

File tree

6 files changed

+78
-62
lines changed

6 files changed

+78
-62
lines changed

tests/conftest.py

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
if "VLLM_TARGET_DEVICE" not in os.environ:
1111
os.environ["VLLM_TARGET_DEVICE"] = "cpu"
1212

13+
import gc
1314
import socket
1415
import subprocess
1516
import sys
@@ -59,13 +60,12 @@ def clean_gpu_memory_between_tests():
5960
_run_post_test_cleanup()
6061

6162

62-
def _run_pre_test_cleanup():
63-
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
63+
def _run_pre_test_cleanup(enable_force=False):
64+
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force:
6465
print("GPU cleanup disabled")
6566
return
6667

6768
print("Pre-test GPU status:")
68-
_print_simple_gpu_status()
6969

7070
num_gpus = torch.cuda.device_count()
7171
if num_gpus > 0:
@@ -74,44 +74,25 @@ def _run_pre_test_cleanup():
7474

7575
wait_for_gpu_memory_to_clear(
7676
devices=list(range(num_gpus)),
77-
threshold_ratio=0.1,
77+
threshold_ratio=0.05,
7878
)
7979
except Exception as e:
8080
print(f"Pre-test cleanup note: {e}")
8181

8282

83-
def _run_post_test_cleanup():
84-
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
83+
def _run_post_test_cleanup(enable_force=False):
84+
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force:
85+
print("GPU cleanup disabled")
8586
return
8687

87-
import gc
88-
8988
if torch.cuda.is_available():
9089
gc.collect()
9190
torch.cuda.empty_cache()
9291

9392
print("Post-test GPU status:")
94-
_print_simple_gpu_status()
9593
_print_gpu_processes()
9694

9795

98-
def _print_simple_gpu_status():
99-
"""Print simple GPU memory status"""
100-
if not torch.cuda.is_available():
101-
print(" CUDA not available")
102-
return
103-
104-
num_devices = torch.cuda.device_count()
105-
for device_id in range(num_devices):
106-
try:
107-
torch.cuda.set_device(device_id)
108-
allocated = torch.cuda.memory_allocated(device_id) / (1024**2)
109-
reserved = torch.cuda.memory_reserved(device_id) / (1024**2)
110-
print(f" GPU {device_id}: Allocated: {allocated:.1f}MB, Reserved: {reserved:.1f}MB")
111-
except Exception:
112-
print(f" GPU {device_id}: Error reading status")
113-
114-
11596
def _print_gpu_processes():
11697
"""Print GPU information including nvidia-smi and system processes"""
11798

@@ -871,6 +852,9 @@ def __init__(
871852
*,
872853
env_dict: dict[str, str] | None = None,
873854
) -> None:
855+
_run_pre_test_cleanup(enable_force=True)
856+
_run_post_test_cleanup(enable_force=True)
857+
cleanup_dist_env_and_memory()
874858
self.model = model
875859
self.serve_args = serve_args
876860
self.env_dict = env_dict
@@ -986,5 +970,6 @@ def __enter__(self):
986970
def __exit__(self, exc_type, exc_val, exc_tb):
987971
if self.proc:
988972
self._kill_process_tree(self.proc.pid)
989-
_run_post_test_cleanup()
973+
_run_pre_test_cleanup(enable_force=True)
974+
_run_post_test_cleanup(enable_force=True)
990975
cleanup_dist_env_and_memory()

tests/e2e/offline_inference/conftest.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from vllm import TextPrompt
1111
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
1212

13-
from tests.conftest import _run_post_test_cleanup
13+
from tests.conftest import _run_post_test_cleanup, _run_pre_test_cleanup
1414
from vllm_omni.entrypoints.omni import Omni
1515
from vllm_omni.inputs.data import OmniSamplingParams
1616
from vllm_omni.outputs import OmniRequestOutput
@@ -51,6 +51,9 @@ def __init__(
5151
stage_configs_path: Optional path to YAML stage config file
5252
**kwargs: Additional arguments passed to Omni
5353
"""
54+
cleanup_dist_env_and_memory()
55+
_run_pre_test_cleanup(enable_force=True)
56+
_run_post_test_cleanup(enable_force=True)
5457
self.model_name = model_name
5558
self.seed = seed
5659

@@ -337,7 +340,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
337340
self.close()
338341
del self.omni
339342
cleanup_dist_env_and_memory()
340-
_run_post_test_cleanup()
343+
_run_post_test_cleanup(enable_force=True)
341344

342345
def close(self):
343346
"""Close and cleanup the Omni instance."""

tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
# Stage 0: Thinker (multimodal understanding + text generation)
33
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
44
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
5-
65
# The following config has been verified on 2x H100-80G GPUs.
76
stage_args:
87
- stage_id: 0
@@ -22,7 +21,6 @@ stage_args:
2221
enable_prefix_caching: false
2322
hf_config_name: thinker_config
2423
tensor_parallel_size: 2
25-
load_format: dummy
2624
final_output: true
2725
final_output_type: text
2826
is_comprehension: true
@@ -52,15 +50,14 @@ stage_args:
5250
enable_prefix_caching: false
5351
distributed_executor_backend: "mp"
5452
hf_config_name: talker_config
55-
load_format: dummy
5653
engine_input_source: [0]
5754
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
5855
# final_output: true
5956
# final_output_type: text
6057
default_sampling_params:
6158
temperature: 0.9
6259
top_k: 50
63-
max_tokens: 100
60+
max_tokens: 1000
6461
seed: 42
6562
detokenize: False
6663
repetition_penalty: 1.05
@@ -83,7 +80,6 @@ stage_args:
8380
distributed_executor_backend: "mp"
8481
max_num_batched_tokens: 1000000
8582
hf_config_name: thinker_config
86-
load_format: dummy
8783
async_scheduling: false
8884
engine_input_source: [1]
8985
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
@@ -93,7 +89,7 @@ stage_args:
9389
temperature: 0.0
9490
top_p: 1.0
9591
top_k: -1
96-
max_tokens: 200
92+
max_tokens: 2000
9793
seed: 42
9894
detokenize: True
9995
repetition_penalty: 1.1

tests/e2e/online_serving/test_qwen3_omni.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def dummy_messages_from_video_data(
133133

134134
def get_prompt(prompt_type="text_only"):
135135
prompts = {
136-
"text_only": "What is the capital of China?",
136+
"text_only": "What is the capital of China? Answer in 20 words.",
137137
"mix": "What is recited in the audio? What is in this image? Describe the video briefly.",
138138
}
139139
return prompts.get(prompt_type, prompts["text_only"])
@@ -144,9 +144,6 @@ def get_max_batch_size(size_type="few"):
144144
return batch_sizes.get(size_type, 5)
145145

146146

147-
@pytest.mark.skipif(
148-
current_omni_platform.is_rocm(), reason="Test skipped on AMD environment due to known output issues"
149-
)
150147
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
151148
def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> None:
152149
"""
@@ -226,9 +223,6 @@ def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> N
226223
assert similarity > 0.9, "The audio content is not same as the text"
227224

228225

229-
@pytest.mark.skipif(
230-
current_omni_platform.is_rocm(), reason="Test skipped on AMD environment due to known output issues"
231-
)
232226
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
233227
def test_text_to_text_audio_001(client: openai.OpenAI, omni_server) -> None:
234228
"""

tests/e2e/stage_configs/qwen3_omni_ci.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ stage_args:
2020
engine_output_type: latent # Output hidden states for talker
2121
distributed_executor_backend: "mp"
2222
max_num_batched_tokens: 32768
23+
max_model_len: 32768
2324
enable_prefix_caching: false
2425
hf_config_name: thinker_config
2526
tensor_parallel_size: 2
@@ -51,6 +52,7 @@ stage_args:
5152
engine_output_type: latent # Output codec codes for code2wav
5253
enable_prefix_caching: false
5354
max_num_batched_tokens: 32768
55+
max_model_len: 32768
5456
distributed_executor_backend: "mp"
5557
hf_config_name: talker_config
5658
engine_input_source: [0]

tests/utils.py

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,36 @@ def wait_for_gpu_memory_to_clear(
7777
threshold_ratio: float | None = None,
7878
timeout_s: float = 120,
7979
) -> None:
80+
import gc
81+
8082
assert threshold_bytes is not None or threshold_ratio is not None
8183
# Use nvml instead of pytorch to reduce measurement error from torch cuda
8284
# context.
8385
devices = get_physical_device_indices(devices)
8486
start_time = time.time()
87+
88+
# Print waiting start information
89+
device_list = ", ".join(str(d) for d in devices)
90+
if threshold_bytes is not None:
91+
threshold_str = f"{threshold_bytes / 2**30:.2f} GiB"
92+
condition_str = f"Memory usage ≤ {threshold_str}"
93+
else:
94+
threshold_percent = threshold_ratio * 100
95+
threshold_str = f"{threshold_percent:.1f}%"
96+
condition_str = f"Memory usage ratio ≤ {threshold_str}"
97+
98+
print(f"[GPU Memory Monitor] Waiting for GPU {device_list} to free memory, Condition: {condition_str}")
99+
100+
# Define the is_free function based on threshold type
101+
if threshold_bytes is not None:
102+
103+
def is_free(used, total):
104+
return used <= threshold_bytes / 2**30
105+
else:
106+
107+
def is_free(used, total):
108+
return used / total <= threshold_ratio
109+
85110
while True:
86111
output: dict[int, str] = {}
87112
output_raw: dict[int, tuple[float, float]] = {}
@@ -97,33 +122,44 @@ def wait_for_gpu_memory_to_clear(
97122
gb_used = mem_info.used / 2**30
98123
gb_total = mem_info.total / 2**30
99124
output_raw[device] = (gb_used, gb_total)
100-
output[device] = f"{gb_used:.02f}/{gb_total:.02f}"
101-
102-
print("gpu memory used/total (GiB): ", end="")
103-
for k, v in output.items():
104-
print(f"{k}={v}; ", end="")
105-
print("")
106-
107-
if threshold_bytes is not None:
108-
109-
def is_free(used, total):
110-
return used <= threshold_bytes / 2**30 # noqa E731
125+
# Format to more readable form
126+
usage_percent = (gb_used / gb_total) * 100 if gb_total > 0 else 0
127+
output[device] = f"{gb_used:.1f}GiB/{gb_total:.1f}GiB ({usage_percent:.1f}%)"
111128

112-
threshold = f"{threshold_bytes / 2**30} GiB"
113-
else:
114-
115-
def is_free(used, total):
116-
return used / total <= threshold_ratio # noqa E731
117-
118-
threshold = f"{threshold_ratio:.2f}"
129+
# Optimized GPU memory status print
130+
print("[GPU Memory Status] Current usage:")
131+
for device_id, mem_info in output.items():
132+
print(f" GPU {device_id}: {mem_info}")
119133

134+
# Calculate waiting duration
120135
dur_s = time.time() - start_time
136+
elapsed_minutes = dur_s / 60
137+
138+
# Check if all devices meet the condition
121139
if all(is_free(used, total) for used, total in output_raw.values()):
122-
print(f"Done waiting for free GPU memory on devices {devices=} ({threshold=}) {dur_s=:.02f}")
140+
# Optimized completion message
141+
print(f"[GPU Memory Freed] Devices {device_list} meet memory condition")
142+
print(f" Condition: {condition_str}")
143+
print(f" Wait time: {dur_s:.1f} seconds ({elapsed_minutes:.1f} minutes)")
144+
print(" Final status:")
145+
for device_id, mem_info in output.items():
146+
print(f" GPU {device_id}: {mem_info}")
123147
break
124148

149+
# Check timeout
125150
if dur_s >= timeout_s:
126-
raise ValueError(f"Memory of devices {devices=} not free after {dur_s=:.02f} ({threshold=})")
151+
raise ValueError(
152+
f"[GPU Memory Timeout] Devices {device_list} still don't meet memory condition after {dur_s:.1f} seconds\n"
153+
f"Condition: {condition_str}\n"
154+
f"Current status:\n" + "\n".join(f" GPU {device}: {output[device]}" for device in devices)
155+
)
156+
157+
# Add waiting hint (optional)
158+
if dur_s > 10 and int(dur_s) % 10 == 0: # Show hint every 10 seconds
159+
print(f"Waiting... Already waited {dur_s:.1f} seconds ({elapsed_minutes:.1f} minutes)")
160+
161+
gc.collect()
162+
torch.cuda.empty_cache()
127163

128164
time.sleep(5)
129165

0 commit comments

Comments (0)