Add preemption handling test to CI

yangulei · yangulei · commit b0f4a6e86aa2 · 2026-01-19T17:50:56.000+08:00
Signed-off-by: Youlei Yang <youlei.yang@intel.com>
diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh
@@ -265,6 +265,13 @@ run_qwen2_5_vl_unified_attn_test() {
     echo "✅ Test multimodal-support + unified attention with qwen2.5-vl-7b passed."
 }
 
+# Preemption test: runs tests/full_tests/preemption.py with VLLM_SKIP_WARMUP=true and PT_HPU_LAZY_MODE=1
+run_preemption_test() {
+    echo "➡️ Testing preemption handling..."
+    VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 python -u "${VLLM_GAUDI_PREFIX}/tests/full_tests/preemption.py"
+    echo "✅ Test with preemption handling passed."  # NOTE(review): only meaningful if the script aborts on failure (e.g. set -e) -- confirm
+}
+
 # Spec decode with ngram
 run_spec_decode_ngram_test() {
     echo "➡️ Testing Spec-decode with ngram..."
diff --git a/tests/full_tests/preemption.py b/tests/full_tests/preemption.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm import LLM, SamplingParams
+
+# Four short prompts; main() submits them together in a single llm.generate() call.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# ignore_eos=True keeps generating past EOS, so each request can grow up to max_tokens=512 tokens.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=512, ignore_eos=True)
+
+
+def main():
+    # Build an engine whose KV cache is deliberately too small for the whole batch, to exercise preemption.
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        block_size=128,
+        max_model_len=1024,
+        max_num_batched_tokens=1024,
+        gpu_memory_utilization=0.9,
+        num_gpu_blocks_override=8,  # 8 blocks x block_size 128 = 1024 KV-cache token slots in total
+        disable_log_stats=False,  # stats logging stays on -- presumably to surface preemption counts; confirm
+    )
+    # Submit all prompts at once. Each can grow to max_tokens=512 tokens, which together exceeds
+    # the 1024 slots above, so the engine is expected to preempt requests (TODO confirm in logs).
+    # The result is a list of RequestOutput objects holding the prompt, generated text, and more.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print each prompt/completion pair; nothing is asserted here, so success means "no exception raised".
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not when imported
+    main()