1 | 1 | import os
2 | 2 | import re
3 | 3 | import shutil
  | 4 | +import sys
4 | 5 | from pathlib import Path
5 | 6 |
6 | 7 | import pandas as pd

14 | 15 | from tests.examples.utils import requires_gpu_count
15 | 16 | from tests.test_timer.timer_utils import get_singleton_manager, log_time
16 | 17 |
17 |    | -try:
18 |    | -    from vllm import LLM, SamplingParams
19 |    | -
20 |    | -    vllm_installed = True
21 |    | -except ImportError:
22 |    | -    vllm_installed = False
23 |    | -    logger.warning("vllm is not installed. This test will be skipped")
24 |    | -
25 |    | -
26 | 18 | HF_MODEL_HUB_NAME = "nm-testing"
27 | 19 |
28 | 20 | TEST_DATA_FILE = os.environ.get(
29 | 21 |     "TEST_DATA_FILE", "tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml"
30 | 22 | )
31 | 23 | SKIP_HF_UPLOAD = os.environ.get("SKIP_HF_UPLOAD", "")
   | 24 | +# vllm python environment
   | 25 | +VLLM_PYTHON_ENV = os.environ.get("VLLM_PYTHON_ENV", "same")
32 | 26 | TIMINGS_DIR = os.environ.get("TIMINGS_DIR", "timings/e2e-test_vllm")
33 | 27 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
34 | 28 | EXPECTED_SAVED_FILES = [

45 | 39 | @pytest.mark.parametrize(
46 | 40 |     "test_data_file", [pytest.param(TEST_DATA_FILE, id=TEST_DATA_FILE)]
47 | 41 | )
48 |    | -@pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test")
49 | 42 | class TestvLLM:
50 | 43 |     """
51 | 44 |     The following test quantizes a model using a preset scheme or recipe,
@@ -83,6 +76,12 @@ def set_up(self, test_data_file: str):
83 | 76 |         self.max_seq_length = eval_config.get("max_seq_length", 2048)
84 | 77 |         # GPU memory utilization - only set if explicitly provided in config
85 | 78 |         self.gpu_memory_utilization = eval_config.get("gpu_memory_utilization")
   | 79 | +        # vllm python env - if "same", use the current python env, otherwise use
   | 80 | +        # the python interpreter passed in VLLM_PYTHON_ENV
   | 81 | +        if VLLM_PYTHON_ENV.lower() != "same":
   | 82 | +            self.vllm_env = VLLM_PYTHON_ENV
   | 83 | +        else:
   | 84 | +            self.vllm_env = sys.executable
86 | 85 |
87 | 86 |         if not self.save_dir:
88 | 87 |             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
@@ -152,20 +151,12 @@ def test_vllm(self, test_data_file: str):
152 | 151 |                 folder_path=self.save_dir,
153 | 152 |             )
154 | 153 |
155 |     | -        logger.info("================= RUNNING vLLM =========================")
    | 154 | +        if VLLM_PYTHON_ENV.lower() == "same":
    | 155 | +            logger.info("========== RUNNING vLLM in the same python env ==========")
    | 156 | +        else:
    | 157 | +            logger.info("========== RUNNING vLLM in a separate python env ==========")
156 | 158 |
157 |     | -        outputs = self._run_vllm()
158 |     | -
159 |     | -        logger.info("================= vLLM GENERATION ======================")
160 |     | -        for output in outputs:
161 |     | -            assert output
162 |     | -            prompt = output.prompt
163 |     | -            generated_text = output.outputs[0].text
164 |     | -
165 |     | -            logger.info("PROMPT")
166 |     | -            logger.info(prompt)
167 |     | -            logger.info("GENERATED TEXT")
168 |     | -            logger.info(generated_text)
    | 159 | +        self._run_vllm(logger)
169 | 160 |
170 | 161 |         self.tear_down()
171 | 162 |
@@ -193,22 +184,36 @@ def _save_compressed_model(self, oneshot_model, tokenizer):
193 | 184 |         tokenizer.save_pretrained(self.save_dir)
194 | 185 |
195 | 186 |     @log_time
196 |     | -    def _run_vllm(self):
197 |     | -        import torch
    | 187 | +    def _run_vllm(self, logger):
    | 188 | +        import json
    | 189 | +        import subprocess
198 | 190 |
199 |     | -        sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
200 | 191 |         llm_kwargs = {"model": self.save_dir}
201 | 192 |
202 |     | -        if "W4A16_2of4" in self.scheme:
203 |     | -            # required by the kernel
204 |     | -            llm_kwargs["dtype"] = torch.float16
205 |     | -
206 | 193 |         if self.gpu_memory_utilization is not None:
207 | 194 |             llm_kwargs["gpu_memory_utilization"] = self.gpu_memory_utilization
208 | 195 |
209 |     | -        llm = LLM(**llm_kwargs)
210 |     | -        outputs = llm.generate(self.prompts, sampling_params)
211 |     | -        return outputs
    | 196 | +        json_scheme = json.dumps(self.scheme)
    | 197 | +        json_llm_kwargs = json.dumps(llm_kwargs)
    | 198 | +        json_prompts = json.dumps(self.prompts)
    | 199 | +
    | 200 | +        test_file_dir = os.path.dirname(os.path.abspath(__file__))
    | 201 | +        run_file_path = os.path.join(test_file_dir, "run_vllm.py")
    | 202 | +
    | 203 | +        logger.info("Run vllm in subprocess.Popen() using python env:")
    | 204 | +        logger.info(self.vllm_env)
    | 205 | +
    | 206 | +        result = subprocess.Popen(
    | 207 | +            [self.vllm_env, run_file_path, json_scheme, json_llm_kwargs, json_prompts],
    | 208 | +            stdout=subprocess.PIPE,
    | 209 | +            stderr=subprocess.PIPE,
    | 210 | +            text=True,
    | 211 | +        )
    | 212 | +        stdout, stderr = result.communicate()
    | 213 | +        logger.info(stdout)
    | 214 | +
    | 215 | +        error_msg = f"ERROR: vLLM failed with exit code {result.returncode}: {stderr}"
    | 216 | +        assert result.returncode == 0, error_msg
212 | 217 |
213 | 218 |     def _check_session_contains_recipe(self) -> None:
214 | 219 |         session = active_session()
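For reference, `test_vllm()` now delegates generation to a companion `run_vllm.py` script placed next to the test file, which does not appear in this diff. Below is a minimal sketch of what such a script might look like, assuming it reads the JSON-encoded scheme, LLM kwargs, and prompts from `sys.argv` in the order `_run_vllm()` passes them; the sampling parameters, the `W4A16_2of4` float16 requirement, and the prompt/generation printout are carried over from the removed in-process code, so the actual script in this PR may differ.

```python
# Hypothetical sketch of run_vllm.py -- the actual script added in this PR is
# not shown in the diff above.
import json
import sys

import torch
from vllm import LLM, SamplingParams


def main() -> None:
    # Positional arguments mirror the subprocess call in _run_vllm():
    #   <python> run_vllm.py <json_scheme> <json_llm_kwargs> <json_prompts>
    scheme = json.loads(sys.argv[1])
    llm_kwargs = json.loads(sys.argv[2])
    prompts = json.loads(sys.argv[3])

    # Carried over from the removed in-process code: the 2:4 sparse kernel
    # requires float16.
    if "W4A16_2of4" in scheme:
        llm_kwargs["dtype"] = torch.float16

    sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
    llm = LLM(**llm_kwargs)
    outputs = llm.generate(prompts, sampling_params)

    # Print results to stdout; the parent test captures and logs them.
    for output in outputs:
        assert output
        print("PROMPT")
        print(output.prompt)
        print("GENERATED TEXT")
        print(output.outputs[0].text)


if __name__ == "__main__":
    main()
```

With this split, the test can target a dedicated vLLM environment by pointing `VLLM_PYTHON_ENV` at another interpreter (e.g. `VLLM_PYTHON_ENV=/path/to/vllm-venv/bin/python`, path illustrative), while the default value `"same"` runs the child process with the current interpreter via `sys.executable`.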