1 | 1 | import os
2 | 2 | import re
3 | 3 | import shutil
  | 4 | +import sys
4 | 5 | from pathlib import Path
5 | 6 |
6 | 7 | import pandas as pd

14 | 15 | from tests.examples.utils import requires_gpu_count
15 | 16 | from tests.test_timer.timer_utils import get_singleton_manager, log_time
16 | 17 |
17 |    | -try:
18 |    | -    from vllm import LLM, SamplingParams
19 |    | -
20 |    | -    vllm_installed = True
21 |    | -except ImportError:
22 |    | -    vllm_installed = False
23 |    | -    logger.warning("vllm is not installed. This test will be skipped")
24 |    | -
25 |    | -
26 | 18 | HF_MODEL_HUB_NAME = "nm-testing"
27 | 19 |
28 | 20 | TEST_DATA_FILE = os.environ.get(
29 | 21 |     "TEST_DATA_FILE", "tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml"
30 | 22 | )
31 | 23 | SKIP_HF_UPLOAD = os.environ.get("SKIP_HF_UPLOAD", "")
   | 24 | +# vllm python environment
   | 25 | +VLLM_PYTHON_ENV = os.environ.get("VLLM_PYTHON_ENV", "same")
32 | 26 | TIMINGS_DIR = os.environ.get("TIMINGS_DIR", "timings/e2e-test_vllm")
33 | 27 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
34 | 28 | EXPECTED_SAVED_FILES = [

45 | 39 | @pytest.mark.parametrize(
46 | 40 |     "test_data_file", [pytest.param(TEST_DATA_FILE, id=TEST_DATA_FILE)]
47 | 41 | )
48 |    | -@pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test")
49 | 42 | class TestvLLM:
50 | 43 |     """
51 | 44 |     The following test quantizes a model using a preset scheme or recipe,
@@ -83,6 +76,12 @@ def set_up(self, test_data_file: str):
83 | 76 |         self.max_seq_length = eval_config.get("max_seq_length", 2048)
84 | 77 |         # GPU memory utilization - only set if explicitly provided in config
85 | 78 |         self.gpu_memory_utilization = eval_config.get("gpu_memory_utilization")
   | 79 | +        # vllm python env - if "same", use the current python env, otherwise use
   | 80 | +        # the python interpreter passed in VLLM_PYTHON_ENV
   | 81 | +        if VLLM_PYTHON_ENV.lower() != "same":
   | 82 | +            self.vllm_env = VLLM_PYTHON_ENV
   | 83 | +        else:
   | 84 | +            self.vllm_env = sys.executable
86 | 85 |
87 | 86 |         if not self.save_dir:
88 | 87 |             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
@@ -152,20 +151,12 @@ def test_vllm(self, test_data_file: str):
152 | 151 |                 folder_path=self.save_dir,
153 | 152 |             )
154 | 153 |
155 |     | -        logger.info("================= RUNNING vLLM =========================")
    | 154 | +        if VLLM_PYTHON_ENV.lower() == "same":
    | 155 | +            logger.info("========== RUNNING vLLM in the same python env ==========")
    | 156 | +        else:
    | 157 | +            logger.info("========== RUNNING vLLM in a separate python env ==========")
156 | 158 |
157 |     | -        outputs = self._run_vllm()
158 |     | -
159 |     | -        logger.info("================= vLLM GENERATION ======================")
160 |     | -        for output in outputs:
161 |     | -            assert output
162 |     | -            prompt = output.prompt
163 |     | -            generated_text = output.outputs[0].text
164 |     | -
165 |     | -            logger.info("PROMPT")
166 |     | -            logger.info(prompt)
167 |     | -            logger.info("GENERATED TEXT")
168 |     | -            logger.info(generated_text)
    | 159 | +        self._run_vllm(logger)
169 | 160 |
170 | 161 |         self.tear_down()
171 | 162 |
@@ -193,22 +184,36 @@ def _save_compressed_model(self, oneshot_model, tokenizer):
193 | 184 |         tokenizer.save_pretrained(self.save_dir)
194 | 185 |
195 | 186 |     @log_time
196 |     | -    def _run_vllm(self):
197 |     | -        import torch
    | 187 | +    def _run_vllm(self, logger):
    | 188 | +        import json
    | 189 | +        import subprocess
198 | 190 |
199 |     | -        sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
200 | 191 |         llm_kwargs = {"model": self.save_dir}
201 | 192 |
202 |     | -        if "W4A16_2of4" in self.scheme:
203 |     | -            # required by the kernel
204 |     | -            llm_kwargs["dtype"] = torch.float16
205 |     | -
206 | 193 |         if self.gpu_memory_utilization is not None:
207 | 194 |             llm_kwargs["gpu_memory_utilization"] = self.gpu_memory_utilization
208 | 195 |
209 |     | -        llm = LLM(**llm_kwargs)
210 |     | -        outputs = llm.generate(self.prompts, sampling_params)
211 |     | -        return outputs
    | 196 | +        json_scheme = json.dumps(self.scheme)
    | 197 | +        json_llm_kwargs = json.dumps(llm_kwargs)
    | 198 | +        json_prompts = json.dumps(self.prompts)
    | 199 | +
    | 200 | +        test_file_dir = os.path.dirname(os.path.abspath(__file__))
    | 201 | +        run_file_path = os.path.join(test_file_dir, "run_vllm.py")
    | 202 | +
    | 203 | +        logger.info("Run vllm in subprocess.Popen() using python env:")
    | 204 | +        logger.info(self.vllm_env)
    | 205 | +
    | 206 | +        result = subprocess.Popen(
    | 207 | +            [self.vllm_env, run_file_path, json_scheme, json_llm_kwargs, json_prompts],
    | 208 | +            stdout=subprocess.PIPE,
    | 209 | +            stderr=subprocess.PIPE,
    | 210 | +            text=True,
    | 211 | +        )
    | 212 | +        stdout, stderr = result.communicate()
    | 213 | +        logger.info(stdout)
    | 214 | +
    | 215 | +        error_msg = f"ERROR: vLLM failed with exit code {result.returncode}: {stderr}"
    | 216 | +        assert result.returncode == 0, error_msg
212 | 217 |
213 | 218 |     def _check_session_contains_recipe(self) -> None:
214 | 219 |         session = active_session()
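For reference, `test_vllm()` now delegates generation to a companion `run_vllm.py` script placed next to the test file, which does not appear in this diff. Below is a minimal sketch of what such a script might look like, assuming it reads the JSON-encoded scheme, LLM kwargs, and prompts from `sys.argv` in the order `_run_vllm()` passes them; the sampling parameters, the `W4A16_2of4` float16 requirement, and the prompt/generation printout are carried over from the removed in-process code, so the actual script in this PR may differ.

```python
# Hypothetical sketch of run_vllm.py -- the actual script added in this PR is
# not shown in the diff above.
import json
import sys

import torch
from vllm import LLM, SamplingParams


def main() -> None:
    # Positional arguments mirror the subprocess call in _run_vllm():
    #   <python> run_vllm.py <json_scheme> <json_llm_kwargs> <json_prompts>
    scheme = json.loads(sys.argv[1])
    llm_kwargs = json.loads(sys.argv[2])
    prompts = json.loads(sys.argv[3])

    # Carried over from the removed in-process code: the 2:4 sparse kernel
    # requires float16.
    if "W4A16_2of4" in scheme:
        llm_kwargs["dtype"] = torch.float16

    sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
    llm = LLM(**llm_kwargs)
    outputs = llm.generate(prompts, sampling_params)

    # Print results to stdout; the parent test captures and logs them.
    for output in outputs:
        assert output
        print("PROMPT")
        print(output.prompt)
        print("GENERATED TEXT")
        print(output.outputs[0].text)


if __name__ == "__main__":
    main()
```

With this split, the test can target a dedicated vLLM environment by pointing `VLLM_PYTHON_ENV` at another interpreter (e.g. `VLLM_PYTHON_ENV=/path/to/vllm-venv/bin/python`, path illustrative), while the default value `"same"` runs the child process with the current interpreter via `sys.executable`.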