59 changes: 59 additions & 0 deletions .github/workflows/latest_tests.yaml
@@ -0,0 +1,59 @@
name: Tests on dev branch of vllm and transformers

on:
# Run automatically every Saturday at 00:00 UTC
schedule:
- cron: "0 0 * * 6"

# Allow manual triggering via GitHub UI
workflow_dispatch:

# Optional: run on pushes to main or release branches
push:
branches:
- main
- v*-release

pull_request:
branches:
- main

jobs:
run_tests:
name: Run tests on dev branch of vllm and transformers
runs-on: 'aws-g4dn-2xlarge-use1-public-80'
steps:
- name: Install Git LFS
run: |
sudo apt-get update && sudo apt-get install -y git-lfs
git lfs install

- name: Install Python development headers
run: sudo apt-get update && sudo apt-get install -y python3.10-dev

- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Install the project
run: |
uv sync --extra dev
VLLM_USE_PRECOMPILED=1 uv pip install --upgrade git+https://github.com/vllm-project/vllm.git@main
uv pip install --upgrade git+https://github.com/huggingface/transformers.git@main

- name: Run nvidia-smi
run: nvidia-smi

- name: Pip freeze
run: uv pip freeze

- name: Run tests
run: |
VLLM_WORKER_MULTIPROC_METHOD=spawn uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/test_vllm_model.py
uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/test_accelerate_model.py
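The test step exports `VLLM_WORKER_MULTIPROC_METHOD=spawn` before calling pytest; vLLM reads this variable to pick the multiprocessing start method for its worker processes, and `spawn` sidesteps the CUDA re-initialization errors that forked workers can hit. A minimal sketch of the same setting applied from Python (it has to run before the vLLM engine starts its workers):

```python
import os

# Must be set before vLLM spawns its worker processes; "spawn" avoids the
# CUDA re-initialization errors that the "fork" start method can trigger.
os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
```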
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -63,7 +63,7 @@ dependencies = [
"GitPython>=3.1.41", # for logging
"datasets>=4.0.0",
"pydantic",
"numpy>=2", # pinned to avoid incompatibilities
"numpy>=2,<2.3", # pinned to avoid incompatibilities
"hf-xet>=1.1.8", # pinned to avoid failing test suite
# Prettiness
"typer",
@@ -98,7 +98,7 @@ nanotron = [
"tensorboardX"
]
tensorboardX = ["tensorboardX"]
vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
vllm = ["vllm", "ray", "more_itertools"]
sglang = ["sglang"]
quality = ["ruff>=v0.11.0","pre-commit"]
tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
5 changes: 3 additions & 2 deletions src/lighteval/models/vllm/vllm_model.py
@@ -48,6 +48,7 @@
import ray
from more_itertools import distribute
from vllm import LLM, RequestOutput, SamplingParams
+ from vllm.inputs.data import TokensPrompt
from vllm.distributed.parallel_state import (
destroy_distributed_environment,
destroy_model_parallel,
@@ -291,7 +292,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
# Inferring the max length from the tokenizer makes vllm crash for models with mismatches between the model
# config and the tokenizer config, like mistralai/Mistral-7B-v0.1
if self._max_length is None:
- self._max_length = model.llm_engine.model_config.max_seq_len_to_capture
+ self._max_length = model.llm_engine.model_config.max_model_len

return model
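The fallback for `self._max_length` now comes from `max_model_len`, the context window the engine actually serves, rather than `max_seq_len_to_capture`, which only bounds the sequence lengths eligible for CUDA-graph capture and can be smaller than the real context window. A minimal sketch of reading the value through the same internal attributes the snippet above relies on (the model name is a placeholder):

```python
from vllm import LLM

# Placeholder model, purely for illustration.
llm = LLM(model="HuggingFaceTB/SmolLM2-135M-Instruct")

# The context window the engine will serve; this is what the fallback above
# now assigns to self._max_length.
print(llm.llm_engine.model_config.max_model_len)
```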

@@ -455,7 +456,7 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
]
else:
outputs = self.model.generate(
- prompt_token_ids=inputs,
+ prompts=[TokensPrompt(prompt_token_ids=input) for input in inputs],
sampling_params=sampling_params,
use_tqdm=True,
)