Merge pull request #272 from runpod-workers/feat/vllm-0.16.0

velaraptor-runpod · web-flow · commit 17efb0e7d088 · 2026-03-05T13:06:45.000-06:00
feat: Update vLLM to 0.16.0 and remove the NUM_GPU_BLOCKS_OVERRIDE hub option
diff --git a/.runpod/hub.json b/.runpod/hub.json
@@ -280,15 +280,6 @@
           "advanced": true
         }
       },
-      {
-        "key": "NUM_GPU_BLOCKS_OVERRIDE",
-        "input": {
-          "name": "Num GPU Blocks Override",
-          "type": "number",
-          "description": "If specified, ignore GPU profiling result and use this number of GPU blocks.",
-          "advanced": true
-        }
-      },
       {
         "key": "MAX_NUM_BATCHED_TOKENS",
         "input": {
diff --git a/Dockerfile b/Dockerfile
@@ -7,7 +7,7 @@ RUN ldconfig /usr/local/cuda-12.9/compat/
 
-# Install vLLM with FlashInfer - use CUDA 12.8 PyTorch wheels (compatible with vLLM 0.15.1)
+# Install vLLM 0.16.0 with FlashInfer - use CUDA 12.9 PyTorch wheels (cu129 index)
 RUN python3 -m pip install --upgrade pip && \
-    python3 -m pip install "vllm[flashinfer]==0.15.1" --extra-index-url https://download.pytorch.org/whl/cu129
+    python3 -m pip install "vllm[flashinfer]==0.16.0" --extra-index-url https://download.pytorch.org/whl/cu129