runpod-workers · TimPietruskyRunPod · Nov 3, 2025 · Oct 10, 2025
diff --git a/.runpod/hub.json b/.runpod/hub.json
@@ -21,21 +21,29 @@
         }
       },
       {
-        "key": "HF_TOKEN",
+        "key": "TOOL_CALL_PARSER",
         "input": {
-          "name": "Access Token",
+          "name": "Tool Call Parser",
           "type": "string",
-          "description": "Hugging Face access token for gated & private models",
+          "description": "Defines the parser used to interpret tool call responses",
           "default": "",
-          "required": false
+          "required": false,
+          "advanced": true,
+          "options": [
+            { "value": "llama3", "label": "llama3" },
+            { "value": "llama4", "label": "llama4" },
+            { "value": "mistral", "label": "mistral" },
+            { "value": "qwen25", "label": "qwen25" },
+            { "value": "deepseekv3", "label": "deepseekv3" }
+          ]
         }
       },
       {
-        "key": "TOOL_CALL_PARSER",
+        "key": "REASONING_PARSER",
         "input": {
-          "name": "Tool Call Parser",
+          "name": "Reasoning Parser",
           "type": "string",
-          "description": "Defines the parser used to interpret tool call responses",
+          "description": "Defines the parser used to interpret reasoning traces",
           "default": "",
           "required": false,
           "advanced": true,

diff --git a/Dockerfile b/Dockerfile
@@ -1,8 +1,8 @@
-FROM lmsysorg/sglang:v0.4.6.post4-cu124
+FROM lmsysorg/sglang:v0.5.2-cu126
 
 # Install uv package manager
 RUN curl -Ls https://astral.sh/uv/install.sh | sh \
-    && ln -s /root/.local/bin/uv /usr/local/bin/uv
+    && ln -sf /root/.local/bin/uv /usr/local/bin/uv
 ENV PATH="/root/.local/bin:${PATH}"
 
 # Set working directory to the one already used by the base image

diff --git a/README.md b/README.md
@@ -51,7 +51,19 @@ All behaviour is controlled through environment variables:
 | `ENABLE_P2P_CHECK`                | Enable P2P check for GPU access                   | false                                 | boolean (true or false)                                                                   |
 | `ENABLE_FLASHINFER_MLA`           | Enable FlashInfer MLA optimization                | false                                 | boolean (true or false)                                                                   |
 | `TRITON_ATTENTION_REDUCE_IN_FP32` | Cast Triton attention reduce op to FP32           | false                                 | boolean (true or false)                                                                   |
-| `TOOL_CALL_PARSER`                | Defines the parser used to interpret responses    | qwen25                                | "llama3", "llama4", "mistral", "qwen25", "deepseekv3"                                     |
+| `TOOL_CALL_PARSER`                | Parser used to interpret tool call responses      |                                       | "llama3", "llama4", "mistral", "qwen25", "deepseekv3"                                     |
+| `REASONING_PARSER`                | Defines the parser used for reasoning traces      |                                       | SGLang reasoning parser name, e.g. "deepseek-r1", "qwen3", "kimi"                         |
+
+## Tool/Function Calling and Reasoning
+
+- **Tool/Function calling**: Set the `TOOL_CALL_PARSER` environment variable to match your model family. Supported values: `llama3`, `llama4`, `mistral`, `qwen25`, `deepseekv3`. If unset, this worker does not pass `--tool-call-parser` to SGLang.
+
+  - Example (docker-compose): add `TOOL_CALL_PARSER=llama3` under `environment:`.
+  - Example (RunPod Hub): set the `TOOL_CALL_PARSER` env var in the UI.
+
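+- **Reasoning**: Set the `REASONING_PARSER` environment variable to the SGLang reasoning parser for your model family if you want reasoning traces to be parsed. Reasoning parser names differ from tool-call parser names (e.g. `deepseek-r1`, `qwen3`). If unset, this worker does not pass `--reasoning-parser` to SGLang.
+  - Example (docker-compose): uncomment the `REASONING_PARSER` line under `environment:` and set it to your model's parser (e.g. `REASONING_PARSER=deepseek-r1`).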
+  - Example (RunPod Hub): set the `REASONING_PARSER` env var in the UI.
 
 ## API Usage
 

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -19,6 +19,7 @@ services:
       - ATTENTION_BACKEND=flashinfer
       - SAMPLING_BACKEND=flashinfer
       - TOOL_CALL_PARSER=llama3
+      # - REASONING_PARSER=deepseek-r1  # uncomment for reasoning models; use the SGLang parser matching your model
       - HF_TOKEN=${HF_TOKEN}
 
       # make it work locally with <= 8 GB VRAM

diff --git a/docs/conventions.md b/docs/conventions.md
@@ -61,3 +61,21 @@ chore(deps): update requirements.txt
 - Test changes before committing
 - Write descriptive commit messages
 - Keep commits focused and atomic
+
+## Configuration Conventions
+
+- Single source of truth: use `.runpod/hub.json` for endpoint configuration.
+
+  - Define environment variables, UI options, and allowed CUDA versions here.
+  - Do not add or rely on `worker-config.json` (removed).
+
+- CUDA policy:
+
+  - Minimum supported CUDA is 12.6.
+  - Base images must match this (e.g., `lmsysorg/sglang:vX.Y.Z-cu126`).
+  - Keep `allowedCudaVersions` in `hub.json` at 12.6 or higher.
+
+- Tool/function calling and reasoning:
+  - `TOOL_CALL_PARSER`: required to enable tool/function calling; no runtime default is applied. If unset, `--tool-call-parser` is not passed to SGLang.
+  - `REASONING_PARSER`: required to enable reasoning trace parsing; no runtime default is applied. If unset, `--reasoning-parser` is not passed to SGLang.
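+  - Choose a parser matching the model family. Tool-call parsers include `llama3`, `llama4`, `mistral`, `qwen25`, `deepseekv3`; reasoning parsers use SGLang's separate `--reasoning-parser` names (e.g., `deepseek-r1`, `qwen3`).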
diff --git a/engine.py b/engine.py
@@ -60,7 +60,8 @@ def start_server(self):
             "LOAD_BALANCE_METHOD": "--load-balance-method",
             "ATTENTION_BACKEND": "--attention-backend",
             "SAMPLING_BACKEND": "--sampling-backend",
-            "TOOL_CALL_PARSER": "--tool-call-parser"
+            "TOOL_CALL_PARSER": "--tool-call-parser",
+            "REASONING_PARSER": "--reasoning-parser",
         }
 
         # Boolean flags