
Commit 20a6580

Authored by Shern Shiou Tan <shernshiou@gmail.com>

[Feat] Include runner and convert flag (#803)

* feat: Include runner and convert flag
* chores: Add validation and description of runner and convert
* docs: Include runner and convert flag at helm README
* feat: Move runner and convert to vLLM Configuration

Signed-off-by: Shern Shiou Tan <shernshiou@gmail.com>

1 parent fd69fbc commit 20a6580

File tree

4 files changed: +20 -0 lines changed


helm/README.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -132,6 +132,8 @@ This table documents all available configuration values for the Production Stack
 | `servingEngineSpec.modelSpec[].vllmConfig.maxNumSeqs` | integer | `256` | Maximum number of sequences to be processed in a single iteration |
 | `servingEngineSpec.modelSpec[].vllmConfig.maxLoras` | integer | `0` | The maximum number of LoRA models to be loaded in a single batch |
 | `servingEngineSpec.modelSpec[].vllmConfig.gpuMemoryUtilization` | number | `0.9` | The fraction of GPU memory to be used for the model executor (0-1) |
+| `servingEngineSpec.modelSpec[].vllmConfig.runner` | string | `""` | The runner type for the model, can be "auto" or "pooling" |
+| `servingEngineSpec.modelSpec[].vllmConfig.convert` | string | `""` | The conversion type for the model, can be "token_embed", "embed", "token_classify", "classify", or "score" |
 | `servingEngineSpec.modelSpec[].vllmConfig.extraArgs` | list | `["--disable-log-requests"]` | Extra command line arguments to pass to vLLM |

 #### LMCache Configuration
```

helm/templates/deployment-vllm-multi.yaml

Lines changed: 8 additions & 0 deletions

```diff
@@ -168,6 +168,14 @@ spec:
 - "--max_loras"
 - {{ .maxLoras | quote }}
 {{- end }}
+{{- if hasKey . "runner" }}
+- "--runner"
+- {{ .runner | quote }}
+{{- end }}
+{{- if hasKey . "convert" }}
+- "--convert"
+- {{ .convert | quote }}
+{{- end }}
 {{- if .extraArgs }}
 {{- range .extraArgs }}
 - {{ . | quote }}
```
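The template change above guards each new flag with `hasKey`, so `--runner` and `--convert` are only appended when the key exists in `vllmConfig`. That assembly logic can be mirrored in plain Python for illustration; `build_vllm_args` is a hypothetical helper, not part of the chart:

```python
def build_vllm_args(vllm_config: dict) -> list:
    """Mirror the Helm template's conditional argument assembly:
    each optional key is emitted as a CLI flag pair only when present,
    matching the `{{- if hasKey . "runner" }}` style guards."""
    args = []
    if "maxLoras" in vllm_config:
        args += ["--max_loras", str(vllm_config["maxLoras"])]
    if "runner" in vllm_config:
        args += ["--runner", vllm_config["runner"]]
    if "convert" in vllm_config:
        args += ["--convert", vllm_config["convert"]]
    # extraArgs are appended verbatim, after the structured flags
    args += vllm_config.get("extraArgs", [])
    return args

print(build_vllm_args({"runner": "pooling", "convert": "embed"}))
# → ['--runner', 'pooling', '--convert', 'embed']
```

An absent key produces no flag at all, so existing deployments that never set `runner` or `convert` render exactly the same container args as before this commit.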

helm/values.schema.json

Lines changed: 8 additions & 0 deletions

```diff
@@ -208,6 +208,14 @@
 "dtype": {
 "type": "string"
 },
+"runner": {
+"type": "string",
+"enum": ["auto", "pooling"]
+},
+"convert": {
+"type": "string",
+"enum": ["token_embed", "embed", "token_classify", "classify", "score"]
+},
 "extraArgs": {
 "type": "array",
 "items": {
```
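The schema change constrains both fields to fixed enums, so invalid values are rejected at install time rather than surfacing as vLLM startup errors. A minimal stand-alone sketch of the same enum check (`validate_vllm_config` is hypothetical, not chart code):

```python
# Enum values copied from the values.schema.json fragment above.
ALLOWED = {
    "runner": {"auto", "pooling"},
    "convert": {"token_embed", "embed", "token_classify", "classify", "score"},
}

def validate_vllm_config(cfg: dict) -> list:
    """Return a list of error messages; an empty list means the config
    passes the same enum constraints the JSON schema enforces."""
    errors = []
    for key, allowed in ALLOWED.items():
        if key in cfg and cfg[key] not in allowed:
            errors.append(f"{key}: {cfg[key]!r} not one of {sorted(allowed)}")
    return errors

print(validate_vllm_config({"runner": "pooling", "convert": "embed"}))  # → []
print(validate_vllm_config({"runner": "gpu"}))
```

In the real chart this validation is performed by Helm against values.schema.json, so a typo such as `runner: "gpu"` fails `helm install` before any pods are created.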

helm/values.yaml

Lines changed: 2 additions & 0 deletions

```diff
@@ -82,6 +82,8 @@ servingEngineSpec:
 # - maxNumSeqs: (optional, int) Maximum number of sequences to be processed in a single iteration., e.g., 32
 # - maxLoras: (optional, int) The maximum number of LoRA models to be loaded in a single batch, e.g., 4
 # - gpuMemoryUtilization: (optional, float) The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. e.g., 0.95
+# - runner: (optional, string) The runner type for the model, can be "auto" or "pooling". e.g., "pooling"
+# - convert: (optional, string) The conversion type for the model, can be "token_embed", "embed", "token_classify", "classify", or "score". e.g., "embed"
 # - extraArgs: (optional, list) Extra command line arguments to pass to vLLM, e.g., ["--disable-log-requests"]
 #
 # - lmcacheConfig: (optional, map) The configuration of the LMCache for KV offloading, supported options are:
```
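Putting the two new keys together, a `modelSpec` entry using them might look like the following sketch (the model name is illustrative and other required `modelSpec` fields are omitted; only `runner` and `convert` come from this commit):

```yaml
servingEngineSpec:
  modelSpec:
    - name: "embedding-model"   # illustrative; other required modelSpec fields omitted
      vllmConfig:
        runner: "pooling"       # one of "auto", "pooling"
        convert: "embed"        # one of "token_embed", "embed", "token_classify", "classify", "score"
```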
