Skip to content

Commit 8ad0c46

Browse files
authored
Feat: Support Path Suffix for LLM Endpoints (#949)
* Feat: Support Path Suffix for LLM Endpoints

  Signed-off-by: bitliu <bitliu@tencent.com>

* Feat: Support Path Suffix for LLM Endpoints

  Signed-off-by: bitliu <bitliu@tencent.com>

---------

Signed-off-by: bitliu <bitliu@tencent.com>
1 parent ac423d7 commit 8ad0c46

File tree

4 files changed

+64
-21
lines changed

4 files changed

+64
-21
lines changed

src/semantic-router/cmd/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,12 +154,12 @@ func main() {
154154
// Initialize embedding models BEFORE creating server, this ensures Qwen3/Gemma models are ready when semantic cache is initialized
155155
// Use the already loaded config instead of calling config.Load() again
156156
if cfg.Qwen3ModelPath != "" || cfg.GemmaModelPath != "" {
157-
if err := candle_binding.InitEmbeddingModels(
157+
if initErr := candle_binding.InitEmbeddingModels(
158158
cfg.Qwen3ModelPath,
159159
cfg.GemmaModelPath,
160160
cfg.EmbeddingModels.UseCPU,
161-
); err != nil {
162-
logging.Errorf("Failed to initialize embedding models: %v", err)
161+
); initErr != nil {
162+
logging.Errorf("Failed to initialize embedding models: %v", initErr)
163163
logging.Warnf("Embedding API endpoints will return placeholder embeddings")
164164
} else {
165165
logging.Infof("Embedding models initialized successfully")

src/vllm-sr/cli/config_generator.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,23 @@ def generate_envoy_config_from_user_config(
9292
uses_dns = False
9393

9494
for endpoint in model.endpoints:
95-
# Parse endpoint (host:port or just host)
96-
if ":" in endpoint.endpoint:
97-
host, port = endpoint.endpoint.split(":", 1)
95+
# Parse endpoint: can be "host", "host:port", or "host/path" or "host:port/path"
96+
endpoint_str = endpoint.endpoint
97+
path = ""
98+
99+
# Extract path if present (e.g., "host/path" or "host:port/path")
100+
if "/" in endpoint_str:
101+
# Split by first "/" to separate host[:port] from path
102+
parts = endpoint_str.split("/", 1)
103+
endpoint_str = parts[0] # host or host:port
104+
path = "/" + parts[1] # /path
105+
106+
# Parse host and port
107+
if ":" in endpoint_str:
108+
host, port = endpoint_str.split(":", 1)
98109
port = int(port)
99110
else:
100-
host = endpoint.endpoint
111+
host = endpoint_str
101112
# Default port based on protocol
102113
port = 443 if endpoint.protocol == "https" else 80
103114

@@ -117,6 +128,7 @@ def generate_envoy_config_from_user_config(
117128
"name": endpoint.name,
118129
"address": host,
119130
"port": int(port),
131+
"path": path,
120132
"weight": endpoint.weight,
121133
"protocol": endpoint.protocol,
122134
"is_https": is_https,
@@ -131,13 +143,21 @@ def generate_envoy_config_from_user_config(
131143
# Domain names → LOGICAL_DNS, IP addresses → STATIC
132144
cluster_type = "LOGICAL_DNS" if uses_dns else "STATIC"
133145

146+
# Determine path prefix - use the first endpoint's path if all endpoints have the same path
147+
path_prefix = ""
148+
if endpoints:
149+
first_path = endpoints[0].get("path", "")
150+
if first_path and all(ep.get("path", "") == first_path for ep in endpoints):
151+
path_prefix = first_path
152+
134153
models.append(
135154
{
136155
"name": model.name,
137156
"cluster_name": cluster_name,
138157
"endpoints": endpoints,
139158
"cluster_type": cluster_type,
140159
"has_https": has_https,
160+
"path_prefix": path_prefix,
141161
}
142162
)
143163

src/vllm-sr/cli/merger.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -226,25 +226,40 @@ def translate_providers_to_router_format(providers) -> Dict[str, Any]:
226226

227227
# Add endpoints for this model
228228
for endpoint in model.endpoints:
229-
# Parse endpoint (host:port or just host)
230-
if ":" in endpoint.endpoint:
231-
host, port = endpoint.endpoint.split(":", 1)
229+
# Parse endpoint: can be "host", "host:port", or "host/path" or "host:port/path"
230+
endpoint_str = endpoint.endpoint
231+
path = ""
232+
233+
# Extract path if present (e.g., "host/path" or "host:port/path")
234+
if "/" in endpoint_str:
235+
# Split by first "/" to separate host[:port] from path
236+
parts = endpoint_str.split("/", 1)
237+
endpoint_str = parts[0] # host or host:port
238+
path = "/" + parts[1] # /path
239+
240+
# Parse host and port
241+
if ":" in endpoint_str:
242+
host, port = endpoint_str.split(":", 1)
232243
port = int(port)
233244
else:
234-
host = endpoint.endpoint
245+
host = endpoint_str
235246
# Use default port based on protocol
236247
port = 443 if endpoint.protocol == "https" else 80
237248

238-
vllm_endpoints.append(
239-
{
240-
"name": f"{model.name}_{endpoint.name}",
241-
"address": host,
242-
"port": port,
243-
"weight": endpoint.weight,
244-
"protocol": endpoint.protocol,
245-
"model": model.name,
246-
}
247-
)
249+
endpoint_config = {
250+
"name": f"{model.name}_{endpoint.name}",
251+
"address": host,
252+
"port": port,
253+
"weight": endpoint.weight,
254+
"protocol": endpoint.protocol,
255+
"model": model.name,
256+
}
257+
258+
# Add path if present
259+
if path:
260+
endpoint_config["path"] = path
261+
262+
vllm_endpoints.append(endpoint_config)
248263

249264
return {
250265
"vllm_endpoints": vllm_endpoints,

src/vllm-sr/cli/templates/envoy.template.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ static_resources:
3838
cluster: {{ model.cluster_name }}_cluster
3939
timeout: {{ listener.timeout | default('300s') }}
4040
idleTimeout: 300s
41+
{% if model.path_prefix %}
42+
# Prepend path prefix to all requests (e.g., /openapi + /v1/chat/completions = /openapi/v1/chat/completions)
43+
regex_rewrite:
44+
pattern:
45+
google_re2: {}
46+
regex: "^/"
47+
substitution: "{{ model.path_prefix }}/"
48+
{% endif %}
4149
{% endfor %}
4250
# Default route (no x-selected-model header)
4351
- match:

0 commit comments

Comments (0)