
Commit 1067aae

feat: add v1/models endpoint (#186)

* feat: add v1/models endpoint
* add port-endpoint mapping in api docs

Signed-off-by: JaredforReal <[email protected]>

1 parent 946585f · commit 1067aae

File tree

4 files changed: +177 lines, -0 lines

src/semantic-router/pkg/api/server.go

Lines changed: 57 additions & 0 deletions
````diff
@@ -48,6 +48,21 @@ type SystemInfo struct {
 	GPUAvailable bool `json:"gpu_available"`
 }
 
+// OpenAIModel represents a single model in the OpenAI /v1/models response
+type OpenAIModel struct {
+	ID      string `json:"id"`
+	Object  string `json:"object"`
+	Created int64  `json:"created"`
+	OwnedBy string `json:"owned_by"`
+	// Keeping the structure minimal; additional fields like permissions can be added later
+}
+
+// OpenAIModelList is the container for the models list response
+type OpenAIModelList struct {
+	Object string        `json:"object"`
+	Data   []OpenAIModel `json:"data"`
+}
+
 // BatchClassificationRequest represents a batch classification request
 type BatchClassificationRequest struct {
 	Texts []string `json:"texts"`
@@ -178,6 +193,9 @@ func (s *ClassificationAPIServer) setupRoutes() *http.ServeMux {
 	mux.HandleFunc("GET /info/models", s.handleModelsInfo)
 	mux.HandleFunc("GET /info/classifier", s.handleClassifierInfo)
 
+	// OpenAI-compatible endpoints
+	mux.HandleFunc("GET /v1/models", s.handleOpenAIModels)
+
 	// Metrics endpoints
 	mux.HandleFunc("GET /metrics/classification", s.handleClassificationMetrics)
 
@@ -355,6 +373,45 @@ func (s *ClassificationAPIServer) handleClassifierInfo(w http.ResponseWriter, r
 	})
 }
 
+// handleOpenAIModels handles OpenAI-compatible model listing at /v1/models
+// It returns all models discoverable from the router configuration plus a synthetic "auto" model.
+func (s *ClassificationAPIServer) handleOpenAIModels(w http.ResponseWriter, r *http.Request) {
+	now := time.Now().Unix()
+
+	// Start with the special "auto" model always available from the router
+	models := []OpenAIModel{
+		{
+			ID:      "auto",
+			Object:  "model",
+			Created: now,
+			OwnedBy: "semantic-router",
+		},
+	}
+
+	// Append underlying models from config (if available)
+	if s.config != nil {
+		for _, m := range s.config.GetAllModels() {
+			// Skip if already added as "auto" (or avoid duplicates in general)
+			if m == "auto" {
+				continue
+			}
+			models = append(models, OpenAIModel{
+				ID:      m,
+				Object:  "model",
+				Created: now,
+				OwnedBy: "upstream-endpoint",
+			})
+		}
+	}
+
+	resp := OpenAIModelList{
+		Object: "list",
+		Data:   models,
+	}
+
+	s.writeJSONResponse(w, http.StatusOK, resp)
+}
+
 func (s *ClassificationAPIServer) handleClassificationMetrics(w http.ResponseWriter, r *http.Request) {
 	s.writeErrorResponse(w, http.StatusNotImplemented, "NOT_IMPLEMENTED", "Classification metrics not implemented yet")
 }
````
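For context on how a client would consume the new handler, here is a minimal sketch (not part of the commit) that lists the models over HTTP. It assumes the Classification API is reachable at `http://localhost:8080` and simply mirrors the `OpenAIModel`/`OpenAIModelList` shapes added above.

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// Local mirrors of the response types added in server.go above.
type openAIModel struct {
	ID      string `json:"id"`
	Object  string `json:"object"`
	Created int64  `json:"created"`
	OwnedBy string `json:"owned_by"`
}

type openAIModelList struct {
	Object string        `json:"object"`
	Data   []openAIModel `json:"data"`
}

func main() {
	// Assumption: the Classification API is listening on localhost:8080.
	resp, err := http.Get("http://localhost:8080/v1/models")
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Fatalf("unexpected status: %s", resp.Status)
	}

	var list openAIModelList
	if err := json.NewDecoder(resp.Body).Decode(&list); err != nil {
		log.Fatalf("decode failed: %v", err)
	}

	// Print each model ID and its owner, e.g. "auto (semantic-router)".
	for _, m := range list.Data {
		fmt.Printf("%s (%s)\n", m.ID, m.OwnedBy)
	}
}
```

Because the handler appends the synthetic `auto` entry before the configured models, running this against a configured router should print `auto` first, followed by the models declared under `vllm_endpoints` in the config.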

src/semantic-router/pkg/api/server_test.go

Lines changed: 57 additions & 0 deletions
````diff
@@ -248,3 +248,60 @@ func TestBatchClassificationConfiguration(t *testing.T) {
 		})
 	}
 }
+
+func TestOpenAIModelsEndpoint(t *testing.T) {
+	cfg := &config.RouterConfig{
+		VLLMEndpoints: []config.VLLMEndpoint{
+			{
+				Name:    "primary",
+				Address: "localhost",
+				Port:    8000,
+				Models:  []string{"gpt-4o-mini", "llama-3.1-8b-instruct"},
+				Weight:  1,
+			},
+		},
+	}
+
+	apiServer := &ClassificationAPIServer{
+		classificationSvc: services.NewPlaceholderClassificationService(),
+		config:            cfg,
+	}
+
+	req := httptest.NewRequest("GET", "/v1/models", nil)
+	rr := httptest.NewRecorder()
+
+	apiServer.handleOpenAIModels(rr, req)
+
+	if rr.Code != http.StatusOK {
+		t.Fatalf("expected 200 OK, got %d", rr.Code)
+	}
+
+	var resp OpenAIModelList
+	if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("failed to parse response: %v", err)
+	}
+
+	if resp.Object != "list" {
+		t.Errorf("expected object 'list', got %s", resp.Object)
+	}
+
+	// Build a set for easy lookup
+	got := map[string]bool{}
+	for _, m := range resp.Data {
+		got[m.ID] = true
+		if m.Object != "model" {
+			t.Errorf("expected each item.object to be 'model', got %s", m.Object)
+		}
+		if m.Created == 0 {
+			t.Errorf("expected created timestamp to be non-zero")
+		}
+	}
+
+	// Must contain 'auto' and the configured models
+	if !got["auto"] {
+		t.Errorf("expected list to contain 'auto'")
+	}
+	if !got["gpt-4o-mini"] || !got["llama-3.1-8b-instruct"] {
+		t.Errorf("expected configured models to be present, got=%v", got)
+	}
+}
````

website/docs/api/classification.md

Lines changed: 18 additions & 0 deletions
````diff
@@ -18,6 +18,24 @@ The Classification API server runs alongside the main Semantic Router ExtProc se
 - **ExtProc Server**: `http://localhost:50051` (gRPC for Envoy integration)
 - **Metrics Server**: `http://localhost:9190` (Prometheus metrics)
 
+### Endpoint-to-port mapping (quick reference)
+
+- Port 8080 (this API)
+  - `GET /v1/models` (OpenAI-compatible model list, includes `auto`)
+  - `GET /health`
+  - `GET /info/models`, `GET /info/classifier`
+  - `POST /api/v1/classify/intent|pii|security|batch`
+
+- Port 8801 (Envoy public entry)
+  - Typically proxies `POST /v1/chat/completions` to upstream LLMs while invoking ExtProc (50051).
+  - You can expose `GET /v1/models` at 8801 by adding an Envoy route that forwards to `router:8080`.
+
+- Port 50051 (ExtProc, gRPC)
+  - Used by Envoy for external processing of requests; not an HTTP endpoint.
+
+- Port 9190 (Prometheus)
+  - `GET /metrics`
+
 Start the server with:
 
 ```bash
````
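As a rough way to sanity-check the mapping documented above, the sketch below (illustrative only, not part of the commit) probes the HTTP ports using their documented defaults — 8080 for the Classification API and 9190 for Prometheus metrics; adjust hosts and ports if your deployment differs.

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	client := &http.Client{Timeout: 3 * time.Second}

	// Endpoint-to-port pairs taken from the quick reference above;
	// the ports are the documented defaults, not guaranteed for every setup.
	checks := []string{
		"http://localhost:8080/health",    // Classification API health
		"http://localhost:8080/v1/models", // OpenAI-compatible model list
		"http://localhost:9190/metrics",   // Prometheus metrics
	}

	for _, url := range checks {
		resp, err := client.Get(url)
		if err != nil {
			fmt.Printf("%-40s unreachable: %v\n", url, err)
			continue
		}
		resp.Body.Close()
		fmt.Printf("%-40s %s\n", url, resp.Status)
	}
}
```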

website/docs/api/router.md

Lines changed: 45 additions & 0 deletions
````diff
@@ -6,6 +6,28 @@ The Semantic Router provides a gRPC-based API that integrates seamlessly with En
 
 The Semantic Router operates as an ExtProc server that processes HTTP requests through Envoy Proxy. It doesn't expose direct REST endpoints but rather processes OpenAI-compatible API requests routed through Envoy.
 
+> Note: In addition to the ExtProc path, this project also starts a lightweight HTTP Classification API on port 8080 for health/info and classification utilities. The OpenAI-compatible `/v1/models` endpoint is provided by this HTTP API (8080) and can be optionally exposed through Envoy (8801) via routing rules.
+
+### Ports and endpoint mapping
+
+- 8801 (HTTP, Envoy public entry)
+  - Typical client entry for OpenAI-compatible requests like `POST /v1/chat/completions`.
+  - Can proxy `GET /v1/models` to Router 8080 if you add an Envoy route; otherwise `/v1/models` at 8801 may return “no healthy upstream”.
+
+- 8080 (HTTP, Classification API)
+  - `GET /v1/models` → OpenAI-compatible model list (includes synthetic `auto`)
+  - `GET /health` → Classification API health
+  - `GET /info/models` → Loaded classifier models + system info
+  - `GET /info/classifier` → Classifier configuration details
+  - `POST /api/v1/classify/intent|pii|security|batch` → Direct classification utilities
+
+- 50051 (gRPC, ExtProc)
+  - Envoy External Processing (ExtProc) for in-path classification/routing of `/v1/chat/completions`.
+  - Not an HTTP port; not directly accessible via curl.
+
+- 9190 (HTTP, Prometheus metrics)
+  - `GET /metrics` → Prometheus scrape endpoint (global process metrics).
+
 ### Request Flow
 
 ```mermaid
@@ -30,6 +52,29 @@ sequenceDiagram
 
 The router processes standard OpenAI API requests:
 
+### Models Endpoint
+
+Lists available models and includes a synthetic "auto" model that uses the router's intent classification to select the best underlying model per request.
+
+- Endpoint: `GET /v1/models`
+- Response:
+
+```json
+{
+  "object": "list",
+  "data": [
+    { "id": "auto", "object": "model", "created": 1726890000, "owned_by": "semantic-router" },
+    { "id": "gpt-4o-mini", "object": "model", "created": 1726890000, "owned_by": "upstream-endpoint" },
+    { "id": "llama-3.1-8b-instruct", "object": "model", "created": 1726890000, "owned_by": "upstream-endpoint" }
+  ]
+}
+```
+
+Notes:
+
+- The concrete model list is sourced from your configured vLLM endpoints in `config.yaml` (see `vllm_endpoints[].models`).
+- The special `auto` model is always present and instructs the router to classify and route to the best backend model automatically.
+
 ### Chat Completions Endpoint
 
 **Endpoint:** `POST /v1/chat/completions`
````
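The `auto` model returned by `/v1/models` is meant to be used in ordinary chat-completion calls, with the router classifying the prompt and picking a backend model. The snippet below is a rough sketch of such a call through the Envoy entry point; it assumes the default `localhost:8801` listener from the docs above, and the request fields beyond `model` and `messages` are standard OpenAI chat-completion fields rather than anything this commit defines.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
)

type chatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

type chatRequest struct {
	Model    string        `json:"model"`
	Messages []chatMessage `json:"messages"`
}

func main() {
	// "auto" asks the router to classify the prompt and choose a backend model.
	reqBody, err := json.Marshal(chatRequest{
		Model: "auto",
		Messages: []chatMessage{
			{Role: "user", Content: "Explain what a semantic router does in one sentence."},
		},
	})
	if err != nil {
		log.Fatalf("marshal failed: %v", err)
	}

	// Assumption: Envoy is listening on localhost:8801 (the documented public entry).
	resp, err := http.Post("http://localhost:8801/v1/chat/completions",
		"application/json", bytes.NewReader(reqBody))
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Printf("status: %s\nbody: %s\n", resp.Status, body)
}
```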
