Skip to content

Commit cf48588

Browse files
feat: create embeddings tracing implementation (envoyproxy#1240)
**Description** This adds embeddings tracing per OpenInference OpenAI semantics, and adjusts docs accordingly. **Related Issues/PRs** Fixes envoyproxy#1085 --------- Signed-off-by: Adrian Cole <[email protected]>
1 parent 67fa20b commit cf48588

35 files changed

+1673
-82
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ test-crdcel: apigen ## Run the integration tests of CEL validation in CRD defini
160160
test-extproc: build.extproc ## Run the integration tests for extproc without controller or k8s at all.
161161
@$(MAKE) build.testupstream CMD_PATH_PREFIX=tests/internal/testupstreamlib
162162
@echo "Run ExtProc test"
163-
@EXTPROC_BIN=$(OUTPUT_DIR)/extproc-$(shell go env GOOS)-$(shell go env GOARCH) go test ./tests/extproc/... $(GO_TEST_ARGS) $(GO_TEST_E2E_ARGS)
163+
@EXTPROC_BIN=$(OUTPUT_DIR)/extproc-$(shell go env GOOS)-$(shell go env GOARCH) go test -timeout=20m ./tests/extproc/... $(GO_TEST_ARGS) $(GO_TEST_E2E_ARGS)
164164

165165
# This runs the end-to-end tests for the controller with EnvTest.
166166
.PHONY: test-controller

cmd/aigw/.env.otel.console

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@ OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=delta
1010
# See https://github.com/Arize-ai/openinference/blob/main/spec/configuration.md
1111
OPENINFERENCE_HIDE_INPUTS=false
1212
OPENINFERENCE_HIDE_OUTPUTS=false
13+
# See https://github.com/Arize-ai/openinference/blob/main/spec/embedding_spans.md
14+
OPENINFERENCE_HIDE_EMBEDDINGS_TEXT=false
15+
OPENINFERENCE_HIDE_EMBEDDINGS_VECTORS=false

cmd/aigw/.env.otel.otel-tui

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=delta
1111
# See https://github.com/Arize-ai/openinference/blob/main/spec/configuration.md
1212
OPENINFERENCE_HIDE_INPUTS=false
1313
OPENINFERENCE_HIDE_OUTPUTS=false
14+
# See https://github.com/Arize-ai/openinference/blob/main/spec/embedding_spans.md
15+
OPENINFERENCE_HIDE_EMBEDDINGS_TEXT=false
16+
OPENINFERENCE_HIDE_EMBEDDINGS_VECTORS=false

cmd/aigw/.env.otel.phoenix

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ OTEL_BSP_SCHEDULE_DELAY=100
1111
# See https://github.com/Arize-ai/openinference/blob/main/spec/configuration.md
1212
OPENINFERENCE_HIDE_INPUTS=false
1313
OPENINFERENCE_HIDE_OUTPUTS=false
14+
# See https://github.com/Arize-ai/openinference/blob/main/spec/embedding_spans.md
15+
OPENINFERENCE_HIDE_EMBEDDINGS_TEXT=false
16+
OPENINFERENCE_HIDE_EMBEDDINGS_VECTORS=false

cmd/aigw/README.md

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,18 @@ Here are values we use for Ollama:
3636
The `chat-completion` service uses `curl` to send a simple chat completion
3737
request to the AI Gateway CLI (aigw) which routes it to Ollama.
3838
```bash
39-
docker compose run --rm --no-deps chat-completion
39+
docker compose run --rm chat-completion
4040
```
4141

42-
4. **Shutdown the example stack**:
42+
4. **Create embeddings**:
43+
44+
The `create-embeddings` service uses `curl` to send an embeddings request
45+
to the AI Gateway CLI (aigw) which routes it to Ollama.
46+
```bash
47+
docker compose run --rm create-embeddings
48+
```
49+
50+
5. **Shutdown the example stack**:
4351

4452
`down` stops the containers and removes the volumes used by the stack.
4553
```bash
@@ -120,9 +128,10 @@ This configures the OTLP endpoint to otel-tui on port 4318.
120128
- `otel-tui` - Export to otel-tui Terminal UI (also starts otel-tui service)
121129
- `phoenix` - Export to Phoenix (also starts Phoenix service)
122130

123-
2. **Send a test request**:
131+
2. **Send test requests**:
124132
```bash
125-
COMPOSE_PROFILES=<profile> docker compose -f docker-compose-otel.yaml run --build --rm --no-deps chat-completion
133+
COMPOSE_PROFILES=<profile> docker compose -f docker-compose-otel.yaml run --build --rm chat-completion
134+
COMPOSE_PROFILES=<profile> docker compose -f docker-compose-otel.yaml run --build --rm create-embeddings
126135
```
127136

128137
3. **Check telemetry output**:

cmd/aigw/docker-compose-otel.yaml

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,30 @@ services:
7575
build:
7676
context: ../../tests/internal/testopeninference
7777
dockerfile: Dockerfile.openai_client
78+
target: chat-completion
7879
container_name: chat-completion
80+
profiles: ["test"]
81+
env_file:
82+
- ../../.env.ollama
83+
- .env.otel.${COMPOSE_PROFILES:-console}
84+
environment:
85+
- OPENAI_BASE_URL=http://aigw:1975/v1
86+
- OPENAI_API_KEY=unused
87+
88+
# create-embeddings is the standard OpenAI client (`openai` in pip), instrumented
89+
# with the following OpenTelemetry instrumentation libraries:
90+
# - openinference-instrumentation-openai (embeddings spans)
91+
# - opentelemetry-instrumentation-httpx (HTTP client spans and trace headers)
92+
create-embeddings:
93+
build:
94+
context: ../../tests/internal/testopeninference
95+
dockerfile: Dockerfile.openai_client
96+
target: create-embeddings
97+
container_name: create-embeddings
98+
profiles: ["test"]
7999
env_file:
80100
- ../../.env.ollama
81101
- .env.otel.${COMPOSE_PROFILES:-console}
82102
environment:
83103
- OPENAI_BASE_URL=http://aigw:1975/v1
84104
- OPENAI_API_KEY=unused
85-
depends_on:
86-
aigw:
87-
condition: service_started

cmd/aigw/docker-compose.yaml

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,6 @@ services:
5050
profiles: ["test"]
5151
env_file:
5252
- ../../.env.ollama
53-
depends_on:
54-
aigw:
55-
condition: service_started
5653
command:
5754
- sh
5855
- -c
@@ -64,3 +61,22 @@ services:
6461
-d "{\"model\":\"$$CHAT_MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Answer in up to 3 words: Which ocean contains Bouvet Island?\"}]}"
6562
extra_hosts: # localhost:host-gateway trick doesn't work with aigw
6663
- "host.docker.internal:host-gateway"
64+
65+
# create-embeddings is a simple curl-based test client for sending embeddings requests to aigw.
66+
create-embeddings:
67+
image: golang:1.25
68+
container_name: create-embeddings
69+
profiles: ["test"]
70+
env_file:
71+
- ../../.env.ollama
72+
command:
73+
- sh
74+
- -c
75+
- |
76+
curl -s -w %{http_code} \
77+
-X POST http://aigw:1975/v1/embeddings \
78+
-H "Authorization: Bearer unused" \
79+
-H "Content-Type: application/json" \
80+
-d "{\"model\":\"$$EMBEDDINGS_MODEL\",\"input\":\"What is RAG?\"}"
81+
extra_hosts: # localhost:host-gateway trick doesn't work with aigw
82+
- "host.docker.internal:host-gateway"

internal/apischema/openai/openai.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,14 +272,22 @@ func (s *StringOrArray) UnmarshalJSON(data []byte) error {
272272
return nil
273273
}
274274

275-
// Try to unmarshal as array of ints (for token embeddings).
275+
// Try to unmarshal as array of ints (for single pre-tokenized sequence).
276276
var ints []int64
277277
err = json.Unmarshal(data, &ints)
278278
if err == nil {
279279
s.Value = ints
280280
return nil
281281
}
282282

283+
// Try to unmarshal as array of int arrays (for batch pre-tokenized sequences).
284+
var intArrays [][]int64
285+
err = json.Unmarshal(data, &intArrays)
286+
if err == nil {
287+
s.Value = intArrays
288+
return nil
289+
}
290+
283291
// Try to unmarshal as array of ChatCompletionContentPartTextParam (for chat completion).
284292
var arr []ChatCompletionContentPartTextParam
285293
err = json.Unmarshal(data, &arr)

internal/extproc/embeddings_processor.go

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@ import (
2828

2929
// EmbeddingsProcessorFactory returns a factory method to instantiate the embeddings processor.
3030
func EmbeddingsProcessorFactory(em metrics.EmbeddingsMetrics) ProcessorFactory {
31-
return func(config *processorConfig, requestHeaders map[string]string, logger *slog.Logger, _ tracing.Tracing, isUpstreamFilter bool) (Processor, error) {
31+
return func(config *processorConfig, requestHeaders map[string]string, logger *slog.Logger, tracing tracing.Tracing, isUpstreamFilter bool) (Processor, error) {
3232
logger = logger.With("processor", "embeddings", "isUpstreamFilter", fmt.Sprintf("%v", isUpstreamFilter))
3333
if !isUpstreamFilter {
3434
return &embeddingsProcessorRouterFilter{
3535
config: config,
36+
tracer: tracing.EmbeddingsTracer(),
3637
requestHeaders: requestHeaders,
3738
logger: logger,
3839
}, nil
@@ -67,6 +68,10 @@ type embeddingsProcessorRouterFilter struct {
6768
// when the request is retried.
6869
originalRequestBody *openai.EmbeddingRequest
6970
originalRequestBodyRaw []byte
71+
// tracer is the tracer used for requests.
72+
tracer tracing.EmbeddingsTracer
73+
// span is the tracing span for this request, created in ProcessRequestBody.
74+
span tracing.EmbeddingsSpan
7075
// upstreamFilterCount is the number of upstream filters that have been processed.
7176
// This is used to determine if the request is a retry request.
7277
upstreamFilterCount int
@@ -93,7 +98,7 @@ func (e *embeddingsProcessorRouterFilter) ProcessResponseBody(ctx context.Contex
9398
}
9499

95100
// ProcessRequestBody implements [Processor.ProcessRequestBody].
96-
func (e *embeddingsProcessorRouterFilter) ProcessRequestBody(_ context.Context, rawBody *extprocv3.HttpBody) (*extprocv3.ProcessingResponse, error) {
101+
func (e *embeddingsProcessorRouterFilter) ProcessRequestBody(ctx context.Context, rawBody *extprocv3.HttpBody) (*extprocv3.ProcessingResponse, error) {
97102
originalModel, body, err := parseOpenAIEmbeddingBody(rawBody)
98103
if err != nil {
99104
return nil, fmt.Errorf("failed to parse request body: %w", err)
@@ -110,13 +115,24 @@ func (e *embeddingsProcessorRouterFilter) ProcessRequestBody(_ context.Context,
110115
})
111116
e.originalRequestBody = body
112117
e.originalRequestBodyRaw = rawBody.Body
118+
119+
// Tracing may need to inject headers, so create a header mutation here.
120+
headerMutation := &extprocv3.HeaderMutation{
121+
SetHeaders: additionalHeaders,
122+
}
123+
e.span = e.tracer.StartSpanAndInjectHeaders(
124+
ctx,
125+
e.requestHeaders,
126+
headerMutation,
127+
body,
128+
rawBody.Body,
129+
)
130+
113131
return &extprocv3.ProcessingResponse{
114132
Response: &extprocv3.ProcessingResponse_RequestBody{
115133
RequestBody: &extprocv3.BodyResponse{
116134
Response: &extprocv3.CommonResponse{
117-
HeaderMutation: &extprocv3.HeaderMutation{
118-
SetHeaders: additionalHeaders,
119-
},
135+
HeaderMutation: headerMutation,
120136
ClearRouteCache: true,
121137
},
122138
},
@@ -146,13 +162,15 @@ type embeddingsProcessorUpstreamFilter struct {
146162
costs translator.LLMTokenUsage
147163
// metrics tracking.
148164
metrics metrics.EmbeddingsMetrics
165+
// span is the tracing span for this request, inherited from the router filter.
166+
span tracing.EmbeddingsSpan
149167
}
150168

151169
// selectTranslator selects the translator based on the output schema.
152170
func (e *embeddingsProcessorUpstreamFilter) selectTranslator(out filterapi.VersionedAPISchema) error {
153171
switch out.Name {
154172
case filterapi.APISchemaOpenAI:
155-
e.translator = translator.NewEmbeddingOpenAIToOpenAITranslator(out.Version, e.modelNameOverride)
173+
e.translator = translator.NewEmbeddingOpenAIToOpenAITranslator(out.Version, e.modelNameOverride, e.span)
156174
default:
157175
return fmt.Errorf("unsupported API schema: backend=%s", out)
158176
}
@@ -278,6 +296,13 @@ func (e *embeddingsProcessorUpstreamFilter) ProcessResponseBody(ctx context.Cont
278296
if err != nil {
279297
return nil, fmt.Errorf("failed to transform response error: %w", err)
280298
}
299+
if e.span != nil {
300+
b := bodyMutation.GetBody()
301+
if b == nil {
302+
b = body.Body
303+
}
304+
e.span.EndSpanOnError(code, b)
305+
}
281306
// Mark so the deferred handler records failure.
282307
recordRequestCompletionErr = true
283308
return &extprocv3.ProcessingResponse{
@@ -327,6 +352,9 @@ func (e *embeddingsProcessorUpstreamFilter) ProcessResponseBody(ctx context.Cont
327352
}
328353
}
329354

355+
if body.EndOfStream && e.span != nil {
356+
e.span.EndSpan()
357+
}
330358
return resp, nil
331359
}
332360

@@ -345,6 +373,10 @@ func (e *embeddingsProcessorUpstreamFilter) SetBackend(ctx context.Context, b *f
345373
e.metrics.SetBackend(b)
346374
e.modelNameOverride = b.ModelNameOverride
347375
e.backendName = b.Name
376+
e.originalRequestBody = rp.originalRequestBody
377+
e.originalRequestBodyRaw = rp.originalRequestBodyRaw
378+
e.onRetry = rp.upstreamFilterCount > 1
379+
e.span = rp.span
348380
if err = e.selectTranslator(b.Schema); err != nil {
349381
return fmt.Errorf("failed to select translator: %w", err)
350382
}
@@ -356,9 +388,6 @@ func (e *embeddingsProcessorUpstreamFilter) SetBackend(ctx context.Context, b *f
356388
// Update metrics with the overridden model
357389
e.metrics.SetRequestModel(e.modelNameOverride)
358390
}
359-
e.originalRequestBody = rp.originalRequestBody
360-
e.originalRequestBodyRaw = rp.originalRequestBodyRaw
361-
e.onRetry = rp.upstreamFilterCount > 1
362391
rp.upstreamFilter = e
363392
return
364393
}

internal/extproc/embeddings_processor_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ func Test_embeddingsProcessorUpstreamFilter_SelectTranslator(t *testing.T) {
5858

5959
func Test_embeddingsProcessorRouterFilter_ProcessRequestBody(t *testing.T) {
6060
t.Run("body parser error", func(t *testing.T) {
61-
p := &embeddingsProcessorRouterFilter{}
61+
p := &embeddingsProcessorRouterFilter{
62+
tracer: tracing.NoopEmbeddingsTracer{},
63+
}
6264
_, err := p.ProcessRequestBody(t.Context(), &extprocv3.HttpBody{Body: []byte("nonjson")})
6365
require.ErrorContains(t, err, "invalid character 'o' in literal null")
6466
})
@@ -70,6 +72,7 @@ func Test_embeddingsProcessorRouterFilter_ProcessRequestBody(t *testing.T) {
7072
config: &processorConfig{modelNameHeaderKey: modelKey},
7173
requestHeaders: headers,
7274
logger: slog.Default(),
75+
tracer: tracing.NoopEmbeddingsTracer{},
7376
}
7477
resp, err := p.ProcessRequestBody(t.Context(), &extprocv3.HttpBody{Body: embeddingBodyFromModel(t, "some-model")})
7578
require.NoError(t, err)

0 commit comments

Comments (0)