Skip to content

Commit cf48588

Browse files
feat: create embeddings tracing implementation (envoyproxy#1240)
**Description** This adds embeddings tracing per OpenInference OpenAI semantics, and adjusts docs accordingly. **Related Issues/PRs** Fixes envoyproxy#1085 --------- Signed-off-by: Adrian Cole <[email protected]>
1 parent 67fa20b commit cf48588

35 files changed

+1673
-82
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ test-crdcel: apigen ## Run the integration tests of CEL validation in CRD defini
160160
test-extproc: build.extproc ## Run the integration tests for extproc without controller or k8s at all.
161161
@$(MAKE) build.testupstream CMD_PATH_PREFIX=tests/internal/testupstreamlib
162162
@echo "Run ExtProc test"
163-
@EXTPROC_BIN=$(OUTPUT_DIR)/extproc-$(shell go env GOOS)-$(shell go env GOARCH) go test ./tests/extproc/... $(GO_TEST_ARGS) $(GO_TEST_E2E_ARGS)
163+
@EXTPROC_BIN=$(OUTPUT_DIR)/extproc-$(shell go env GOOS)-$(shell go env GOARCH) go test -timeout=20m ./tests/extproc/... $(GO_TEST_ARGS) $(GO_TEST_E2E_ARGS)
164164

165165
# This runs the end-to-end tests for the controller with EnvTest.
166166
.PHONY: test-controller

cmd/aigw/.env.otel.console

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@ OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=delta
1010
# See https://github.com/Arize-ai/openinference/blob/main/spec/configuration.md
1111
OPENINFERENCE_HIDE_INPUTS=false
1212
OPENINFERENCE_HIDE_OUTPUTS=false
13+
# See https://github.com/Arize-ai/openinference/blob/main/spec/embedding_spans.md
14+
OPENINFERENCE_HIDE_EMBEDDINGS_TEXT=false
15+
OPENINFERENCE_HIDE_EMBEDDINGS_VECTORS=false

cmd/aigw/.env.otel.otel-tui

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=delta
1111
# See https://github.com/Arize-ai/openinference/blob/main/spec/configuration.md
1212
OPENINFERENCE_HIDE_INPUTS=false
1313
OPENINFERENCE_HIDE_OUTPUTS=false
14+
# See https://github.com/Arize-ai/openinference/blob/main/spec/embedding_spans.md
15+
OPENINFERENCE_HIDE_EMBEDDINGS_TEXT=false
16+
OPENINFERENCE_HIDE_EMBEDDINGS_VECTORS=false

cmd/aigw/.env.otel.phoenix

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ OTEL_BSP_SCHEDULE_DELAY=100
1111
# See https://github.com/Arize-ai/openinference/blob/main/spec/configuration.md
1212
OPENINFERENCE_HIDE_INPUTS=false
1313
OPENINFERENCE_HIDE_OUTPUTS=false
14+
# See https://github.com/Arize-ai/openinference/blob/main/spec/embedding_spans.md
15+
OPENINFERENCE_HIDE_EMBEDDINGS_TEXT=false
16+
OPENINFERENCE_HIDE_EMBEDDINGS_VECTORS=false

cmd/aigw/README.md

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,18 @@ Here are values we use for Ollama:
3636
The `chat-completion` service uses `curl` to send a simple chat completion
3737
request to the AI Gateway CLI (aigw) which routes it to Ollama.
3838
```bash
39-
docker compose run --rm --no-deps chat-completion
39+
docker compose run --rm chat-completion
4040
```
4141

42-
4. **Shutdown the example stack**:
42+
4. **Create embeddings**:
43+
44+
The `create-embeddings` service uses `curl` to send an embeddings request
45+
to the AI Gateway CLI (aigw) which routes it to Ollama.
46+
```bash
47+
docker compose run --rm create-embeddings
48+
```
49+
50+
5. **Shutdown the example stack**:
4351

4452
`down` stops the containers and removes the volumes used by the stack.
4553
```bash
@@ -120,9 +128,10 @@ This configures the OTLP endpoint to otel-tui on port 4318.
120128
- `otel-tui` - Export to otel-tui Terminal UI (also starts otel-tui service)
121129
- `phoenix` - Export to Phoenix (also starts Phoenix service)
122130

123-
2. **Send a test request**:
131+
2. **Send test requests**:
124132
```bash
125-
COMPOSE_PROFILES=<profile> docker compose -f docker-compose-otel.yaml run --build --rm --no-deps chat-completion
133+
COMPOSE_PROFILES=<profile> docker compose -f docker-compose-otel.yaml run --build --rm chat-completion
134+
COMPOSE_PROFILES=<profile> docker compose -f docker-compose-otel.yaml run --build --rm create-embeddings
126135
```
127136

128137
3. **Check telemetry output**:

cmd/aigw/docker-compose-otel.yaml

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,30 @@ services:
7575
build:
7676
context: ../../tests/internal/testopeninference
7777
dockerfile: Dockerfile.openai_client
78+
target: chat-completion
7879
container_name: chat-completion
80+
profiles: ["test"]
81+
env_file:
82+
- ../../.env.ollama
83+
- .env.otel.${COMPOSE_PROFILES:-console}
84+
environment:
85+
- OPENAI_BASE_URL=http://aigw:1975/v1
86+
- OPENAI_API_KEY=unused
87+
88+
# create-embeddings is the standard OpenAI client (`openai` in pip), instrumented
89+
# with the following OpenTelemetry instrumentation libraries:
90+
# - openinference-instrumentation-openai (embeddings spans)
91+
# - opentelemetry-instrumentation-httpx (HTTP client spans and trace headers)
92+
create-embeddings:
93+
build:
94+
context: ../../tests/internal/testopeninference
95+
dockerfile: Dockerfile.openai_client
96+
target: create-embeddings
97+
container_name: create-embeddings
98+
profiles: ["test"]
7999
env_file:
80100
- ../../.env.ollama
81101
- .env.otel.${COMPOSE_PROFILES:-console}
82102
environment:
83103
- OPENAI_BASE_URL=http://aigw:1975/v1
84104
- OPENAI_API_KEY=unused
85-
depends_on:
86-
aigw:
87-
condition: service_started

cmd/aigw/docker-compose.yaml

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,6 @@ services:
5050
profiles: ["test"]
5151
env_file:
5252
- ../../.env.ollama
53-
depends_on:
54-
aigw:
55-
condition: service_started
5653
command:
5754
- sh
5855
- -c
@@ -64,3 +61,22 @@ services:
6461
-d "{\"model\":\"$$CHAT_MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Answer in up to 3 words: Which ocean contains Bouvet Island?\"}]}"
6562
extra_hosts: # localhost:host-gateway trick doesn't work with aigw
6663
- "host.docker.internal:host-gateway"
64+
65+
# create-embeddings is a simple curl-based test client for sending embeddings requests to aigw.
66+
create-embeddings:
67+
image: golang:1.25
68+
container_name: create-embeddings
69+
profiles: ["test"]
70+
env_file:
71+
- ../../.env.ollama
72+
command:
73+
- sh
74+
- -c
75+
- |
76+
curl -s -w %{http_code} \
77+
-X POST http://aigw:1975/v1/embeddings \
78+
-H "Authorization: Bearer unused" \
79+
-H "Content-Type: application/json" \
80+
-d "{\"model\":\"$$EMBEDDINGS_MODEL\",\"input\":\"What is RAG?\"}"
81+
extra_hosts: # localhost:host-gateway trick doesn't work with aigw
82+
- "host.docker.internal:host-gateway"

internal/apischema/openai/openai.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,14 +272,22 @@ func (s *StringOrArray) UnmarshalJSON(data []byte) error {
272272
return nil
273273
}
274274

275-
// Try to unmarshal as array of ints (for token embeddings).
275+
// Try to unmarshal as array of ints (for single pre-tokenized sequence).
276276
var ints []int64
277277
err = json.Unmarshal(data, &ints)
278278
if err == nil {
279279
s.Value = ints
280280
return nil
281281
}
282282

283+
// Try to unmarshal as array of int arrays (for batch pre-tokenized sequences).
284+
var intArrays [][]int64
285+
err = json.Unmarshal(data, &intArrays)
286+
if err == nil {
287+
s.Value = intArrays
288+
return nil
289+
}
290+
283291
// Try to unmarshal as array of ChatCompletionContentPartTextParam (for chat completion).
284292
var arr []ChatCompletionContentPartTextParam
285293
err = json.Unmarshal(data, &arr)

internal/extproc/embeddings_processor.go

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@ import (
2828

2929
// EmbeddingsProcessorFactory returns a factory method to instantiate the embeddings processor.
3030
func EmbeddingsProcessorFactory(em metrics.EmbeddingsMetrics) ProcessorFactory {
31-
return func(config *processorConfig, requestHeaders map[string]string, logger *slog.Logger, _ tracing.Tracing, isUpstreamFilter bool) (Processor, error) {
31+
return func(config *processorConfig, requestHeaders map[string]string, logger *slog.Logger, tracing tracing.Tracing, isUpstreamFilter bool) (Processor, error) {
3232
logger = logger.With("processor", "embeddings", "isUpstreamFilter", fmt.Sprintf("%v", isUpstreamFilter))
3333
if !isUpstreamFilter {
3434
return &embeddingsProcessorRouterFilter{
3535
config: config,
36+
tracer: tracing.EmbeddingsTracer(),
3637
requestHeaders: requestHeaders,
3738
logger: logger,
3839
}, nil
@@ -67,6 +68,10 @@ type embeddingsProcessorRouterFilter struct {
6768
// when the request is retried.
6869
originalRequestBody *openai.EmbeddingRequest
6970
originalRequestBodyRaw []byte
71+
// tracer is the tracer used for requests.
72+
tracer tracing.EmbeddingsTracer
73+
// span is the tracing span for this request, created in ProcessRequestBody.
74+
span tracing.EmbeddingsSpan
7075
// upstreamFilterCount is the number of upstream filters that have been processed.
7176
// This is used to determine if the request is a retry request.
7277
upstreamFilterCount int
@@ -93,7 +98,7 @@ func (e *embeddingsProcessorRouterFilter) ProcessResponseBody(ctx context.Contex
9398
}
9499

95100
// ProcessRequestBody implements [Processor.ProcessRequestBody].
96-
func (e *embeddingsProcessorRouterFilter) ProcessRequestBody(_ context.Context, rawBody *extprocv3.HttpBody) (*extprocv3.ProcessingResponse, error) {
101+
func (e *embeddingsProcessorRouterFilter) ProcessRequestBody(ctx context.Context, rawBody *extprocv3.HttpBody) (*extprocv3.ProcessingResponse, error) {
97102
originalModel, body, err := parseOpenAIEmbeddingBody(rawBody)
98103
if err != nil {
99104
return nil, fmt.Errorf("failed to parse request body: %w", err)
@@ -110,13 +115,24 @@ func (e *embeddingsProcessorRouterFilter) ProcessRequestBody(_ context.Context,
110115
})
111116
e.originalRequestBody = body
112117
e.originalRequestBodyRaw = rawBody.Body
118+
119+
// Tracing may need to inject headers, so create a header mutation here.
120+
headerMutation := &extprocv3.HeaderMutation{
121+
SetHeaders: additionalHeaders,
122+
}
123+
e.span = e.tracer.StartSpanAndInjectHeaders(
124+
ctx,
125+
e.requestHeaders,
126+
headerMutation,
127+
body,
128+
rawBody.Body,
129+
)
130+
113131
return &extprocv3.ProcessingResponse{
114132
Response: &extprocv3.ProcessingResponse_RequestBody{
115133
RequestBody: &extprocv3.BodyResponse{
116134
Response: &extprocv3.CommonResponse{
117-
HeaderMutation: &extprocv3.HeaderMutation{
118-
SetHeaders: additionalHeaders,
119-
},
135+
HeaderMutation: headerMutation,
120136
ClearRouteCache: true,
121137
},
122138
},
@@ -146,13 +162,15 @@ type embeddingsProcessorUpstreamFilter struct {
146162
costs translator.LLMTokenUsage
147163
// metrics tracking.
148164
metrics metrics.EmbeddingsMetrics
165+
// span is the tracing span for this request, inherited from the router filter.
166+
span tracing.EmbeddingsSpan
149167
}
150168

151169
// selectTranslator selects the translator based on the output schema.
152170
func (e *embeddingsProcessorUpstreamFilter) selectTranslator(out filterapi.VersionedAPISchema) error {
153171
switch out.Name {
154172
case filterapi.APISchemaOpenAI:
155-
e.translator = translator.NewEmbeddingOpenAIToOpenAITranslator(out.Version, e.modelNameOverride)
173+
e.translator = translator.NewEmbeddingOpenAIToOpenAITranslator(out.Version, e.modelNameOverride, e.span)
156174
default:
157175
return fmt.Errorf("unsupported API schema: backend=%s", out)
158176
}
@@ -278,6 +296,13 @@ func (e *embeddingsProcessorUpstreamFilter) ProcessResponseBody(ctx context.Cont
278296
if err != nil {
279297
return nil, fmt.Errorf("failed to transform response error: %w", err)
280298
}
299+
if e.span != nil {
300+
b := bodyMutation.GetBody()
301+
if b == nil {
302+
b = body.Body
303+
}
304+
e.span.EndSpanOnError(code, b)
305+
}
281306
// Mark so the deferred handler records failure.
282307
recordRequestCompletionErr = true
283308
return &extprocv3.ProcessingResponse{
@@ -327,6 +352,9 @@ func (e *embeddingsProcessorUpstreamFilter) ProcessResponseBody(ctx context.Cont
327352
}
328353
}
329354

355+
if body.EndOfStream && e.span != nil {
356+
e.span.EndSpan()
357+
}
330358
return resp, nil
331359
}
332360

@@ -345,6 +373,10 @@ func (e *embeddingsProcessorUpstreamFilter) SetBackend(ctx context.Context, b *f
345373
e.metrics.SetBackend(b)
346374
e.modelNameOverride = b.ModelNameOverride
347375
e.backendName = b.Name
376+
e.originalRequestBody = rp.originalRequestBody
377+
e.originalRequestBodyRaw = rp.originalRequestBodyRaw
378+
e.onRetry = rp.upstreamFilterCount > 1
379+
e.span = rp.span
348380
if err = e.selectTranslator(b.Schema); err != nil {
349381
return fmt.Errorf("failed to select translator: %w", err)
350382
}
@@ -356,9 +388,6 @@ func (e *embeddingsProcessorUpstreamFilter) SetBackend(ctx context.Context, b *f
356388
// Update metrics with the overridden model
357389
e.metrics.SetRequestModel(e.modelNameOverride)
358390
}
359-
e.originalRequestBody = rp.originalRequestBody
360-
e.originalRequestBodyRaw = rp.originalRequestBodyRaw
361-
e.onRetry = rp.upstreamFilterCount > 1
362391
rp.upstreamFilter = e
363392
return
364393
}

internal/extproc/embeddings_processor_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ func Test_embeddingsProcessorUpstreamFilter_SelectTranslator(t *testing.T) {
5858

5959
func Test_embeddingsProcessorRouterFilter_ProcessRequestBody(t *testing.T) {
6060
t.Run("body parser error", func(t *testing.T) {
61-
p := &embeddingsProcessorRouterFilter{}
61+
p := &embeddingsProcessorRouterFilter{
62+
tracer: tracing.NoopEmbeddingsTracer{},
63+
}
6264
_, err := p.ProcessRequestBody(t.Context(), &extprocv3.HttpBody{Body: []byte("nonjson")})
6365
require.ErrorContains(t, err, "invalid character 'o' in literal null")
6466
})
@@ -70,6 +72,7 @@ func Test_embeddingsProcessorRouterFilter_ProcessRequestBody(t *testing.T) {
7072
config: &processorConfig{modelNameHeaderKey: modelKey},
7173
requestHeaders: headers,
7274
logger: slog.Default(),
75+
tracer: tracing.NoopEmbeddingsTracer{},
7376
}
7477
resp, err := p.ProcessRequestBody(t.Context(), &extprocv3.HttpBody{Body: embeddingBodyFromModel(t, "some-model")})
7578
require.NoError(t, err)

0 commit comments

Comments (0)