Skip to content

Commit b4041d4

Browse files
committed
Initial distributed tracing instrumentation
- Add OpenTelemetry distributed tracing support to kv-cache-manager - Instrument Redis operations with automatic OTel tracing and metrics
1 parent 2605059 commit b4041d4

File tree

6 files changed

+72
-4
lines changed

6 files changed

+72
-4
lines changed

examples/kv_cache_index/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ func runPrompts(ctx context.Context, kvCacheIndexer *kvcache.Indexer) error {
132132
ModelName: modelName,
133133
ChunkHash: h,
134134
}
135-
}), []kvblock.PodEntry{{"pod1", "gpu"}})
135+
}), []kvblock.PodEntry{{PodIdentifier: "pod1", DeviceTier: "gpu"}})
136136

137137
// Sleep 3 secs
138138
time.Sleep(3 * time.Second)

go.mod

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ require (
1111
github.com/pebbe/zmq4 v1.4.0
1212
github.com/prometheus/client_golang v1.22.0
1313
github.com/prometheus/client_model v0.6.1
14+
github.com/redis/go-redis/extra/redisotel/v9 v9.7.3
1415
github.com/redis/go-redis/v9 v9.7.3
1516
github.com/stretchr/testify v1.10.0
1617
github.com/vmihailenco/msgpack/v5 v5.4.1
18+
go.opentelemetry.io/otel v1.36.0
19+
go.opentelemetry.io/otel/trace v1.36.0
1720
k8s.io/apimachinery v0.33.0
1821
k8s.io/client-go v0.33.0
1922
k8s.io/klog/v2 v2.130.1
@@ -26,6 +29,7 @@ require (
2629
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
2730
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
2831
github.com/go-logr/logr v1.4.2 // indirect
32+
github.com/go-logr/stdr v1.2.2 // indirect
2933
github.com/go-openapi/jsonpointer v0.21.0 // indirect
3034
github.com/go-openapi/jsonreference v0.20.2 // indirect
3135
github.com/go-openapi/swag v0.23.0 // indirect
@@ -43,13 +47,16 @@ require (
4347
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
4448
github.com/prometheus/common v0.62.0 // indirect
4549
github.com/prometheus/procfs v0.15.1 // indirect
50+
github.com/redis/go-redis/extra/rediscmd/v9 v9.7.3 // indirect
4651
github.com/stretchr/objx v0.5.2 // indirect
4752
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
4853
github.com/x448/float16 v0.8.4 // indirect
4954
github.com/yuin/gopher-lua v1.1.1 // indirect
55+
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
56+
go.opentelemetry.io/otel/metric v1.36.0 // indirect
5057
golang.org/x/net v0.38.0 // indirect
5158
golang.org/x/oauth2 v0.27.0 // indirect
52-
golang.org/x/sys v0.31.0 // indirect
59+
golang.org/x/sys v0.33.0 // indirect
5360
golang.org/x/term v0.30.0 // indirect
5461
golang.org/x/text v0.23.0 // indirect
5562
golang.org/x/time v0.9.0 // indirect

go.sum

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@ github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxER
2121
github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
2222
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
2323
github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
24+
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
2425
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
2526
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
27+
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
28+
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
2629
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
2730
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
2831
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
@@ -88,6 +91,10 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ
8891
github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I=
8992
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
9093
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
94+
github.com/redis/go-redis/extra/rediscmd/v9 v9.7.3 h1:1AXQZkJkFxGV3f78mSnUI70l0orO6FHnYoSmBos8SZM=
95+
github.com/redis/go-redis/extra/rediscmd/v9 v9.7.3/go.mod h1:OgkpkwJYex1oyVAabK+VhVUKhUXw8uZUfewJYH1wG90=
96+
github.com/redis/go-redis/extra/redisotel/v9 v9.7.3 h1:ICBA9xYh+SmZqMfBtjKpp1ohi/V5R1TEZglLZc8IxTc=
97+
github.com/redis/go-redis/extra/redisotel/v9 v9.7.3/go.mod h1:DMzxd0CDyZ9VFw9sEPIVpIgKTAaubfGuaPQSUaS7/fo=
9198
github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM=
9299
github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA=
93100
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
@@ -115,6 +122,16 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
115122
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
116123
github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
117124
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
125+
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
126+
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
127+
go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg=
128+
go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E=
129+
go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE=
130+
go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs=
131+
go.opentelemetry.io/otel/sdk v1.33.0 h1:iax7M131HuAm9QkZotNHEfstof92xM+N8sr3uHXc2IM=
132+
go.opentelemetry.io/otel/sdk v1.33.0/go.mod h1:A1Q5oi7/9XaMlIWzPSxLRWOI8nG3FnzHJNbiENQuihM=
133+
go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w=
134+
go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA=
118135
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
119136
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
120137
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
@@ -136,8 +153,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
136153
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
137154
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
138155
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
139-
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
140-
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
156+
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
157+
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
141158
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
142159
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
143160
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=

pkg/kvcache/indexer.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ import (
2020
"context"
2121
"fmt"
2222

23+
"go.opentelemetry.io/otel"
24+
"go.opentelemetry.io/otel/attribute"
2325
"k8s.io/apimachinery/pkg/util/sets"
2426
"k8s.io/klog/v2"
2527

@@ -117,6 +119,17 @@ func (k *Indexer) KVBlockIndex() kvblock.Index {
117119
func (k *Indexer) GetPodScores(ctx context.Context, prompt, modelName string,
118120
podIdentifiers []string,
119121
) (map[string]int, error) {
122+
tracer := otel.GetTracerProvider().Tracer("llm-d-kv-cache")
123+
ctx, span := tracer.Start(ctx, "kv-cache-manager.GetPodScores")
124+
defer span.End()
125+
126+
span.SetAttributes(
127+
attribute.String("component", "llm-d-kv-cache-manager"),
128+
attribute.String("operation", "get_pod_scores"),
129+
attribute.String("gen_ai.request.model", modelName),
130+
attribute.Int("llm_d.kv_cache.pod_count", len(podIdentifiers)),
131+
)
132+
120133
traceLogger := klog.FromContext(ctx).V(logging.TRACE).WithName("kvcache.GetPodScores")
121134
// 0. add to tokenizers pool
122135
k.tokenizersPool.AddTask(prompt, modelName)
@@ -135,6 +148,7 @@ func (k *Indexer) GetPodScores(ctx context.Context, prompt, modelName string,
135148
// 3. query kvblock indexer for pods
136149
keyToPods, err := k.kvBlockIndex.Lookup(ctx, blockKeys, sets.New(podIdentifiers...))
137150
if err != nil {
151+
span.RecordError(err)
138152
return nil, fmt.Errorf("failed to query kvblock indexer: %w", err)
139153
}
140154
traceLogger.Info("found block keys", "block-keys", blockKeys,
@@ -143,10 +157,27 @@ func (k *Indexer) GetPodScores(ctx context.Context, prompt, modelName string,
143157
// 4. score pods
144158
podScores, err := k.kvBlockScorer.Score(blockKeys, keyToPods)
145159
if err != nil {
160+
span.RecordError(err)
146161
return nil, fmt.Errorf("failed to query kvblock scorer: %w", err)
147162
}
148163
traceLogger.Info("found pod scores", "pod-scores", podScores)
149164

165+
// Calculate hit ratio for observability
166+
totalPods := len(podIdentifiers)
167+
if totalPods == 0 {
168+
// If no specific pods requested, use all pods with scores
169+
totalPods = len(podScores)
170+
}
171+
172+
var hitRatio float64
173+
if totalPods > 0 {
174+
hitRatio = float64(len(podScores)) / float64(totalPods)
175+
}
176+
177+
span.SetAttributes(
178+
attribute.Float64("llm_d.kv_cache.hit_ratio", hitRatio),
179+
)
180+
150181
return podScores, nil
151182
}
152183

pkg/kvcache/kvblock/index.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ type IndexConfig struct {
4646
func DefaultIndexConfig() *IndexConfig {
4747
return &IndexConfig{
4848
InMemoryConfig: DefaultInMemoryIndexConfig(),
49+
RedisConfig: DefaultRedisIndexConfig(),
4950
EnableMetrics: false,
5051
}
5152
}

pkg/kvcache/kvblock/redis.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"strings"
2424
"time"
2525

26+
"github.com/redis/go-redis/extra/redisotel/v9"
2627
"github.com/redis/go-redis/v9"
2728
"k8s.io/apimachinery/pkg/util/sets"
2829
"k8s.io/klog/v2"
@@ -57,6 +58,17 @@ func NewRedisIndex(config *RedisIndexConfig) (Index, error) {
5758
}
5859

5960
redisClient := redis.NewClient(redisOpt)
61+
62+
// Enable automatic OpenTelemetry tracing for Redis operations
63+
if err := redisotel.InstrumentTracing(redisClient); err != nil {
64+
return nil, fmt.Errorf("failed to instrument Redis tracing: %w", err)
65+
}
66+
67+
// Enable automatic OpenTelemetry metrics for Redis operations
68+
if err := redisotel.InstrumentMetrics(redisClient); err != nil {
69+
return nil, fmt.Errorf("failed to instrument Redis metrics: %w", err)
70+
}
71+
6072
if err := redisClient.Ping(context.Background()).Err(); err != nil {
6173
return nil, fmt.Errorf("failed to connect to Redis: %w", err)
6274
}

0 commit comments

Comments
 (0)