Skip to content

Commit d2c7459

Browse files
shunjiazhusjy3
authored andcommitted
add agentkit_runtime_operation_latency metric for agentkit
1 parent 97cbdd7 commit d2c7459

File tree

4 files changed

+98
-9
lines changed

4 files changed

+98
-9
lines changed

observability/constant.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ const (
6868
// APMPlus specific metrics
6969
MetricNameAPMPlusSpanLatency = "apmplus_span_latency"
7070
MetricNameAPMPlusToolTokenUsage = "apmplus_tool_token_usage"
71+
72+
// AgentKit specific metrics
73+
MetricNameAgentKitDuration = "agentkit_runtime_operation_latency"
7174
)
7275

7376
// General attributes

observability/metrics.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ package observability
1616

1717
import (
1818
"context"
19-
19+
"fmt"
2020
"sync"
2121

2222
"go.opentelemetry.io/otel"
@@ -46,6 +46,11 @@ var (
4646
genAIServerTimePerOutputTokenBuckets = []float64{
4747
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5,
4848
}
49+
50+
// Time duration buckets for agent_kit (seconds)
51+
agentkitDurationSecondBuckets = []float64{
52+
0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92, 163.84,
53+
}
4954
)
5055

5156
var (
@@ -69,6 +74,9 @@ var (
6974
// special metrics for APMPlus
7075
apmPlusLatencyHistograms []metric.Float64Histogram
7176
apmPlusToolTokenUsageHistograms []metric.Float64Histogram
77+
78+
// special metrics for AgentKit
79+
agentkitDurationHistograms []metric.Float64Histogram
7280
)
7381

7482
// registerLocalMetrics initializes the metrics system with a local isolated MeterProvider.
@@ -200,6 +208,16 @@ func initializeInstruments(m metric.Meter) {
200208
); err == nil {
201209
apmPlusToolTokenUsageHistograms = append(apmPlusToolTokenUsageHistograms, h)
202210
}
211+
212+
// AgentKit Duration
213+
if h, err := m.Float64Histogram(
214+
MetricNameAgentKitDuration,
215+
metric.WithDescription("operation latency"),
216+
metric.WithUnit("s"),
217+
metric.WithExplicitBucketBoundaries(agentkitDurationSecondBuckets...),
218+
); err == nil {
219+
agentkitDurationHistograms = append(agentkitDurationHistograms, h)
220+
}
203221
}
204222

205223
// RecordTokenUsage records the number of tokens used.
@@ -271,3 +289,12 @@ func RecordAPMPlusToolTokenUsage(ctx context.Context, tokens int64, attrs ...att
271289
histogram.Record(ctx, float64(tokens), metric.WithAttributes(attrs...))
272290
}
273291
}
292+
293+
func RecordAgentKitDuration(ctx context.Context, durationSeconds float64, err error, attrs ...attribute.KeyValue) {
294+
if err != nil {
295+
attrs = append(attrs, attribute.String("error_type", fmt.Sprintf("%T", err)))
296+
}
297+
for _, histogram := range agentkitDurationHistograms {
298+
histogram.Record(ctx, durationSeconds, metric.WithAttributes(attrs...))
299+
}
300+
}

observability/metrics_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package observability
1616

1717
import (
1818
"context"
19+
"fmt"
1920
"testing"
2021

2122
"github.com/stretchr/testify/assert"
@@ -117,6 +118,34 @@ func TestMetricsRecording(t *testing.T) {
117118
}
118119
assert.True(t, found, "Streaming time to first token not found")
119120
})
121+
122+
t.Run("RecordAgentKitDurationWithError", func(t *testing.T) {
123+
testErr := fmt.Errorf("test error")
124+
RecordAgentKitDuration(ctx, 2.5, testErr, attrs...)
125+
126+
var rm metricdata.ResourceMetrics
127+
err := reader.Collect(ctx, &rm)
128+
assert.NoError(t, err)
129+
130+
var found bool
131+
for _, sm := range rm.ScopeMetrics {
132+
for _, m := range sm.Metrics {
133+
if m.Name == MetricNameAgentKitDuration {
134+
data := m.Data.(metricdata.Histogram[float64])
135+
for _, dp := range data.DataPoints {
136+
if dp.Count > 0 {
137+
assert.Equal(t, uint64(1), dp.Count)
138+
assert.Equal(t, 2.5, dp.Sum)
139+
errType, _ := dp.Attributes.Value("error_type")
140+
assert.Equal(t, "*errors.errorString", errType.AsString())
141+
found = true
142+
}
143+
}
144+
}
145+
}
146+
}
147+
assert.True(t, found, "AgentKit duration not found")
148+
})
120149
}
121150

122151
func TestRegisterLocalMetrics(t *testing.T) {

observability/plugin.go

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -179,16 +179,28 @@ func (p *adkObservabilityPlugin) AfterRun(ctx agent.InvocationContext) {
179179

180180
// Record final metrics for invocation
181181
if !meta.StartTime.IsZero() {
182-
elapsed := time.Since(meta.StartTime).Seconds()
183-
metricAttrs := []attribute.KeyValue{
184-
attribute.String("gen_ai_operation_name", "chain"),
185-
attribute.String("gen_ai_operation_type", "workflow"),
186-
attribute.String("gen_ai.system", GetModelProvider(context.Context(ctx))),
187-
}
188182
if p.isMetricsEnabled() {
183+
elapsed := time.Since(meta.StartTime).Seconds()
184+
metricAttrs := []attribute.KeyValue{
185+
attribute.String("gen_ai_operation_name", "chain"),
186+
attribute.String("gen_ai_operation_type", "workflow"),
187+
attribute.String("gen_ai.system", GetModelProvider(context.Context(ctx))),
188+
}
189189
RecordOperationDuration(context.Background(), elapsed, metricAttrs...)
190190
RecordAPMPlusSpanLatency(context.Background(), elapsed, metricAttrs...)
191+
192+
agentKitsAttrs := []attribute.KeyValue{
193+
attribute.String("gen_ai_operation_name", "chain"),
194+
attribute.String("gen_ai_operation_type", "workflow"),
195+
}
196+
197+
var lastErr error
198+
if val, _ := ctx.Session().State().Get(stateKeyLastError); val != nil {
199+
lastErr = val.(error)
200+
}
201+
RecordAgentKitDuration(context.Background(), elapsed, lastErr, agentKitsAttrs...)
191202
}
203+
192204
}
193205

194206
// Clean up from global map with delay to allow children to be exported.
@@ -346,6 +358,7 @@ func (p *adkObservabilityPlugin) AfterModel(ctx agent.CallbackContext, resp *mod
346358

347359
if err != nil {
348360
span.SetStatus(codes.Error, err.Error())
361+
_ = ctx.State().Set(stateKeyLastError, err)
349362
// Record Exceptions metric
350363
if p.isMetricsEnabled() {
351364
meta := p.getSpanMetadata(ctx.State())
@@ -357,6 +370,7 @@ func (p *adkObservabilityPlugin) AfterModel(ctx agent.CallbackContext, resp *mod
357370
attribute.String("error_type", "error"), // Simple error type
358371
}
359372
RecordExceptions(context.Context(ctx), 1, metricAttrs...)
373+
p.recordFinalResponseMetrics(ctx, meta, meta.ModelName, err)
360374
}
361375
return nil, nil
362376
}
@@ -450,7 +464,7 @@ func (p *adkObservabilityPlugin) AfterModel(ctx agent.CallbackContext, resp *mod
450464

451465
if !resp.Partial {
452466
// Record Operation Duration and Latency
453-
p.recordFinalResponseMetrics(ctx, meta, finalModelName)
467+
p.recordFinalResponseMetrics(ctx, meta, finalModelName, nil)
454468
}
455469

456470
return nil, nil
@@ -563,7 +577,7 @@ func (p *adkObservabilityPlugin) recordStreamingGenerationMetrics(ctx agent.Call
563577
}
564578
}
565579

566-
func (p *adkObservabilityPlugin) recordFinalResponseMetrics(ctx agent.CallbackContext, meta *spanMetadata, finalModelName string) {
580+
func (p *adkObservabilityPlugin) recordFinalResponseMetrics(ctx agent.CallbackContext, meta *spanMetadata, finalModelName string, err error) {
567581
if !meta.StartTime.IsZero() {
568582
duration := time.Since(meta.StartTime).Seconds()
569583
metricAttrs := []attribute.KeyValue{
@@ -575,6 +589,12 @@ func (p *adkObservabilityPlugin) recordFinalResponseMetrics(ctx agent.CallbackCo
575589
if p.isMetricsEnabled() {
576590
RecordOperationDuration(context.Context(ctx), duration, metricAttrs...)
577591
RecordAPMPlusSpanLatency(context.Context(ctx), duration, metricAttrs...)
592+
593+
agentKitsAttrs := []attribute.KeyValue{
594+
attribute.String("gen_ai_operation_name", "chat"),
595+
attribute.String("gen_ai_operation_type", "llm"),
596+
}
597+
RecordAgentKitDuration(context.Context(ctx), duration, err, agentKitsAttrs...)
578598
}
579599
}
580600
}
@@ -901,6 +921,9 @@ func (p *adkObservabilityPlugin) BeforeTool(ctx tool.Context, tool tool.Tool, ar
901921

902922
// AfterTool is called after a tool is executed.
903923
func (p *adkObservabilityPlugin) AfterTool(ctx tool.Context, tool tool.Tool, args, result map[string]any, err error) (map[string]any, error) {
924+
if err != nil {
925+
_ = ctx.State().Set(stateKeyLastError, err)
926+
}
904927
// Metrics recording only
905928
meta := p.getSpanMetadata(ctx.State())
906929
if !meta.StartTime.IsZero() {
@@ -913,6 +936,12 @@ func (p *adkObservabilityPlugin) AfterTool(ctx tool.Context, tool tool.Tool, arg
913936
if p.isMetricsEnabled() {
914937
RecordOperationDuration(context.Background(), duration, metricAttrs...)
915938
RecordAPMPlusSpanLatency(context.Background(), duration, metricAttrs...)
939+
940+
agentKitsAttrs := []attribute.KeyValue{
941+
attribute.String("gen_ai_operation_name", tool.Name()),
942+
attribute.String("gen_ai_operation_type", "tool"),
943+
}
944+
RecordAgentKitDuration(context.Background(), duration, err, agentKitsAttrs...)
916945
}
917946

918947
if p.isMetricsEnabled() {
@@ -1052,6 +1081,7 @@ const (
10521081
stateKeyMetadata = "veadk.observability.metadata"
10531082
stateKeyStreamingOutput = "veadk.observability.streaming_output"
10541083
stateKeyStreamingSpan = "veadk.observability.streaming_span"
1084+
stateKeyLastError = "veadk.observability.last_error"
10551085
)
10561086

10571087
// spanMetadata groups various observational data points in a single structure

0 commit comments

Comments
 (0)