Skip to content

Commit 56294ac

Browse files
committed
init the trace sdk
1 parent 798d37e commit 56294ac

File tree

7 files changed

+149
-11
lines changed

7 files changed

+149
-11
lines changed

cmd/epp/runner/runner.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ var (
106106
modelServerMetricsScheme = flag.String("model-server-metrics-scheme", "http", "Scheme to scrape metrics from pods")
107107
modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("model-server-metrics-https-insecure-skip-verify", true, "When using 'https' scheme for 'model-server-metrics-scheme', configure 'InsecureSkipVerify' (default to true)")
108108
haEnableLeaderElection = flag.Bool("ha-enable-leader-election", false, "Enables leader election for high availability. When enabled, readiness probes will only pass on the leader.")
109+
tracing = flag.Bool("tracing", true, "Enables emitting traces")
109110

110111
setupLog = ctrl.Log.WithName("setup")
111112
)
@@ -141,6 +142,13 @@ func (r *Runner) Run(ctx context.Context) error {
141142
flag.Parse()
142143
initLogging(&opts)
143144

145+
if *tracing {
146+
err := common.InitTracing(ctx, setupLog)
147+
if err != nil {
148+
return err
149+
}
150+
}
151+
144152
setupLog.Info("GIE build", "commit-sha", version.CommitSHA, "build-ref", version.BuildRef)
145153

146154
// Validate flags

config/charts/inferencepool/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,18 @@ These are the options available to you with `provider.name` set to `istio`:
214214
| `istio.destinationRule.host` | Custom host value for the destination rule. If not set, this will use the default value, which is derived from the epp service name and release namespace to generate a valid service address. |
215215
| `istio.destinationRule.trafficPolicy.connectionPool` | Configure the connectionPool level settings of the traffic policy |
216216

217+
### OpenTelemetry
218+
219+
The following table lists the configurable parameters for OpenTelemetry tracing.
220+
221+
222+
| **Parameter Name** | **Description** |
223+
|--------------------------------|------------------------------------------------------------------------------|
224+
| `opentelemetry.enabled` | Enables or disables OpenTelemetry tracing globally for the EndpointPicker. |
225+
| `opentelemetry.autoENVInject.CRInstanceName` | Value of the `instrumentation.opentelemetry.io/inject-sdk` pod annotation, which controls opentelemetry-operator auto-instrumentation injection. |
226+
| `opentelemetry.env` | A list of environment variables to manually configure the OpenTelemetry SDK. |
227+
228+
217229
## Notes
218230

219231
This chart will only deploy an InferencePool and its corresponding EndpointPicker extension. Before installing the chart, please make sure that the inference extension CRDs are installed in the cluster. For more details, please refer to the [getting started guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/).

config/charts/inferencepool/templates/epp-deployment.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ spec:
2222
metadata:
2323
labels:
2424
{{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }}
25+
annotations:
26+
{{- if and .Values.opentelemetry.enabled }}
27+
instrumentation.opentelemetry.io/inject-sdk: {{ .Values.opentelemetry.autoENVInject.CRInstanceName | quote }}
28+
{{- end }}
2529
spec:
2630
serviceAccountName: {{ include "gateway-api-inference-extension.name" . }}
2731
# Conservatively, this timeout should mirror the longest grace period of the pods within the pool
@@ -62,6 +66,8 @@ spec:
6266
- "--{{ .name }}"
6367
- "{{ .value }}"
6468
{{- end }}
69+
# Go's flag package only accepts the -flag=value form for booleans; a separate
# "false" argument would be treated as positional and stop flag parsing.
- "--tracing={{ .Values.opentelemetry.enabled }}"
6571
ports:
6672
- name: grpc
6773
containerPort: 9002
@@ -104,6 +110,9 @@ spec:
104110
{{- if .Values.inferenceExtension.env }}
105111
{{- toYaml .Values.inferenceExtension.env | nindent 8 }}
106112
{{- end }}
113+
{{- if and .Values.opentelemetry.enabled .Values.opentelemetry.env }}
114+
{{- toYaml .Values.opentelemetry.env | nindent 8 }}
115+
{{- end }}
107116
volumeMounts:
108117
- name: plugins-config-volume
109118
mountPath: "/config"

config/charts/inferencepool/values.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,34 @@ istio:
8686
# connectionPool:
8787
# http:
8888
# maxRequestsPerConnection: 256000
89+
90+
opentelemetry:
91+
enabled: true
92+
# With this setting you can send traces to an existing OpenTelemetry collector via the opentelemetry-operator.
93+
# See https://github.com/open-telemetry/opentelemetry-operator?tab=readme-ov-file#opentelemetry-auto-instrumentation-injection
94+
autoENVInject:
95+
# The possible values for the annotation can be
96+
# "true" - inject an Instrumentation resource from the current namespace.
97+
# "my-instrumentation" - name of Instrumentation CR instance in the current namespace.
98+
# "my-other-namespace/my-instrumentation" - name and namespace of Instrumentation CR instance in another namespace.
99+
# "false" - do not inject
100+
CRInstanceName: "false"
101+
# Add the required OTel environment manually
102+
# If autoENVInject is also enabled, the automatic env injection will be skipped by opentelemetry-operator.
103+
env:
104+
- name: OTEL_EXPORTER_OTLP_ENDPOINT
105+
value: "http://localhost:4317"
106+
- name: OTEL_SERVICE_NAME
107+
value: "gateway-api-inference-extension"
108+
- name: OTEL_RESOURCE_ATTRIBUTES_NODE_NAME
109+
valueFrom:
110+
fieldRef:
111+
apiVersion: v1
112+
fieldPath: spec.nodeName
113+
- name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME
114+
valueFrom:
115+
fieldRef:
116+
apiVersion: v1
117+
fieldPath: metadata.name
118+
- name: OTEL_RESOURCE_ATTRIBUTES
119+
value: 'k8s.namespace.name=$(NAMESPACE),k8s.node.name=$(OTEL_RESOURCE_ATTRIBUTES_NODE_NAME),k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME)'

go.mod

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ require (
1717
github.com/prometheus/common v0.66.1
1818
github.com/prometheus/prometheus v0.305.0
1919
github.com/stretchr/testify v1.11.1
20+
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0
21+
go.opentelemetry.io/otel v1.38.0
22+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.36.0
23+
go.opentelemetry.io/otel/sdk v1.37.0
2024
go.uber.org/multierr v1.11.0
2125
go.uber.org/zap v1.27.0
2226
golang.org/x/sync v0.17.0
@@ -95,12 +99,9 @@ require (
9599
github.com/x448/float16 v0.8.4 // indirect
96100
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
97101
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
98-
go.opentelemetry.io/otel v1.37.0 // indirect
99102
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0 // indirect
100-
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.36.0 // indirect
101-
go.opentelemetry.io/otel/metric v1.37.0 // indirect
102-
go.opentelemetry.io/otel/sdk v1.37.0 // indirect
103-
go.opentelemetry.io/otel/trace v1.37.0 // indirect
103+
go.opentelemetry.io/otel/metric v1.38.0 // indirect
104+
go.opentelemetry.io/otel/trace v1.38.0 // indirect
104105
go.opentelemetry.io/proto/otlp v1.6.0 // indirect
105106
go.uber.org/atomic v1.11.0 // indirect
106107
go.uber.org/automaxprocs v1.6.0 // indirect

go.sum

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -250,22 +250,24 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
250250
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
251251
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
252252
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
253+
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 h1:x7wzEgXfnzJcHDwStJT+mxOz4etr2EcexjqhBvmoakw=
254+
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM=
253255
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
254256
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
255-
go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ=
256-
go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I=
257+
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
258+
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
257259
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0 h1:dNzwXjZKpMpE2JhmO+9HsPl42NIXFIFSUSSs0fiqra0=
258260
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0/go.mod h1:90PoxvaEB5n6AOdZvi+yWJQoE95U8Dhhw2bSyRqnTD0=
259261
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.36.0 h1:JgtbA0xkWHnTmYk7YusopJFX6uleBmAuZ8n05NEh8nQ=
260262
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.36.0/go.mod h1:179AK5aar5R3eS9FucPy6rggvU0g52cvKId8pv4+v0c=
261-
go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE=
262-
go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E=
263+
go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
264+
go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
263265
go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI=
264266
go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg=
265267
go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc=
266268
go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps=
267-
go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4=
268-
go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0=
269+
go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
270+
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
269271
go.opentelemetry.io/proto/otlp v1.6.0 h1:jQjP+AQyTf+Fe7OKj/MfkDrmK4MNVtw2NpXsf9fefDI=
270272
go.opentelemetry.io/proto/otlp v1.6.0/go.mod h1:cicgGehlFuNdgZkcALOCh3VE6K/u2tAjzlRhDwmVpZc=
271273
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=

pkg/common/traces.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package common
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
8+
"github.com/go-logr/logr"
9+
"go.opentelemetry.io/otel"
10+
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
11+
"go.opentelemetry.io/otel/propagation"
12+
"go.opentelemetry.io/otel/sdk/resource"
13+
sdktrace "go.opentelemetry.io/otel/sdk/trace"
14+
semconv "go.opentelemetry.io/otel/semconv/v1.37.0"
15+
16+
"sigs.k8s.io/gateway-api-inference-extension/version"
17+
)
18+
19+
type errorHandler struct {
20+
logger logr.Logger
21+
}
22+
23+
func (h *errorHandler) Handle(err error) {
24+
h.logger.Error(err, "trace error occurred")
25+
}
26+
27+
func InitTracing(ctx context.Context, logger logr.Logger) error {
28+
logger = logger.WithName("trace")
29+
loggerWrap := &errorHandler{logger: logger}
30+
31+
serviceName, ok := os.LookupEnv("OTEL_SERVICE_NAME")
32+
if !ok {
33+
serviceName = "gateway-api-inference-extension"
34+
os.Setenv("OTEL_SERVICE_NAME", serviceName)
35+
}
36+
37+
collectorAddr, ok := os.LookupEnv("OTEL_EXPORTER_OTLP_ENDPOINT")
38+
if !ok {
39+
collectorAddr = "http://localhost:4317"
40+
os.Setenv("OTEL_EXPORTER_OTLP_ENDPOINT", collectorAddr)
41+
}
42+
43+
traceExporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithInsecure())
44+
if err != nil {
45+
loggerWrap.Handle(fmt.Errorf("%s: %v", "new OTel trace gRPC exporter fail", err))
46+
return nil
47+
}
48+
49+
logger.Info(fmt.Sprintf("OTel trace exporter connect to: %s with service name: %s", collectorAddr, serviceName))
50+
opt := []sdktrace.TracerProviderOption{
51+
sdktrace.WithBatcher(traceExporter),
52+
sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.AlwaysSample())),
53+
sdktrace.WithResource(resource.NewWithAttributes(
54+
semconv.SchemaURL,
55+
semconv.ServiceVersionKey.String(version.BuildRef),
56+
)),
57+
}
58+
59+
tracerProvider := sdktrace.NewTracerProvider(opt...)
60+
otel.SetTracerProvider(tracerProvider)
61+
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{}))
62+
otel.SetErrorHandler(loggerWrap)
63+
64+
go func() {
65+
<-ctx.Done()
66+
err := tracerProvider.Shutdown(context.Background())
67+
if err != nil {
68+
loggerWrap.Handle(fmt.Errorf("%s: %v", "failed to shutdown MeterProvider", err))
69+
}
70+
71+
logger.Info("trace provider shutting down")
72+
}()
73+
74+
return nil
75+
}

0 commit comments

Comments
 (0)