Skip to content

Commit 1e1c435

Browse files
authored
feat: add OpenTelemetry Tracing to Sandbox and SandboxClaim Controllers (kubernetes-sigs#234)
* Implements OpenTelemetry tracing for the SandboxClaim and Sandbox * Autogenerated Files * updates metrics package to indicate that these are internal metrics * Updates tracing Instrumenter interface by adding support for initial attributes, span events, and recording state checks. * go mod tidy after a rebase
1 parent 35e3b03 commit 1e1c435

File tree

9 files changed

+364
-37
lines changed

9 files changed

+364
-37
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ COPY api/ ./api/
1818
COPY cmd/ ./cmd/
1919
COPY controllers/ ./controllers/
2020
COPY extensions/ ./extensions/
21+
COPY internal/ ./internal/
2122

2223
# Build the binary with optimizations
2324
RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -ldflags="-s -w" -o /agent-sandbox-controller ./cmd/agent-sandbox-controller

cmd/agent-sandbox-controller/main.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515
package main
1616

1717
import (
18+
"context"
1819
"flag"
1920
"net/http"
2021
"net/http/pprof"
2122
"os"
2223
"runtime"
24+
"time"
2325

2426
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
2527
// to ensure that exec-entrypoint and run can make use of them.
@@ -34,6 +36,7 @@ import (
3436
"sigs.k8s.io/agent-sandbox/controllers"
3537
extensionsv1alpha1 "sigs.k8s.io/agent-sandbox/extensions/api/v1alpha1"
3638
extensionscontrollers "sigs.k8s.io/agent-sandbox/extensions/controllers"
39+
asmetrics "sigs.k8s.io/agent-sandbox/internal/metrics"
3740
//+kubebuilder:scaffold:imports
3841
)
3942

@@ -46,6 +49,7 @@ func main() {
4649
var enableLeaderElection bool
4750
var probeAddr string
4851
var extensions bool
52+
var enableTracing bool
4953
var enablePprof bool
5054
var enablePprofDebug bool
5155
var pprofBlockProfileRate int
@@ -56,6 +60,7 @@ func main() {
5660
"Enable leader election for controller manager. "+
5761
"Enabling this will ensure there is only one active controller manager.")
5862
flag.BoolVar(&extensions, "extensions", false, "Enable extensions controllers.")
63+
flag.BoolVar(&enableTracing, "enable-tracing", false, "Enable OpenTelemetry tracing via OTLP.")
5964
flag.BoolVar(&enablePprof, "enable-pprof", false,
6065
"Enable CPU profiling endpoint (/debug/pprof/profile) on the metrics server.")
6166
flag.BoolVar(&enablePprofDebug, "enable-pprof-debug", false,
@@ -74,6 +79,24 @@ func main() {
7479
flag.Parse()
7580

7681
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
82+
ctx := ctrl.SetupSignalHandler()
83+
84+
// Initialize Tracing Provider
85+
var instrumenter asmetrics.Instrumenter = asmetrics.NewNoOp()
86+
if enableTracing {
87+
var cleanup func()
88+
var err error
89+
// Use a timeout context for initialization to prevent blocking
90+
initCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
91+
defer cancel()
92+
93+
instrumenter, cleanup, err = asmetrics.SetupOTel(initCtx, "agent-sandbox-controller")
94+
if err != nil {
95+
setupLog.Error(err, "unable to initialize tracing")
96+
os.Exit(1)
97+
}
98+
defer cleanup()
99+
}
77100

78101
// Importing net/http/pprof registers handlers on the global DefaultServeMux.
79102
// Reset it to avoid accidentally exposing pprof via any server that uses the default mux.
@@ -135,6 +158,7 @@ func main() {
135158
if err = (&controllers.SandboxReconciler{
136159
Client: mgr.GetClient(),
137160
Scheme: mgr.GetScheme(),
161+
Tracer: instrumenter,
138162
}).SetupWithManager(mgr); err != nil {
139163
setupLog.Error(err, "unable to create controller", "controller", "Sandbox")
140164
os.Exit(1)
@@ -145,6 +169,7 @@ func main() {
145169
Client: mgr.GetClient(),
146170
Scheme: mgr.GetScheme(),
147171
Recorder: mgr.GetEventRecorderFor("sandboxclaim-controller"),
172+
Tracer: instrumenter,
148173
}).SetupWithManager(mgr); err != nil {
149174
setupLog.Error(err, "unable to create controller", "controller", "SandboxClaim")
150175
os.Exit(1)
@@ -170,7 +195,7 @@ func main() {
170195
}
171196

172197
setupLog.Info("starting manager")
173-
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
198+
if err := mgr.Start(ctx); err != nil {
174199
setupLog.Error(err, "problem running manager")
175200
os.Exit(1)
176201
}

controllers/sandbox_controller.go

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"sigs.k8s.io/controller-runtime/pkg/predicate"
3939

4040
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
41+
asmetrics "sigs.k8s.io/agent-sandbox/internal/metrics"
4142
)
4243

4344
const (
@@ -60,6 +61,7 @@ func init() {
6061
type SandboxReconciler struct {
6162
client.Client
6263
Scheme *runtime.Scheme
64+
Tracer asmetrics.Instrumenter
6365
}
6466

6567
//+kubebuilder:rbac:groups=agents.x-k8s.io,resources=sandboxes,verbs=get;list;watch;create;update;patch;delete
@@ -89,6 +91,14 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
8991
return ctrl.Result{}, err
9092
}
9193

94+
// Start Tracing Span
95+
initialAttrs := map[string]string{
96+
"sandbox.name": sandbox.Name,
97+
"sandbox.namespace": sandbox.Namespace,
98+
}
99+
ctx, end := r.Tracer.StartSpan(ctx, sandbox, "ReconcileSandbox", initialAttrs)
100+
defer end()
101+
92102
// If the sandbox is being deleted, do nothing
93103
if !sandbox.ObjectMeta.DeletionTimestamp.IsZero() {
94104
log.Info("Sandbox is being deleted")
@@ -104,6 +114,22 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
104114
return ctrl.Result{}, nil
105115
}
106116

117+
// Initialize trace ID for active resources missing an ID
118+
tc := r.Tracer.GetTraceContext(ctx)
119+
if tc != "" && (sandbox.Annotations == nil || sandbox.Annotations[asmetrics.TraceContextAnnotation] == "") {
120+
patch := client.MergeFrom(sandbox.DeepCopy())
121+
if sandbox.Annotations == nil {
122+
sandbox.Annotations = make(map[string]string)
123+
}
124+
sandbox.Annotations[asmetrics.TraceContextAnnotation] = tc
125+
126+
if err := r.Patch(ctx, sandbox, patch); err != nil {
127+
return ctrl.Result{}, err
128+
}
129+
// Return to ensure the next loop uses the persisted ID
130+
return ctrl.Result{}, nil
131+
}
132+
107133
if sandbox.Spec.Replicas == nil {
108134
replicas := int32(1)
109135
sandbox.Spec.Replicas = &replicas
@@ -303,6 +329,10 @@ func (r *SandboxReconciler) reconcileService(ctx context.Context, sandbox *sandb
303329
func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox, nameHash string) (*corev1.Pod, error) {
304330
log := log.FromContext(ctx)
305331

332+
// Start a child span of ReconcileSandbox
333+
ctx, end := r.Tracer.StartSpan(ctx, nil, "reconcilePod", nil)
334+
defer end()
335+
306336
// List all pods with the pool label matching the warm pool name hash
307337
// TODO: find a better way to make sure one sandbox has at most one pod
308338
podList := &corev1.PodList{}
@@ -344,7 +374,7 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
344374
pod = nil
345375
}
346376

347-
// if replicas is 0, delete the pod if it exists
377+
// 1. PATH: Logic for deleting Pod when replicas is 0
348378
if *sandbox.Spec.Replicas == 0 {
349379
if pod != nil {
350380
if pod.ObjectMeta.DeletionTimestamp.IsZero() {
@@ -372,9 +402,17 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
372402
return nil, nil
373403
}
374404

405+
// 2. PATH: Existing Pod found (e.g., adopted from WarmPool or already exists)
375406
if pod != nil {
376407
log.Info("Found Pod", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
377408

409+
if r.Tracer.IsRecording(ctx) {
410+
r.Tracer.AddEvent(ctx, "ExistingPodStatusObserved", map[string]string{
411+
"pod.Name": pod.Name,
412+
"pod.Phase": string(pod.Status.Phase),
413+
})
414+
}
415+
378416
if pod.Labels == nil {
379417
pod.Labels = make(map[string]string)
380418
}
@@ -396,6 +434,7 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
396434
return pod, nil
397435
}
398436

437+
// 3. PATH: Create new Pod
399438
log.Info("Creating a new Pod", "Pod.Namespace", sandbox.Namespace, "Pod.Name", sandbox.Name)
400439
labels := map[string]string{
401440
sandboxLabel: nameHash,
@@ -439,11 +478,23 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
439478
return nil, err
440479
}
441480

481+
if r.Tracer.IsRecording(ctx) {
482+
r.Tracer.AddEvent(ctx, "NewPodStatusObserved", map[string]string{
483+
"pod.Name": pod.Name,
484+
"pod.Phase": string(pod.Status.Phase),
485+
})
486+
}
487+
442488
return pod, nil
443489
}
444490

445491
func (r *SandboxReconciler) reconcilePVCs(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox) error {
446492
log := log.FromContext(ctx)
493+
494+
// Start a child span of ReconcileSandbox
495+
ctx, end := r.Tracer.StartSpan(ctx, nil, "reconcilePVCs", nil)
496+
defer end()
497+
447498
for _, pvcTemplate := range sandbox.Spec.VolumeClaimTemplates {
448499
pvc := &corev1.PersistentVolumeClaim{}
449500
pvcName := pvcTemplate.Name + "-" + sandbox.Name

controllers/sandbox_controller_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@ import (
2929
"k8s.io/apimachinery/pkg/runtime"
3030
"k8s.io/apimachinery/pkg/types"
3131
"k8s.io/utils/ptr"
32-
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
3332
ctrl "sigs.k8s.io/controller-runtime"
3433
"sigs.k8s.io/controller-runtime/pkg/client"
3534
"sigs.k8s.io/controller-runtime/pkg/client/fake"
35+
36+
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
37+
asmetrics "sigs.k8s.io/agent-sandbox/internal/metrics"
3638
)
3739

3840
func newFakeClient(initialObjs ...runtime.Object) client.WithWatch {
@@ -505,6 +507,7 @@ func TestReconcile(t *testing.T) {
505507
r := SandboxReconciler{
506508
Client: newFakeClient(append(tc.initialObjs, sb)...),
507509
Scheme: Scheme,
510+
Tracer: asmetrics.NewNoOp(),
508511
}
509512

510513
_, err := r.Reconcile(t.Context(), ctrl.Request{
@@ -861,6 +864,7 @@ func TestReconcilePod(t *testing.T) {
861864
r := SandboxReconciler{
862865
Client: newFakeClient(append(tc.initialObjs, tc.sandbox)...),
863866
Scheme: Scheme,
867+
Tracer: asmetrics.NewNoOp(),
864868
}
865869

866870
pod, err := r.reconcilePod(t.Context(), tc.sandbox, nameHash)

extensions/controllers/sandboxclaim_controller.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
4141
sandboxcontrollers "sigs.k8s.io/agent-sandbox/controllers"
4242
extensionsv1alpha1 "sigs.k8s.io/agent-sandbox/extensions/api/v1alpha1"
43+
asmetrics "sigs.k8s.io/agent-sandbox/internal/metrics"
4344
)
4445

4546
// TODO: These constants should be imported from the main controller package Issue #216
@@ -55,6 +56,7 @@ type SandboxClaimReconciler struct {
5556
client.Client
5657
Scheme *runtime.Scheme
5758
Recorder record.EventRecorder
59+
Tracer asmetrics.Instrumenter
5860
}
5961

6062
//+kubebuilder:rbac:groups=extensions.agents.x-k8s.io,resources=sandboxclaims,verbs=get;list;watch;create;update;patch;delete
@@ -77,10 +79,28 @@ func (r *SandboxClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request
7779
return ctrl.Result{}, fmt.Errorf("failed to get sandbox claim %q: %w", req.NamespacedName, err)
7880
}
7981

82+
// Start Tracing Span
83+
ctx, end := r.Tracer.StartSpan(ctx, claim, "ReconcileSandboxClaim", nil)
84+
defer end()
85+
8086
if !claim.DeletionTimestamp.IsZero() {
8187
return ctrl.Result{}, nil
8288
}
8389

90+
// Initialize trace ID for active resources missing an ID
91+
tc := r.Tracer.GetTraceContext(ctx)
92+
if tc != "" && (claim.Annotations == nil || claim.Annotations[asmetrics.TraceContextAnnotation] == "") {
93+
patch := client.MergeFrom(claim.DeepCopy())
94+
if claim.Annotations == nil {
95+
claim.Annotations = make(map[string]string)
96+
}
97+
claim.Annotations[asmetrics.TraceContextAnnotation] = tc
98+
if err := r.Patch(ctx, claim, patch); err != nil {
99+
return ctrl.Result{}, err
100+
}
101+
return ctrl.Result{}, nil
102+
}
103+
84104
originalClaimStatus := claim.Status.DeepCopy()
85105

86106
// Check Expiration
@@ -411,6 +431,14 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
411431
},
412432
}
413433

434+
// Propagate the trace context annotation to the Sandbox resource
435+
if sandbox.Annotations == nil {
436+
sandbox.Annotations = make(map[string]string)
437+
}
438+
if tc, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
439+
sandbox.Annotations[asmetrics.TraceContextAnnotation] = tc
440+
}
441+
414442
template.Spec.PodTemplate.DeepCopyInto(&sandbox.Spec.PodTemplate)
415443
// TODO: this is a workaround, remove replica assignment related issue #202
416444
replicas := int32(1)

0 commit comments

Comments
 (0)