Skip to content
20 changes: 20 additions & 0 deletions api/v1alpha1/annotations.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright 2026 The Kubernetes Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1alpha1

const (
	// SandboxPodNameAnnotation is the Sandbox annotation holding the name of the pod the
	// Sandbox tracks. The SandboxClaim controller sets it to the adopted pod's name, and the
	// Sandbox controller uses a non-empty value as the pod name instead of the Sandbox name.
	SandboxPodNameAnnotation = "agents.x-k8s.io/pod-name"
	// SandboxTemplateRefAnnotation is the Sandbox annotation recording the name of the
	// template the Sandbox was created from. The SandboxClaim controller sets it, and the
	// metrics collector reads it to populate the sandbox_template label.
	SandboxTemplateRefAnnotation = "agents.x-k8s.io/sandbox-template-ref"
)
3 changes: 3 additions & 0 deletions cmd/agent-sandbox-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ func main() {
os.Exit(1)
}

// Register the custom Sandbox metric collector globally.
asmetrics.RegisterSandboxCollector(mgr.GetClient(), mgr.GetLogger().WithName("sandbox-collector"))

if err = (&controllers.SandboxReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Expand Down
8 changes: 4 additions & 4 deletions controllers/sandbox_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ import (

const (
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
SandboxPodNameAnnotation = "agents.x-k8s.io/pod-name"
sandboxControllerFieldOwner = "sandbox-controller"
)

Expand Down Expand Up @@ -364,7 +363,7 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
podName := sandbox.Name
var trackedPodName string
var podNameAnnotationExists bool
if trackedPodName, podNameAnnotationExists = sandbox.Annotations[SandboxPodNameAnnotation]; podNameAnnotationExists && trackedPodName != "" {
if trackedPodName, podNameAnnotationExists = sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; podNameAnnotationExists && trackedPodName != "" {
podName = trackedPodName
log.Info("Using tracked pod name from sandbox annotation", "podName", podName)
}
Expand Down Expand Up @@ -397,11 +396,11 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
}

// Remove the pod name annotation from the sandbox if it exists
if _, exists := sandbox.Annotations[SandboxPodNameAnnotation]; exists {
if _, exists := sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; exists {
log.Info("Removing pod name annotation from sandbox", "Sandbox.Name", sandbox.Name)
// Create a patch to update only the annotations
patch := client.MergeFrom(sandbox.DeepCopy())
delete(sandbox.Annotations, SandboxPodNameAnnotation)
delete(sandbox.Annotations, sandboxv1alpha1.SandboxPodNameAnnotation)

if err := r.Patch(ctx, sandbox, patch); err != nil {
return nil, fmt.Errorf("failed to remove pod name annotation: %w", err)
Expand Down Expand Up @@ -629,6 +628,7 @@ func (r *SandboxReconciler) SetupWithManager(mgr ctrl.Manager, concurrentWorkers
if err != nil {
return err
}

return ctrl.NewControllerManagedBy(mgr).
For(&sandboxv1alpha1.Sandbox{}).
Owns(&corev1.Pod{}, builder.WithPredicates(labelSelectorPredicate)).
Expand Down
10 changes: 5 additions & 5 deletions controllers/sandbox_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ func TestReconcilePod(t *testing.T) {
Name: sandboxName,
Namespace: sandboxNs,
Annotations: map[string]string{
SandboxPodNameAnnotation: "adopted-pod-name",
sandboxv1alpha1.SandboxPodNameAnnotation: "adopted-pod-name",
},
},
Spec: sandboxv1alpha1.SandboxSpec{
Expand Down Expand Up @@ -810,7 +810,7 @@ func TestReconcilePod(t *testing.T) {
Name: sandboxName,
Namespace: sandboxNs,
Annotations: map[string]string{
SandboxPodNameAnnotation: "non-existent-pod",
sandboxv1alpha1.SandboxPodNameAnnotation: "non-existent-pod",
},
},
Spec: sandboxv1alpha1.SandboxSpec{
Expand Down Expand Up @@ -845,8 +845,8 @@ func TestReconcilePod(t *testing.T) {
Name: sandboxName,
Namespace: sandboxNs,
Annotations: map[string]string{
SandboxPodNameAnnotation: "annotated-pod-name",
"other-annotation": "other-value",
sandboxv1alpha1.SandboxPodNameAnnotation: "annotated-pod-name",
"other-annotation": "other-value",
},
},
Spec: sandboxv1alpha1.SandboxSpec{
Expand Down Expand Up @@ -886,7 +886,7 @@ func TestReconcilePod(t *testing.T) {
livePod := &corev1.Pod{}
podName := sandboxName
// Check if there's an annotation with a non-empty value
if annotatedPod, exists := tc.sandbox.Annotations[SandboxPodNameAnnotation]; exists && annotatedPod != "" {
if annotatedPod, exists := tc.sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; exists && annotatedPod != "" {
podName = annotatedPod
}
err = r.Get(t.Context(), types.NamespacedName{Name: podName, Namespace: sandboxNs}, livePod)
Expand Down
7 changes: 5 additions & 2 deletions extensions/controllers/sandboxclaim_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,9 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
sandbox.Annotations[asmetrics.TraceContextAnnotation] = tc
}

// Track the sandbox template ref to be used by metrics collector
sandbox.Annotations[v1alpha1.SandboxTemplateRefAnnotation] = template.Name

template.Spec.PodTemplate.DeepCopyInto(&sandbox.Spec.PodTemplate)
// TODO: this is a workaround, remove replica assignment related issue #202
replicas := int32(1)
Expand Down Expand Up @@ -494,7 +497,7 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
if sandbox.Annotations == nil {
sandbox.Annotations = make(map[string]string)
}
sandbox.Annotations[sandboxcontrollers.SandboxPodNameAnnotation] = adoptedPod.Name
sandbox.Annotations[v1alpha1.SandboxPodNameAnnotation] = adoptedPod.Name
}

if err := r.Create(ctx, sandbox); err != nil {
Expand Down Expand Up @@ -612,7 +615,7 @@ func (r *SandboxClaimReconciler) recordCreationLatencyMetric(
// This is unlikely to happen; here for completeness only.
if sandbox == nil {
launchType = asmetrics.LaunchTypeUnknown
} else if sandbox.Annotations[sandboxcontrollers.SandboxPodNameAnnotation] != "" {
} else if sandbox.Annotations[v1alpha1.SandboxPodNameAnnotation] != "" {
// Existence of the SandboxPodNameAnnotation implies the pod was adopted from a warm pool.
launchType = asmetrics.LaunchTypeWarm
}
Expand Down
2 changes: 1 addition & 1 deletion extensions/controllers/sandboxclaim_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -986,7 +986,7 @@ func TestSandboxClaimPodAdoption(t *testing.T) {
} else if tc.expectSandboxCreate {
// Verify no pod name annotation when no adoption occurred
if sandbox.Annotations != nil {
if _, exists := sandbox.Annotations[sandboxcontrollers.SandboxPodNameAnnotation]; exists {
if _, exists := sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; exists {
t.Errorf("expected no pod name annotation but found one")
}
}
Expand Down
14 changes: 14 additions & 0 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,20 @@ var (
},
[]string{"namespace", "sandbox_template", "launch_type", "warmpool_name", "pod_condition"},
)

// AgentSandboxesDesc describes the agent_sandboxes metric point-in-time counts.
// Labels:
// - namespace: the namespace of the sandbox
// - ready_condition: "true" | "false"
// - expired: "true" | "false"
// - launch_type: "warm" | "cold"
// - sandbox_template: sandboxTemplateRef
AgentSandboxesDesc = prometheus.NewDesc(
"agent_sandboxes",
"Monitor the point-in-time number of sandboxes in the cluster.",
[]string{"namespace", "ready_condition", "expired", "launch_type", "sandbox_template"},
nil,
)
)

// Init registers custom metrics with the global controller-runtime registry.
Expand Down
146 changes: 146 additions & 0 deletions internal/metrics/sandbox_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Copyright 2026 The Kubernetes Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// nolint:revive
package metrics

import (
	"context"
	"errors"
	"time"

	"github.com/go-logr/logr"
	"github.com/prometheus/client_golang/prometheus"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/metrics"
)

const (
	// metricsCollectTimeout is the deadline applied to the Sandbox list call made by
	// SandboxCollector.Collect on each metrics scrape.
	metricsCollectTimeout = 5 * time.Second
)

// AgentSandboxesMetricKey is used to aggregate counts for identical Sandboxes metric label combinations.
// It is used as a map key, so every field is a plain string holding one label value.
type AgentSandboxesMetricKey struct {
	// Namespace is the namespace of the sandbox.
	Namespace string
	// ReadyCondition is "true" when the sandbox's Ready condition has status True, otherwise "false".
	ReadyCondition string
	// Expired is "true" when the sandbox's Ready condition reason is SandboxReasonExpired, otherwise "false".
	Expired string
	// LaunchType is LaunchTypeWarm when the sandbox carries a non-empty pod-name annotation, otherwise LaunchTypeCold.
	LaunchType string
	// Template is the value of the sandbox-template-ref annotation, or "unknown" when it is missing or empty.
	Template string
}

// NewAgentSandboxesConstMetric builds the agent_sandboxes gauge sample for one label
// combination. count becomes the gauge value and the fields of key supply the label
// values in the order namespace, ready_condition, expired, launch_type, sandbox_template.
func NewAgentSandboxesConstMetric(count int, key AgentSandboxesMetricKey) prometheus.Metric {
	// Order must match the variable labels declared on AgentSandboxesDesc.
	labelValues := []string{
		key.Namespace,
		key.ReadyCondition,
		key.Expired,
		key.LaunchType,
		key.Template,
	}
	value := float64(count)
	return prometheus.MustNewConstMetric(AgentSandboxesDesc, prometheus.GaugeValue, value, labelValues...)
}

// RegisterSandboxCollector builds a SandboxCollector from c and logger and registers it
// with the controller-runtime metrics registry (metrics.Registry).
//
// Registration failures are logged rather than returned: a duplicate registration is
// reported at info level and ignored, and any other error is reported at error level.
func RegisterSandboxCollector(c client.Client, logger logr.Logger) {
	err := metrics.Registry.Register(NewSandboxCollector(c, logger))
	if err == nil {
		return
	}

	// errors.As also matches an AlreadyRegisteredError that has been wrapped by an
	// intermediate layer, which a direct type assertion on err would miss.
	var alreadyRegistered prometheus.AlreadyRegisteredError
	if errors.As(err, &alreadyRegistered) {
		logger.Info("SandboxCollector already registered, ignoring")
		return
	}

	logger.Error(err, "Failed to register SandboxCollector")
}

// SandboxCollector is a custom Prometheus collector that dynamically fetches sandbox counts.
// Counts are computed from a fresh Sandbox list on every Collect call rather than being
// maintained incrementally.
type SandboxCollector struct {
	// client is used by Collect to list Sandbox objects.
	client client.Client
	// logger records list failures encountered during collection.
	logger logr.Logger
	// agentSandboxesDesc is the descriptor sent by Describe; NewSandboxCollector sets it to AgentSandboxesDesc.
	agentSandboxesDesc *prometheus.Desc
}

// NewSandboxCollector returns a SandboxCollector that lists sandboxes through c, reports
// problems through logger, and describes itself with the package-level AgentSandboxesDesc.
func NewSandboxCollector(c client.Client, logger logr.Logger) *SandboxCollector {
	collector := new(SandboxCollector)
	collector.client = c
	collector.logger = logger
	collector.agentSandboxesDesc = AgentSandboxesDesc
	return collector
}

// Describe writes the collector's single descriptor, the one for the agent_sandboxes
// metric, to ch.
func (c *SandboxCollector) Describe(ch chan<- *prometheus.Desc) {
	desc := c.agentSandboxesDesc
	ch <- desc
}

// Collect fetches sandboxes, calculates labels, and sends metrics to the channel.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be used to DoS the reconciler?
For example, could a scraper collect the metric in a tight loop and cause the controller to run out of memory?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a TODO in sandbox_collector.go acknowledging the O(N) list concern and the potential for DDoS/OOM at scale; this implementation should be replaced with a better one.

// Note: Using client.List to fetch all sandboxes in the cluster on every metrics scrape
// introduces O(N) memory allocation and CPU overhead due to deep-copying thousands of objects.
// While updating a GaugeVec in the Reconcile loop might be slightly harder to manage,
// it operates in O(1) memory during scrapes and is generally more performant.
// This is a known performance trade-off to keep the Reconcile loop simpler.
//
// Each call lists Sandboxes with a metricsCollectTimeout deadline, groups them by label
// combination, and sends one gauge sample per combination to ch. If the list call fails,
// the error is logged and no samples are sent for that scrape.
func (c *SandboxCollector) Collect(ch chan<- prometheus.Metric) {
	// Collect receives no context from its caller, so bound the list call with a local deadline.
	ctx, cancel := context.WithTimeout(context.Background(), metricsCollectTimeout)
	defer cancel()

	var sandboxList sandboxv1alpha1.SandboxList
	if err := c.client.List(ctx, &sandboxList); err != nil {
		c.logger.Error(err, "Failed to list sandboxes for metrics collection")
		return
	}

	counts := make(map[AgentSandboxesMetricKey]int)
	// Index into Items instead of ranging by value so each Sandbox struct is not copied.
	for i := range sandboxList.Items {
		counts[agentSandboxesKeyFor(&sandboxList.Items[i])]++
	}

	for key, count := range counts {
		ch <- NewAgentSandboxesConstMetric(count, key)
	}
}

// agentSandboxesKeyFor derives the agent_sandboxes label values for a single sandbox.
// It only reads from sandbox.
func agentSandboxesKeyFor(sandbox *sandboxv1alpha1.Sandbox) AgentSandboxesMetricKey {
	key := AgentSandboxesMetricKey{
		Namespace:      sandbox.Namespace,
		ReadyCondition: "false",
		Expired:        "false",
		LaunchType:     LaunchTypeCold,
		// If a user manually creates a Sandbox without a SandboxClaim, it won't have the
		// SandboxTemplateRefAnnotation, so the template label defaults to "unknown".
		Template: "unknown",
	}

	readyCond := meta.FindStatusCondition(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady))
	if readyCond != nil {
		if readyCond.Status == metav1.ConditionTrue {
			key.ReadyCondition = "true"
		}
		if readyCond.Reason == sandboxv1alpha1.SandboxReasonExpired {
			key.Expired = "true"
		}
	}

	// A missing key and a nil map both yield "", so one lookup covers the
	// "present and non-empty" check.
	if sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation] != "" {
		key.LaunchType = LaunchTypeWarm
	}
	if template := sandbox.Annotations[sandboxv1alpha1.SandboxTemplateRefAnnotation]; template != "" {
		key.Template = template
	}

	return key
}
Loading