diff --git a/api/v1alpha1/sandbox_types.go b/api/v1alpha1/sandbox_types.go index ffc7e47d1..3dc16ee39 100644 --- a/api/v1alpha1/sandbox_types.go +++ b/api/v1alpha1/sandbox_types.go @@ -30,6 +30,11 @@ const ( // SandboxReasonExpired indicates expired state for Sandbox SandboxReasonExpired = "SandboxExpired" + + // SandboxPodNameAnnotation is the annotation used to track the pod name adopted from a warm pool. + SandboxPodNameAnnotation = "agents.x-k8s.io/pod-name" + // SandboxTemplateRefAnnotation is the annotation used to track the sandbox template ref. + SandboxTemplateRefAnnotation = "agents.x-k8s.io/sandbox-template-ref" ) type PodMetadata struct { diff --git a/cmd/agent-sandbox-controller/main.go b/cmd/agent-sandbox-controller/main.go index 1f6d0ebc2..90dda55d4 100644 --- a/cmd/agent-sandbox-controller/main.go +++ b/cmd/agent-sandbox-controller/main.go @@ -202,6 +202,9 @@ func main() { os.Exit(1) } + // Register the custom Sandbox metric collector globally. + asmetrics.RegisterSandboxCollector(mgr.GetClient(), mgr.GetLogger().WithName("sandbox-collector")) + if err = (&controllers.SandboxReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/controllers/sandbox_controller.go b/controllers/sandbox_controller.go index 5f24d2c49..e788be924 100644 --- a/controllers/sandbox_controller.go +++ b/controllers/sandbox_controller.go @@ -44,7 +44,6 @@ import ( const ( sandboxLabel = "agents.x-k8s.io/sandbox-name-hash" - SandboxPodNameAnnotation = "agents.x-k8s.io/pod-name" sandboxControllerFieldOwner = "sandbox-controller" ) @@ -363,7 +362,7 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1 podName := sandbox.Name var trackedPodName string var podNameAnnotationExists bool - if trackedPodName, podNameAnnotationExists = sandbox.Annotations[SandboxPodNameAnnotation]; podNameAnnotationExists && trackedPodName != "" { + if trackedPodName, podNameAnnotationExists = sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; 
podNameAnnotationExists && trackedPodName != "" { podName = trackedPodName log.Info("Using tracked pod name from sandbox annotation", "podName", podName) } @@ -396,11 +395,11 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1 } // Remove the pod name annotation from the sandbox if it exists - if _, exists := sandbox.Annotations[SandboxPodNameAnnotation]; exists { + if _, exists := sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; exists { log.Info("Removing pod name annotation from sandbox", "Sandbox.Name", sandbox.Name) // Create a patch to update only the annotations patch := client.MergeFrom(sandbox.DeepCopy()) - delete(sandbox.Annotations, SandboxPodNameAnnotation) + delete(sandbox.Annotations, sandboxv1alpha1.SandboxPodNameAnnotation) if err := r.Patch(ctx, sandbox, patch); err != nil { return nil, fmt.Errorf("failed to remove pod name annotation: %w", err) @@ -642,6 +641,7 @@ func (r *SandboxReconciler) SetupWithManager(mgr ctrl.Manager, concurrentWorkers if err != nil { return err } + return ctrl.NewControllerManagedBy(mgr). For(&sandboxv1alpha1.Sandbox{}). Owns(&corev1.Pod{}, builder.WithPredicates(labelSelectorPredicate)). 
diff --git a/controllers/sandbox_controller_test.go b/controllers/sandbox_controller_test.go index 7517c5f06..522872070 100644 --- a/controllers/sandbox_controller_test.go +++ b/controllers/sandbox_controller_test.go @@ -707,7 +707,7 @@ func TestReconcilePod(t *testing.T) { Name: sandboxName, Namespace: sandboxNs, Annotations: map[string]string{ - SandboxPodNameAnnotation: "adopted-pod-name", + sandboxv1alpha1.SandboxPodNameAnnotation: "adopted-pod-name", }, }, Spec: sandboxv1alpha1.SandboxSpec{ @@ -812,7 +812,7 @@ func TestReconcilePod(t *testing.T) { Name: sandboxName, Namespace: sandboxNs, Annotations: map[string]string{ - SandboxPodNameAnnotation: "non-existent-pod", + sandboxv1alpha1.SandboxPodNameAnnotation: "non-existent-pod", }, }, Spec: sandboxv1alpha1.SandboxSpec{ @@ -847,8 +847,8 @@ func TestReconcilePod(t *testing.T) { Name: sandboxName, Namespace: sandboxNs, Annotations: map[string]string{ - SandboxPodNameAnnotation: "annotated-pod-name", - "other-annotation": "other-value", + sandboxv1alpha1.SandboxPodNameAnnotation: "annotated-pod-name", + "other-annotation": "other-value", }, }, Spec: sandboxv1alpha1.SandboxSpec{ @@ -888,7 +888,7 @@ func TestReconcilePod(t *testing.T) { livePod := &corev1.Pod{} podName := sandboxName // Check if there's an annotation with a non-empty value - if annotatedPod, exists := tc.sandbox.Annotations[SandboxPodNameAnnotation]; exists && annotatedPod != "" { + if annotatedPod, exists := tc.sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; exists && annotatedPod != "" { podName = annotatedPod } err = r.Get(t.Context(), types.NamespacedName{Name: podName, Namespace: sandboxNs}, livePod) diff --git a/extensions/controllers/sandboxclaim_controller.go b/extensions/controllers/sandboxclaim_controller.go index 458a54cc7..abef90a42 100644 --- a/extensions/controllers/sandboxclaim_controller.go +++ b/extensions/controllers/sandboxclaim_controller.go @@ -476,6 +476,9 @@ func (r *SandboxClaimReconciler) createSandbox(ctx 
context.Context, claim *exten sandbox.Annotations[asmetrics.TraceContextAnnotation] = tc } + // Track the sandbox template ref to be used by metrics collector + sandbox.Annotations[v1alpha1.SandboxTemplateRefAnnotation] = template.Name + template.Spec.PodTemplate.DeepCopyInto(&sandbox.Spec.PodTemplate) // TODO: this is a workaround, remove replica assignment related issue #202 replicas := int32(1) @@ -731,7 +734,7 @@ func (r *SandboxClaimReconciler) recordCreationLatencyMetric( // This is unlikely to happen; here for completeness only. if sandbox == nil { launchType = asmetrics.LaunchTypeUnknown - } else if sandbox.Annotations[sandboxcontrollers.SandboxPodNameAnnotation] != "" { + } else if sandbox.Annotations[v1alpha1.SandboxPodNameAnnotation] != "" { // Existence of the SandboxPodNameAnnotation implies the pod was adopted from a warm pool. launchType = asmetrics.LaunchTypeWarm } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 7da2ca6c3..37cb16113 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -72,6 +72,20 @@ var ( }, []string{"namespace", "sandbox_template", "launch_type", "warmpool_name", "pod_condition"}, ) + + // AgentSandboxesDesc describes the agent_sandboxes metric point-in-time counts. + // Labels: + // - namespace: the namespace of the sandbox + // - ready_condition: "true" | "false" + // - expired: "true" | "false" + // - launch_type: "warm" | "cold" + // - sandbox_template: sandboxTemplateRef + AgentSandboxesDesc = prometheus.NewDesc( + "agent_sandboxes", + "Monitor the point-in-time number of sandboxes in the cluster.", + []string{"namespace", "ready_condition", "expired", "launch_type", "sandbox_template"}, + nil, + ) ) // Init registers custom metrics with the global controller-runtime registry. 
diff --git a/internal/metrics/sandbox_collector.go b/internal/metrics/sandbox_collector.go
new file mode 100644
index 000000000..a84ba9953
--- /dev/null
+++ b/internal/metrics/sandbox_collector.go
@@ -0,0 +1,149 @@
+// Copyright 2026 The Kubernetes Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// nolint:revive
+package metrics
+
+import (
+	"context"
+	"time"
+
+	"github.com/go-logr/logr"
+	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+const (
+	metricsCollectTimeout = 5 * time.Second // deadline applied to the List call made on each Collect
+)
+
+// AgentSandboxesMetricKey holds one combination of agent_sandboxes label values; Collect uses it as a map key to count sandboxes sharing that combination.
+type AgentSandboxesMetricKey struct {
+	Namespace      string // namespace of the sandbox
+	ReadyCondition string // "true" or "false"
+	Expired        string // "true" or "false"
+	LaunchType     string // LaunchTypeWarm or LaunchTypeCold
+	Template       string // template ref annotation value, or "unknown" when absent
+}
+
+// NewAgentSandboxesConstMetric creates a new Prometheus ConstMetric for the agent_sandboxes gauge.
+func NewAgentSandboxesConstMetric(count int, key AgentSandboxesMetricKey) prometheus.Metric {
+	// Values are listed in the same order as the label names declared on AgentSandboxesDesc.
+	labelValues := []string{
+		key.Namespace,
+		key.ReadyCondition,
+		key.Expired,
+		key.LaunchType,
+		key.Template,
+	}
+
+	return prometheus.MustNewConstMetric(AgentSandboxesDesc, prometheus.GaugeValue, float64(count), labelValues...)
+}
+
+// RegisterSandboxCollector builds a SandboxCollector and registers it with the controller-runtime metrics registry; failures are only logged.
+func RegisterSandboxCollector(c client.Client, logger logr.Logger) {
+	err := metrics.Registry.Register(NewSandboxCollector(c, logger))
+	switch err.(type) {
+	case nil:
+	case prometheus.AlreadyRegisteredError:
+		logger.Info("SandboxCollector already registered, ignoring")
+	default:
+		logger.Error(err, "Failed to register SandboxCollector")
+	}
+}
+
+// SandboxCollector is a prometheus.Collector that reports sandbox counts computed when metrics are collected.
+type SandboxCollector struct {
+	client             client.Client    // used by Collect to list Sandboxes
+	logger             logr.Logger      // receives collection errors
+	agentSandboxesDesc *prometheus.Desc // descriptor sent by Describe
+}
+
+// NewSandboxCollector returns a SandboxCollector that lists sandboxes through c and logs to logger.
+func NewSandboxCollector(c client.Client, logger logr.Logger) *SandboxCollector {
+	collector := &SandboxCollector{client: c, logger: logger}
+	// Every collector uses the package-level agent_sandboxes descriptor.
+	collector.agentSandboxesDesc = AgentSandboxesDesc
+
+	return collector
+}
+
+// Describe sends the agent_sandboxes descriptor to ch.
+func (c *SandboxCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.agentSandboxesDesc
+}
+
+// Collect fetches sandboxes, calculates labels, and sends metrics to the channel.
+// Note: Using client.List to fetch all sandboxes in the cluster on every metrics scrape
+// introduces O(N) memory allocation and CPU overhead due to deep-copying thousands of objects.
+// While updating a GaugeVec in the Reconcile loop might be slightly harder to manage,
+// it operates in O(1) memory during scrapes and is generally more performant.
+// This is a known performance trade-off to keep the Reconcile loop simpler.
+func (c *SandboxCollector) Collect(ch chan<- prometheus.Metric) {
+	var sandboxList sandboxv1alpha1.SandboxList
+	ctx, cancel := context.WithTimeout(context.Background(), metricsCollectTimeout)
+	defer cancel()
+
+	// TODO(chw120): The current O(N) List call during metrics collection poses a scalability concern.
+	// In large clusters, frequent scrapes could lead to high CPU usage or OOM.
+	// This should be replaced with a more efficient implementation.
+	if err := c.client.List(ctx, &sandboxList); err != nil {
+		c.logger.Error(err, "Failed to list sandboxes for metrics collection")
+		return
+	}
+
+	counts := make(map[AgentSandboxesMetricKey]int)
+	for i := range sandboxList.Items {
+		sandbox := &sandboxList.Items[i] // point into the list instead of copying each Sandbox
+		readyConditionStr, expiredStr := "false", "false"
+		readyCond := meta.FindStatusCondition(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady))
+		if readyCond != nil {
+			if readyCond.Status == metav1.ConditionTrue {
+				readyConditionStr = "true"
+			}
+			if readyCond.Reason == sandboxv1alpha1.SandboxReasonExpired {
+				expiredStr = "true"
+			}
+		}
+
+		launchTypeStr := LaunchTypeCold // warm only when the pod-name annotation is non-empty
+		if sandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation] != "" {
+			launchTypeStr = LaunchTypeWarm
+		}
+
+		sandboxTemplateStr := "unknown"
+		// If a user manually creates a Sandbox without a SandboxClaim, it won't have the
+		// SandboxTemplateRefAnnotation. The collector correctly handles this by defaulting to "unknown".
+		if template := sandbox.Annotations[sandboxv1alpha1.SandboxTemplateRefAnnotation]; template != "" {
+			sandboxTemplateStr = template
+		}
+
+		key := AgentSandboxesMetricKey{
+			Namespace:      sandbox.Namespace,
+			ReadyCondition: readyConditionStr,
+			Expired:        expiredStr,
+			LaunchType:     launchTypeStr,
+			Template:       sandboxTemplateStr,
+		}
+		counts[key]++
+	}
+
+	for key, count := range counts {
+		ch <- NewAgentSandboxesConstMetric(count, key)
+	}
+}
diff --git a/internal/metrics/sandbox_collector_test.go b/internal/metrics/sandbox_collector_test.go
new file mode 100644
index 000000000..e314c3047
--- /dev/null
+++ b/internal/metrics/sandbox_collector_test.go
@@ -0,0 +1,190 @@
+// Copyright 2026 The Kubernetes Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// nolint:revive
+package metrics
+
+import (
+	"testing"
+
+	"github.com/go-logr/logr"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/stretchr/testify/require"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
+)
+
+func newFakeClient(objects ...runtime.Object) *fake.ClientBuilder {
+	scheme := runtime.NewScheme()
+	_ = sandboxv1alpha1.AddToScheme(scheme)
+	return fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(objects...)
+}
+
+func TestSandboxCollector(t *testing.T) {
+	testCases := []struct {
+		name           string
+		sandboxes      []runtime.Object
+		expectedCount  int
+		expectedLabels map[string]int
+	}{
+		{
+			name: "single ready cold unknown sandbox",
+			sandboxes: []runtime.Object{
+				&sandboxv1alpha1.Sandbox{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "sandbox-1",
+						Namespace: "default",
+					},
+					Status: sandboxv1alpha1.SandboxStatus{
+						Conditions: []metav1.Condition{
+							{
+								Type:   string(sandboxv1alpha1.SandboxConditionReady),
+								Status: metav1.ConditionTrue,
+							},
+						},
+					},
+				},
+			},
+			expectedCount: 1,
+			expectedLabels: map[string]int{
+				"expired:false launch_type:cold namespace:default ready_condition:true sandbox_template:unknown": 1,
+			},
+		},
+		{
+			name: "missing ready condition",
+			sandboxes: []runtime.Object{
+				&sandboxv1alpha1.Sandbox{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "sandbox-missing",
+						Namespace: "default",
+					},
+					Status: sandboxv1alpha1.SandboxStatus{
+						Conditions: nil,
+					},
+				},
+			},
+			expectedCount: 1,
+			expectedLabels: map[string]int{
+				"expired:false launch_type:cold namespace:default ready_condition:false sandbox_template:unknown": 1,
+			},
+		},
+		{
+			name: "mixed sandboxes",
+			sandboxes: []runtime.Object{
+				&sandboxv1alpha1.Sandbox{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "sandbox-1",
+						Namespace: "default",
+					},
+					Status: sandboxv1alpha1.SandboxStatus{
+						Conditions: []metav1.Condition{
+							{
+								Type:   string(sandboxv1alpha1.SandboxConditionReady),
+								Status: metav1.ConditionTrue,
+							},
+						},
+					},
+				},
+				&sandboxv1alpha1.Sandbox{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "sandbox-2",
+						Namespace: "test-ns",
+						Annotations: map[string]string{
+							sandboxv1alpha1.SandboxPodNameAnnotation:     "adopted-pod",
+							sandboxv1alpha1.SandboxTemplateRefAnnotation: "my-template",
+						},
+					},
+					Status: sandboxv1alpha1.SandboxStatus{
+						Conditions: []metav1.Condition{
+							{
+								Type:   string(sandboxv1alpha1.SandboxConditionReady),
+								Status: metav1.ConditionFalse,
+								Reason: sandboxv1alpha1.SandboxReasonExpired,
+							},
+						},
+					},
+				},
+				&sandboxv1alpha1.Sandbox{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "sandbox-3",
+						Namespace: "default",
+					},
+					Status: sandboxv1alpha1.SandboxStatus{
+						Conditions: []metav1.Condition{
+							{
+								Type:   string(sandboxv1alpha1.SandboxConditionReady),
+								Status: metav1.ConditionFalse,
+							},
+						},
+					},
+				},
+				&sandboxv1alpha1.Sandbox{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "sandbox-4",
+						Namespace: "default",
+					},
+					Status: sandboxv1alpha1.SandboxStatus{
+						Conditions: []metav1.Condition{
+							{
+								Type:   string(sandboxv1alpha1.SandboxConditionReady),
+								Status: metav1.ConditionFalse,
+							},
+						},
+					},
+				},
+			},
+			expectedCount: 3, // We expect 3 distinct metric series for the 4 sandboxes
+			expectedLabels: map[string]int{
+				"expired:false launch_type:cold namespace:default ready_condition:true sandbox_template:unknown":     1,
+				"expired:true launch_type:warm namespace:test-ns ready_condition:false sandbox_template:my-template": 1,
+				"expired:false launch_type:cold namespace:default ready_condition:false sandbox_template:unknown":    2,
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			fakeClient := newFakeClient(tc.sandboxes...).Build()
+			collector := NewSandboxCollector(fakeClient, logr.Discard())
+			registry := prometheus.NewRegistry()
+			registry.MustRegister(collector)
+			count, err := testutil.GatherAndCount(registry, "agent_sandboxes")
+			require.NoError(t, err)
+			require.Equal(t, tc.expectedCount, count)
+			metrics, err := registry.Gather()
+			require.NoError(t, err)
+			actualLabels := make(map[string]int)
+			for _, mf := range metrics {
+				if mf.GetName() == "agent_sandboxes" {
+					for _, m := range mf.GetMetric() {
+						labelStr := ""
+						for _, l := range m.GetLabel() {
+							labelStr += l.GetName() + ":" + l.GetValue() + " "
+						}
+						// Trim trailing space
+						if len(labelStr) > 0 {
+							labelStr = labelStr[:len(labelStr)-1]
+						}
+						actualLabels[labelStr] = int(m.GetGauge().GetValue())
+					}
+				}
+			}
+			require.Equal(t, tc.expectedLabels, actualLabels)
+		})
+	}
+}