Skip to content
77 changes: 74 additions & 3 deletions controllers/sandbox_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"reflect"
"time"

"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
Expand All @@ -36,16 +37,20 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/log"
ctrlMetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/predicate"

sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
extensionsv1alpha1 "sigs.k8s.io/agent-sandbox/extensions/api/v1alpha1"
asmetrics "sigs.k8s.io/agent-sandbox/internal/metrics"
)

const (
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
SandboxPodNameAnnotation = "agents.x-k8s.io/pod-name"
sandboxControllerFieldOwner = "sandbox-controller"
// sandboxLabel is a label key.
// NOTE(review): presumably its value is a hash of the sandbox name — confirm against the code that sets it.
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
// SandboxPodNameAnnotation is an annotation key on a Sandbox. SandboxCollector
// reports a sandbox as a warm launch when this annotation is present and non-empty.
SandboxPodNameAnnotation = "agents.x-k8s.io/pod-name"
// SandboxTemplateRefAnnotation is an annotation key on a Sandbox holding the name of
// the template it was created from. The SandboxClaim controller writes it when creating
// a Sandbox, and SandboxCollector uses it as the sandbox_template label value.
SandboxTemplateRefAnnotation = "agents.x-k8s.io/sandbox-template-ref"
// NOTE(review): presumably the field-manager name this controller uses when writing objects — confirm.
sandboxControllerFieldOwner = "sandbox-controller"
// metricsCollectTimeout bounds the Sandbox List call made on every metrics collection.
metricsCollectTimeout = 5 * time.Second
)

var (
Expand Down Expand Up @@ -615,6 +620,68 @@ func sandboxMarkedExpired(sandbox *sandboxv1alpha1.Sandbox) bool {
return cond != nil && cond.Reason == sandboxv1alpha1.SandboxReasonExpired
}

// SandboxCollector is a custom Prometheus collector that computes sandbox counts
// on demand, at collection time, instead of maintaining a stored gauge.
type SandboxCollector struct {
	// client is used to list Sandbox objects each time metrics are collected.
	client client.Client
	// agentSandboxesDesc is the descriptor announced by Describe.
	agentSandboxesDesc *prometheus.Desc
}

// Compile-time check that SandboxCollector satisfies prometheus.Collector, so a
// signature drift in Describe or Collect fails the build rather than registration.
var _ prometheus.Collector = (*SandboxCollector)(nil)

// NewSandboxCollector builds a SandboxCollector that lists sandboxes through c
// and reports them under the shared agent_sandboxes descriptor.
func NewSandboxCollector(c client.Client) *SandboxCollector {
	collector := new(SandboxCollector)
	collector.client = c
	collector.agentSandboxesDesc = asmetrics.AgentSandboxesDesc
	return collector
}

// Describe announces the single descriptor this collector reports by sending
// it on ch.
func (c *SandboxCollector) Describe(ch chan<- *prometheus.Desc) {
	desc := c.agentSandboxesDesc
	ch <- desc
}

// Collect lists the sandboxes visible to the collector's client, groups them by
// label combination, and sends one agent_sandboxes sample per distinct
// combination to ch, whose value is the number of sandboxes in that group.
//
// The List call is bounded by metricsCollectTimeout. If listing fails, the
// error is logged and no samples are sent for this collection.
func (c *SandboxCollector) Collect(ch chan<- prometheus.Metric) {
	// Collect receives no context from its caller, so derive a bounded one here.
	ctx, cancel := context.WithTimeout(context.Background(), metricsCollectTimeout)
	defer cancel()

	var sandboxList sandboxv1alpha1.SandboxList
	if err := c.client.List(ctx, &sandboxList); err != nil {
		log.FromContext(ctx).Error(err, "Failed to list sandboxes for metrics collection")
		return
	}

	counts := make(map[asmetrics.AgentSandboxesMetricKey]int)
	// Index into Items rather than ranging by value to avoid copying each Sandbox.
	for i := range sandboxList.Items {
		counts[sandboxMetricKey(&sandboxList.Items[i])]++
	}

	for key, count := range counts {
		ch <- asmetrics.NewAgentSandboxesConstMetric(count, key)
	}
}

// sandboxMetricKey derives the agent_sandboxes label combination for one sandbox.
func sandboxMetricKey(sandbox *sandboxv1alpha1.Sandbox) asmetrics.AgentSandboxesMetricKey {
	// Defaults apply when the Ready condition or the annotations are absent.
	key := asmetrics.AgentSandboxesMetricKey{
		ReadyCondition: "false",
		Expired:        "false",
		LaunchType:     asmetrics.LaunchTypeCold,
		Template:       "unknown",
	}

	// Both ready_condition and expired are read from the Ready condition.
	readyCond := meta.FindStatusCondition(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady))
	if readyCond != nil {
		if readyCond.Status == metav1.ConditionTrue {
			key.ReadyCondition = "true"
		}
		if readyCond.Reason == sandboxv1alpha1.SandboxReasonExpired || readyCond.Reason == extensionsv1alpha1.ClaimExpiredReason {
			key.Expired = "true"
		}
	}

	// A missing key reads as "" (also on a nil map), so a single lookup covers
	// both the "absent" and the "present but empty" cases.
	if sandbox.Annotations[SandboxPodNameAnnotation] != "" {
		key.LaunchType = asmetrics.LaunchTypeWarm
	}
	if template := sandbox.Annotations[SandboxTemplateRefAnnotation]; template != "" {
		key.Template = template
	}
	return key
}

// SetupWithManager sets up the controller with the Manager.
func (r *SandboxReconciler) SetupWithManager(mgr ctrl.Manager, concurrentWorkers int) error {
labelSelectorPredicate, err := predicate.LabelSelectorPredicate(metav1.LabelSelector{
Expand All @@ -629,6 +696,10 @@ func (r *SandboxReconciler) SetupWithManager(mgr ctrl.Manager, concurrentWorkers
if err != nil {
return err
}

// Register the custom Sandbox metric collector
ctrlMetrics.Registry.MustRegister(NewSandboxCollector(mgr.GetClient()))

return ctrl.NewControllerManagedBy(mgr).
For(&sandboxv1alpha1.Sandbox{}).
Owns(&corev1.Pod{}, builder.WithPredicates(labelSelectorPredicate)).
Expand Down
138 changes: 138 additions & 0 deletions controllers/sandbox_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -203,6 +205,142 @@ func TestComputeReadyCondition(t *testing.T) {
}
}

func TestSandboxCollector(t *testing.T) {
testCases := []struct {
name string
sandboxes []runtime.Object
expectedCount int
expectedLabels map[string]int
}{
{
name: "single ready cold unknown sandbox",
sandboxes: []runtime.Object{
&sandboxv1alpha1.Sandbox{
ObjectMeta: metav1.ObjectMeta{
Name: "sandbox-1",
Namespace: "default",
},
Status: sandboxv1alpha1.SandboxStatus{
Conditions: []metav1.Condition{
{
Type: string(sandboxv1alpha1.SandboxConditionReady),
Status: metav1.ConditionTrue,
},
},
},
},
},
expectedCount: 1,
expectedLabels: map[string]int{
"expired:false launch_type:cold ready_condition:true sandbox_template:unknown": 1,
},
},
{
name: "mixed sandboxes",
sandboxes: []runtime.Object{
&sandboxv1alpha1.Sandbox{
ObjectMeta: metav1.ObjectMeta{
Name: "sandbox-1",
Namespace: "default",
},
Status: sandboxv1alpha1.SandboxStatus{
Conditions: []metav1.Condition{
{
Type: string(sandboxv1alpha1.SandboxConditionReady),
Status: metav1.ConditionTrue,
},
},
},
},
&sandboxv1alpha1.Sandbox{
ObjectMeta: metav1.ObjectMeta{
Name: "sandbox-2",
Namespace: "default",
Annotations: map[string]string{
SandboxPodNameAnnotation: "adopted-pod",
SandboxTemplateRefAnnotation: "my-template",
},
},
Status: sandboxv1alpha1.SandboxStatus{
Conditions: []metav1.Condition{
{
Type: string(sandboxv1alpha1.SandboxConditionReady),
Status: metav1.ConditionFalse,
Reason: "SandboxExpired",
},
},
},
},
&sandboxv1alpha1.Sandbox{
ObjectMeta: metav1.ObjectMeta{
Name: "sandbox-3",
Namespace: "default",
},
Status: sandboxv1alpha1.SandboxStatus{
Conditions: []metav1.Condition{
{
Type: string(sandboxv1alpha1.SandboxConditionReady),
Status: metav1.ConditionFalse,
},
},
},
},
&sandboxv1alpha1.Sandbox{
ObjectMeta: metav1.ObjectMeta{
Name: "sandbox-4",
Namespace: "default",
},
Status: sandboxv1alpha1.SandboxStatus{
Conditions: []metav1.Condition{
{
Type: string(sandboxv1alpha1.SandboxConditionReady),
Status: metav1.ConditionFalse,
},
},
},
},
},
expectedCount: 3, // We expect 3 distinct metric series for the 4 sandboxes
expectedLabels: map[string]int{
"expired:false launch_type:cold ready_condition:true sandbox_template:unknown": 1,
"expired:true launch_type:warm ready_condition:false sandbox_template:my-template": 1,
"expired:false launch_type:cold ready_condition:false sandbox_template:unknown": 2,
},
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
fakeClient := newFakeClient(tc.sandboxes...)
collector := NewSandboxCollector(fakeClient)
registry := prometheus.NewRegistry()
registry.MustRegister(collector)
count, err := testutil.GatherAndCount(registry, "agent_sandboxes")
require.NoError(t, err)
require.Equal(t, tc.expectedCount, count)
metrics, err := registry.Gather()
require.NoError(t, err)
actualLabels := make(map[string]int)
for _, mf := range metrics {
if mf.GetName() == "agent_sandboxes" {
for _, m := range mf.GetMetric() {
labelStr := ""
for _, l := range m.GetLabel() {
labelStr += l.GetName() + ":" + l.GetValue() + " "
}
// Trim trailing space
if len(labelStr) > 0 {
labelStr = labelStr[:len(labelStr)-1]
}
actualLabels[labelStr] = int(m.GetGauge().GetValue())
}
}
}
require.Equal(t, tc.expectedLabels, actualLabels)
})
}
}

func TestReconcile(t *testing.T) {
sandboxName := "sandbox-name"
sandboxNs := "sandbox-ns"
Expand Down
3 changes: 3 additions & 0 deletions extensions/controllers/sandboxclaim_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,9 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
sandbox.Annotations[asmetrics.TraceContextAnnotation] = tc
}

// Track the sandbox template ref to be used by metrics collector
sandbox.Annotations[sandboxcontrollers.SandboxTemplateRefAnnotation] = template.Name

template.Spec.PodTemplate.DeepCopyInto(&sandbox.Spec.PodTemplate)
// TODO: this is a workaround, remove replica assignment related issue #202
replicas := int32(1)
Expand Down
34 changes: 34 additions & 0 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,19 @@ var (
},
[]string{"namespace", "sandbox_template", "launch_type", "warmpool_name", "pod_condition"},
)

// AgentSandboxesDesc describes the agent_sandboxes metric: a point-in-time
// count of sandboxes, with one series per distinct label combination.
// Samples built from this descriptor must supply label values in the order
// the label names are declared below.
// Labels:
// - ready_condition: "true" | "false"
// - expired: "true" | "false"
// - launch_type: "warm" | "cold"
// - sandbox_template: name of the template the sandbox was created from,
//   or "unknown" when the sandbox carries no template reference
AgentSandboxesDesc = prometheus.NewDesc(
"agent_sandboxes",
"Monitor the point-in-time number of sandboxes in the cluster.",
[]string{"ready_condition", "expired", "launch_type", "sandbox_template"},
nil, // no constant labels
)
)

// Init registers custom metrics with the global controller-runtime registry.
Expand All @@ -75,3 +88,24 @@ func RecordClaimStartupLatency(startTime time.Time, launchType, templateName str
func RecordSandboxClaimCreation(namespace, templateName, launchType, warmPoolName, podCondition string) {
SandboxClaimCreationTotal.WithLabelValues(namespace, templateName, launchType, warmPoolName, podCondition).Inc()
}

// AgentSandboxesMetricKey is used to aggregate counts for identical Sandboxes metric label combinations.
// It contains only string fields, so it is comparable and can be used directly as a map key.
// Each field holds the value of one agent_sandboxes label.
type AgentSandboxesMetricKey struct {
// ReadyCondition is the value of the ready_condition label.
ReadyCondition string
// Expired is the value of the expired label.
Expired string
// LaunchType is the value of the launch_type label.
LaunchType string
// Template is the value of the sandbox_template label.
Template string
}

// NewAgentSandboxesConstMetric builds a constant gauge sample for the
// agent_sandboxes metric, with count as its value and the fields of key as its
// label values.
func NewAgentSandboxesConstMetric(count int, key AgentSandboxesMetricKey) prometheus.Metric {
	// Order must match the label names declared on AgentSandboxesDesc.
	labelValues := []string{
		key.ReadyCondition,
		key.Expired,
		key.LaunchType,
		key.Template,
	}
	value := float64(count)
	return prometheus.MustNewConstMetric(AgentSandboxesDesc, prometheus.GaugeValue, value, labelValues...)
}