Azure
diff --git a/‎.devcontainer/Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎.devcontainer/Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Taskfile.yml‎
Lines changed: 28 additions & 0 deletions b/‎Taskfile.yml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎docs/hugo/content/guide/aso-controller-settings-options.md‎
Lines changed: 3 additions & 3 deletions b/‎docs/hugo/content/guide/aso-controller-settings-options.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎v2/.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎v2/.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎v2/charts/azure-service-operator/values.yaml‎
Lines changed: 4 additions & 4 deletions b/‎v2/charts/azure-service-operator/values.yaml‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎v2/go.mod‎
Lines changed: 1 addition & 0 deletions b/‎v2/go.mod‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎v2/go.sum‎
Lines changed: 2 additions & 0 deletions b/‎v2/go.sum‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎v2/internal/config/vars.go‎
Lines changed: 4 additions & 4 deletions b/‎v2/internal/config/vars.go‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎v2/internal/testcommon/podmetrics/collector.go‎
Lines changed: 194 additions & 0 deletions b/‎v2/internal/testcommon/podmetrics/collector.go‎
Lines changed: 194 additions & 0 deletions
@@ -14,7 +14,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
     && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list \
     && apt-get update \
-    && apt-get -y install --no-install-recommends bash-completion lsb-release graphviz zip nodejs npm python3-pip docker-ce-cli docker-compose-plugin\
+    && apt-get -y install --no-install-recommends bash-completion lsb-release graphviz zip nodejs npm python3-pip docker-ce-cli docker-compose-plugin gnuplot\
     # install az-cli
     && curl -sL https://aka.ms/InstallAzureCLIDeb | bash - \
     # Temporary fix to avoid regression in v2.77
 
@@ -12,6 +12,8 @@ default.etcd
 
 # Output of the test and coverage results
 reports/
+# Allow perf reports
+!v2/test/perf/reports/
 
 bin/
 vendor/
 
@@ -760,6 +760,34 @@ tasks:
       # to keep CI fast
       - go test -timeout 40m -v -run '{{default ".*" .TEST_FILTER}}' ./test
 
+  # To control as many variables as possible and get repeatable runs, it's recommended to 
+  # run this test against a real AKS cluster rather than a local kind cluster
+  controller:test-perf:
+    desc: Run perf tests against the current cluster
+    dir: "{{.CONTROLLER_ROOT}}"
+    cmds:
+      - go test -timeout 30m -v -run '{{default ".*" .TEST_FILTER}}' ./test/perf
+
+  # Renders charts with gnuplot for every test/perf/reports/metrics_*.csv that is missing either
+  # its <name>_cpu.png or <name>_memory.png under test/perf/reports/graphs; CSVs that already
+  # have both images are skipped. gnuplot must be on PATH.
+  # NOTE(review): the exit status of gnuplot is not checked, so a failed plot is still counted
+  # as plotted and the loop carries on - confirm whether the task should fail instead.
+  controller:plot-perf-metrics:
+    desc: Plot CPU and memory charts for all unplotted perf metric CSVs.
+    dir: "{{.CONTROLLER_ROOT}}"
+    cmds:
+      - |
+        graph_dir="test/perf/reports/graphs"
+        mkdir -p "$graph_dir"
+        plotted=0
+        for csv in test/perf/reports/metrics_*.csv; do
+          [ -f "$csv" ] || continue
+          base=$(basename "${csv}" .csv)
+          if [ ! -f "${graph_dir}/${base}_cpu.png" ] || [ ! -f "${graph_dir}/${base}_memory.png" ]; then
+            gnuplot -c test/perf/plot-metrics.gp "$csv" "$graph_dir"
+            plotted=$((plotted + 1))
+          fi
+        done
+        if [ "$plotted" -eq 0 ]; then
+          echo "No unplotted CSVs found in test/perf/reports/"
+        fi
+
   controller:test-multitenant-integration-kind-ci:
     desc: Run live multitenant integration tests in kind.
     dir: "{{.CONTROLLER_ROOT}}"
 
@@ -233,15 +233,15 @@ Required: True if using Entra resources, otherwise False
 ### MAX_CONCURRENT_RECONCILES
 
 MAX_CONCURRENT_RECONCILES is the number of threads/goroutines dedicated to reconciling each resource type.
-If not specified, the default is 1.
+If not specified, the default is 4.
 
 IMPORTANT: Having MAX_CONCURRENT_RECONCILES set to N does not mean that ASO is limited to N interactions with
 Azure at any given time, because the control loop yields to another resource while it is not actively issuing HTTP
 calls to Azure. Any single resource only blocks the control-loop for its resource-type for as long as it takes to issue
 an HTTP call to Azure, view the result, and make a decision. In most cases the time taken to perform these actions
 (and thus how long the loop is blocked and preventing other resources from being acted upon) is a few hundred
-milliseconds to at most a second or two. In a typical 60s period, many hundreds or even thousands of resources
-can be managed with this set to 1.
+milliseconds to at most a second or two. In a typical 60s period, hundreds of resources
+for a given resource type can be managed with this set to 1.
 
 MAX_CONCURRENT_RECONCILES applies to every registered resource type being watched/managed by ASO.
 
 
@@ -1,4 +1,5 @@
 config/crd/generated
 config/webhook/manifests.yaml
 config/rbac/role.yaml
+test/perf/reports/graphs/
 out/
@@ -248,16 +248,16 @@ tolerations: []
 affinity: {}
 
 # MaxConcurrentReconciles is the number of threads/goroutines dedicated to reconciling each resource type.
-# If not specified, the default is 1.
+# If not specified, the default is 4.
 # IMPORTANT: Having MaxConcurrentReconciles set to N does not mean that ASO is limited to N interactions with
 # Azure at any given time, because the control loop yields to another resource while it is not actively issuing HTTP
 # calls to Azure. Any single resource only blocks the control-loop for its resource-type for as long as it takes to issue
 # an HTTP call to Azure, view the result, and make a decision. In most cases the time taken to perform these actions
 # (and thus how long the loop is blocked and preventing other resources from being acted upon) is a few hundred
-# milliseconds to at most a second or two. In a typical 60s period, many hundreds or even thousands of resources
-# can be managed with this set to 1.
+# milliseconds to at most a second or two. In a typical 60s period, hundreds of resources
+# for a given resource type can be managed with this set to 1.
 # MaxConcurrentReconciles applies to every registered resource type being watched/managed by ASO.
-maxConcurrentReconciles: 1
+maxConcurrentReconciles: 4
 
 rateLimit:
   # mode configures the internal rate-limiting mode.
 
@@ -64,6 +64,7 @@ require (
 	k8s.io/apimachinery v0.35.2
 	k8s.io/client-go v0.35.2
 	k8s.io/klog/v2 v2.130.1
+	k8s.io/metrics v0.35.2
 	sigs.k8s.io/controller-runtime v0.23.1
 	sigs.k8s.io/yaml v1.6.0
 )
 
@@ -943,6 +943,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
 k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
 k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE=
 k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ=
+k8s.io/metrics v0.35.2 h1:PJRP88qeadR5evg4ZKJAh3NR3ICchwM51/Aidd0LHjc=
+k8s.io/metrics v0.35.2/go.mod h1:w1pJmSu2j8ftVI26MGcJtMnpmZ06oKwb4Enm+xVl06Q=
 k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
 k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
 rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
 
@@ -23,7 +23,7 @@ const (
 )
 
 var (
-	DefaultMaxConcurrentReconciles = 1
+	DefaultMaxConcurrentReconciles = 4
 	DefaultSyncInterval            = mustParseDuration(DefaultSyncIntervalString)
 )
 
@@ -96,14 +96,14 @@ type Values struct {
 	UserAgentSuffix string
 
 	// MaxConcurrentReconciles is the number of threads/goroutines dedicated to reconciling each resource type.
-	// If not specified, the default is 1.
+	// If not specified, the default is 4.
 	// IMPORTANT: Having MaxConcurrentReconciles set to N does not mean that ASO is limited to N interactions with
 	// Azure at any given time, because the control loop yields to another resource while it is not actively issuing HTTP
 	// calls to Azure. Any single resource only blocks the control-loop for its resource-type for as long as it takes to issue
 	// an HTTP call to Azure, view the result, and make a decision. In most cases the time taken to perform these actions
 	// (and thus how long the loop is blocked and preventing other resources from being acted upon) is a few hundred
-	// milliseconds to at most a second or two. In a typical 60s period, many hundreds or even thousands of resources
-	// can be managed with this set to 1.
+	// milliseconds to at most a second or two. In a typical 60s period, hundreds of resources
+	// for a given resource type can be managed with this set to 1.
 	// MaxConcurrentReconciles applies to every registered resource type being watched/managed by ASO.
 	MaxConcurrentReconciles int
 
 
@@ -0,0 +1,194 @@
+/*
+Copyright (c) Microsoft Corporation.
+Licensed under the MIT license.
+*/
+
+package podmetrics
+
+import (
+	"context"
+	"fmt"
+	"slices"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/go-logr/logr"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/rest"
+	metricsv1beta1 "k8s.io/metrics/pkg/apis/metrics/v1beta1"
+	metricsclient "k8s.io/metrics/pkg/client/clientset/versioned"
+)
+
+const (
+	// DefaultNamespace is the namespace where ASO controller pods run.
+	// NewMetricsCollector uses it when CollectorConfig.Namespace is empty.
+	DefaultNamespace = "azureserviceoperator-system"
+
+	// DefaultPodPrefix is the prefix of the ASO controller pod name.
+	// NewMetricsCollector uses it when CollectorConfig.PodPrefix is empty.
+	DefaultPodPrefix = "azureserviceoperator-controller-manager"
+
+	// DefaultInterval is how frequently to poll for pod metrics.
+	// NewMetricsCollector uses it when CollectorConfig.Interval is zero or negative.
+	DefaultInterval = 5 * time.Second
+)
+
+// CollectorConfig holds configuration for a MetricsCollector.
+// Every field is optional: NewMetricsCollector substitutes the matching Default* constant
+// for any field left at an unusable value.
+type CollectorConfig struct {
+	// Namespace is the Kubernetes namespace to monitor.
+	// An empty string selects DefaultNamespace.
+	Namespace string
+
+	// PodPrefix filters pods whose names start with this prefix.
+	// An empty string selects DefaultPodPrefix.
+	PodPrefix string
+
+	// Interval is how often to poll the metrics API.
+	// A zero or negative value selects DefaultInterval.
+	Interval time.Duration
+}
+
+// MetricsCollector polls the Kubernetes metrics API at a regular interval and stores samples.
+// Create one with NewMetricsCollector, then use Start, Stop and Samples.
+type MetricsCollector struct {
+	// Polling settings, populated by NewMetricsCollector.
+	client    metricsclient.Interface
+	namespace string
+	podPrefix string
+	interval  time.Duration
+
+	// mu guards samples and start, which are shared between the polling goroutine and callers.
+	mu      sync.Mutex
+	samples []Sample
+	start   time.Time
+
+	// cancel ends the polling goroutine launched by Start; done is closed once that goroutine
+	// has returned. Neither field is guarded by mu.
+	cancel context.CancelFunc
+	done   chan struct{}
+}
+
+// NewMetricsCollector builds a MetricsCollector backed by a metrics API client created from cfg.
+// Fields of collectorCfg that are unset fall back to DefaultNamespace, DefaultPodPrefix and
+// DefaultInterval; a zero or negative Interval counts as unset.
+// It returns an error if the metrics client cannot be constructed.
+func NewMetricsCollector(cfg *rest.Config, collectorCfg CollectorConfig) (*MetricsCollector, error) {
+	client, err := metricsclient.NewForConfig(cfg)
+	if err != nil {
+		return nil, fmt.Errorf("creating metrics client: %w", err)
+	}
+
+	// Begin with the caller's settings, then patch in a default for anything left unset.
+	collector := &MetricsCollector{
+		client:    client,
+		namespace: collectorCfg.Namespace,
+		podPrefix: collectorCfg.PodPrefix,
+		interval:  collectorCfg.Interval,
+	}
+	if collector.namespace == "" {
+		collector.namespace = DefaultNamespace
+	}
+	if collector.podPrefix == "" {
+		collector.podPrefix = DefaultPodPrefix
+	}
+	if collector.interval <= 0 {
+		collector.interval = DefaultInterval
+	}
+
+	return collector, nil
+}
+
+// CheckAvailable verifies that the metrics API answers for the collector's namespace by
+// issuing a pod-metrics list request limited to one item and discarding the result.
+// It returns a wrapped error if that request fails, and nil otherwise.
+func (mc *MetricsCollector) CheckAvailable(ctx context.Context) error {
+	// Only reachability matters here, so ask for as little data as possible.
+	probe := metav1.ListOptions{Limit: 1}
+	if _, err := mc.client.MetricsV1beta1().PodMetricses(mc.namespace).List(ctx, probe); err != nil {
+		return fmt.Errorf("metrics-server not available in namespace %q: %w", mc.namespace, err)
+	}
+
+	return nil
+}
+
+// Start begins collecting metrics in a background goroutine, polling once per configured
+// interval. Samples from any earlier run are discarded and the origin used for each sample's
+// Elapsed value is reset to the current time.
+//
+// If a collection started by a previous call is still running it is stopped first, so its
+// goroutine is not leaked and cannot append samples to the new run.
+//
+// Errors from individual polls are reported through log and do not end collection.
+// Call Stop to end collection. The cancel and done fields are not guarded by the mutex, so
+// Start and Stop must not be called concurrently with each other.
+func (mc *MetricsCollector) Start(log logr.Logger) {
+	// Stop does nothing before the first Start and returns immediately once a run has
+	// already been stopped, so calling it unconditionally is safe.
+	mc.Stop()
+
+	mc.mu.Lock()
+	mc.start = time.Now()
+	mc.samples = nil
+	mc.mu.Unlock()
+
+	ctx, cancel := context.WithCancel(context.Background())
+	done := make(chan struct{})
+	mc.cancel = cancel
+	mc.done = done
+
+	go func() {
+		// Close the channel created for this run rather than re-reading the struct field.
+		defer close(done)
+		ticker := time.NewTicker(mc.interval)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+				if err := mc.collect(ctx); err != nil {
+					log.Error(err, "metrics collection error")
+				}
+			}
+		}
+	}()
+
+	log.Info("Metrics collector started",
+		"namespace", mc.namespace,
+		"podPrefix", mc.podPrefix,
+		"interval", mc.interval)
+}
+
+// Stop cancels the polling goroutine and blocks until it has exited.
+// It does nothing if Start has never been called.
+func (mc *MetricsCollector) Stop() {
+	if mc.cancel == nil {
+		// Nothing was started, so there is no goroutine to wait for.
+		return
+	}
+
+	mc.cancel()
+	<-mc.done
+}
+
+// Samples returns a snapshot of the samples gathered so far.
+// The returned slice is a copy taken under the collector's mutex, so later polls do not
+// modify it.
+func (mc *MetricsCollector) Samples() []Sample {
+	mc.mu.Lock()
+	defer mc.mu.Unlock()
+
+	return slices.Clone(mc.samples)
+}
+
+// collect runs one poll: it lists pod metrics for the collector's namespace and records a
+// sample for every container of each pod whose name starts with the configured prefix.
+// It returns a wrapped error if the list request fails; in that case nothing is recorded.
+func (mc *MetricsCollector) collect(ctx context.Context) error {
+	listing, err := mc.client.MetricsV1beta1().PodMetricses(mc.namespace).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return fmt.Errorf("listing pod metrics: %w", err)
+	}
+
+	// A single timestamp is shared by every sample produced from this listing.
+	polledAt := time.Now()
+
+	mc.mu.Lock()
+	defer mc.mu.Unlock()
+
+	pods := listing.Items
+	for idx := range pods {
+		if strings.HasPrefix(pods[idx].Name, mc.podPrefix) {
+			mc.collectPodContainers(&pods[idx], polledAt)
+		}
+	}
+
+	return nil
+}
+
+// collectPodContainers appends one Sample per container in pod, stamped with now.
+// CPU usage is recorded in millicores and memory usage in bytes.
+// It reads mc.start and appends to mc.samples without locking; collect calls it while
+// holding the collector's mutex.
+func (mc *MetricsCollector) collectPodContainers(pod *metricsv1beta1.PodMetrics, now time.Time) {
+	// Every container in this poll shares the same offset from the start of collection.
+	elapsed := now.Sub(mc.start)
+
+	for idx := range pod.Containers {
+		container := &pod.Containers[idx]
+
+		mc.samples = append(mc.samples, Sample{
+			Timestamp:     now,
+			Elapsed:       elapsed,
+			PodName:       pod.Name,
+			ContainerName: container.Name,
+			CPUMillicores: float64(container.Usage.Cpu().MilliValue()),
+			MemoryBytes:   container.Usage.Memory().Value(),
+		})
+	}
+}
Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,7 @@ require (`
`64`	`64`	`k8s.io/apimachinery v0.35.2`
`65`	`65`	`k8s.io/client-go v0.35.2`
`66`	`66`	`k8s.io/klog/v2 v2.130.1`
	`67`	`+ k8s.io/metrics v0.35.2`
`67`	`68`	`sigs.k8s.io/controller-runtime v0.23.1`
`68`	`69`	`sigs.k8s.io/yaml v1.6.0`
`69`	`70`	`)`