Skip to content

Commit fbc07a2

Browse files
authored
Update default max parallelism to 4 and add perf tests (#4822)
* Update default max parallelism to 4 * Add perf tests * Collect metrics from perf test
1 parent 97dba19 commit fbc07a2

22 files changed

+1667
-15
lines changed

.devcontainer/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ ENV DEBIAN_FRONTEND=noninteractive
1414
RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
1515
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list \
1616
&& apt-get update \
17-
&& apt-get -y install --no-install-recommends bash-completion lsb-release graphviz zip nodejs npm python3-pip docker-ce-cli docker-compose-plugin\
17+
&& apt-get -y install --no-install-recommends bash-completion lsb-release graphviz zip nodejs npm python3-pip docker-ce-cli docker-compose-plugin gnuplot\
1818
# install az-cli
1919
&& curl -sL https://aka.ms/InstallAzureCLIDeb | bash - \
2020
# Temporary fix to avoid regression in v2.77

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ default.etcd
1212

1313
# Output of the test and coverage results
1414
reports/
15+
# Allow perf reports
16+
!v2/test/perf/reports/
1517

1618
bin/
1719
vendor/

Taskfile.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,34 @@ tasks:
760760
# to keep CI fast
761761
- go test -timeout 40m -v -run '{{default ".*" .TEST_FILTER}}' ./test
762762

763+
# To control as many variables as possible and get repeatable runs, it's recommended to
764+
# run this test against a real AKS cluster rather than a local kind cluster
765+
controller:test-perf:
766+
desc: Run perf tests against the current cluster
767+
dir: "{{.CONTROLLER_ROOT}}"
768+
cmds:
769+
- go test -timeout 30m -v -run '{{default ".*" .TEST_FILTER}}' ./test/perf
770+
771+
controller:plot-perf-metrics:
772+
desc: Plot CPU and memory charts for all unplotted perf metric CSVs.
773+
dir: "{{.CONTROLLER_ROOT}}"
774+
cmds:
775+
- |
776+
graph_dir="test/perf/reports/graphs"
777+
mkdir -p "$graph_dir"
778+
plotted=0
779+
for csv in test/perf/reports/metrics_*.csv; do
780+
[ -f "$csv" ] || continue
781+
base=$(basename "${csv}" .csv)
782+
if [ ! -f "${graph_dir}/${base}_cpu.png" ] || [ ! -f "${graph_dir}/${base}_memory.png" ]; then
783+
gnuplot -c test/perf/plot-metrics.gp "$csv" "$graph_dir"
784+
plotted=$((plotted + 1))
785+
fi
786+
done
787+
if [ "$plotted" -eq 0 ]; then
788+
echo "No unplotted CSVs found in test/perf/reports/"
789+
fi
790+
763791
controller:test-multitenant-integration-kind-ci:
764792
desc: Run live multitenant integration tests in kind.
765793
dir: "{{.CONTROLLER_ROOT}}"

docs/hugo/content/guide/aso-controller-settings-options.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -233,15 +233,15 @@ Required: True if using Entra resources, otherwise False
233233
### MAX_CONCURRENT_RECONCILES
234234

235235
MAX_CONCURRENT_RECONCILES is the number of threads/goroutines dedicated to reconciling each resource type.
236-
If not specified, the default is 1.
236+
If not specified, the default is 4.
237237

238238
IMPORTANT: Having MAX_CONCURRENT_RECONCILES set to N does not mean that ASO is limited to N interactions with
239239
Azure at any given time, because the control loop yields to another resource while it is not actively issuing HTTP
240240
calls to Azure. Any single resource only blocks the control-loop for its resource-type for as long as it takes to issue
241241
an HTTP call to Azure, view the result, and make a decision. In most cases the time taken to perform these actions
242242
(and thus how long the loop is blocked and preventing other resources from being acted upon) is a few hundred
243-
milliseconds to at most a second or two. In a typical 60s period, many hundreds or even thousands of resources
244-
can be managed with this set to 1.
243+
milliseconds to at most a second or two. In a typical 60s period, hundreds of resources
244+
for a given resource type can be managed with this set to 1.
245245

246246
MAX_CONCURRENT_RECONCILES applies to every registered resource type being watched/managed by ASO.
247247

v2/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
config/crd/generated
22
config/webhook/manifests.yaml
33
config/rbac/role.yaml
4+
test/perf/reports/graphs/
45
out/

v2/charts/azure-service-operator/values.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -248,16 +248,16 @@ tolerations: []
248248
affinity: {}
249249

250250
# MaxConcurrentReconciles is the number of threads/goroutines dedicated to reconciling each resource type.
251-
# If not specified, the default is 1.
251+
# If not specified, the default is 4.
252252
# IMPORTANT: Having MaxConcurrentReconciles set to N does not mean that ASO is limited to N interactions with
253253
# Azure at any given time, because the control loop yields to another resource while it is not actively issuing HTTP
254254
# calls to Azure. Any single resource only blocks the control-loop for its resource-type for as long as it takes to issue
255255
# an HTTP call to Azure, view the result, and make a decision. In most cases the time taken to perform these actions
256256
# (and thus how long the loop is blocked and preventing other resources from being acted upon) is a few hundred
257-
# milliseconds to at most a second or two. In a typical 60s period, many hundreds or even thousands of resources
258-
# can be managed with this set to 1.
257+
# milliseconds to at most a second or two. In a typical 60s period, hundreds of resources
258+
# for a given resource type can be managed with this set to 1.
259259
# MaxConcurrentReconciles applies to every registered resource type being watched/managed by ASO.
260-
maxConcurrentReconciles: 1
260+
maxConcurrentReconciles: 4
261261

262262
rateLimit:
263263
# mode configures the internal rate-limiting mode.

v2/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ require (
6464
k8s.io/apimachinery v0.35.2
6565
k8s.io/client-go v0.35.2
6666
k8s.io/klog/v2 v2.130.1
67+
k8s.io/metrics v0.35.2
6768
sigs.k8s.io/controller-runtime v0.23.1
6869
sigs.k8s.io/yaml v1.6.0
6970
)

v2/go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -943,6 +943,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
943943
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
944944
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE=
945945
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ=
946+
k8s.io/metrics v0.35.2 h1:PJRP88qeadR5evg4ZKJAh3NR3ICchwM51/Aidd0LHjc=
947+
k8s.io/metrics v0.35.2/go.mod h1:w1pJmSu2j8ftVI26MGcJtMnpmZ06oKwb4Enm+xVl06Q=
946948
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
947949
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
948950
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=

v2/internal/config/vars.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ const (
2323
)
2424

2525
var (
26-
DefaultMaxConcurrentReconciles = 1
26+
DefaultMaxConcurrentReconciles = 4
2727
DefaultSyncInterval = mustParseDuration(DefaultSyncIntervalString)
2828
)
2929

@@ -96,14 +96,14 @@ type Values struct {
9696
UserAgentSuffix string
9797

9898
// MaxConcurrentReconciles is the number of threads/goroutines dedicated to reconciling each resource type.
99-
// If not specified, the default is 1.
99+
// If not specified, the default is 4.
100100
// IMPORTANT: Having MaxConcurrentReconciles set to N does not mean that ASO is limited to N interactions with
101101
// Azure at any given time, because the control loop yields to another resource while it is not actively issuing HTTP
102102
// calls to Azure. Any single resource only blocks the control-loop for its resource-type for as long as it takes to issue
103103
// an HTTP call to Azure, view the result, and make a decision. In most cases the time taken to perform these actions
104104
// (and thus how long the loop is blocked and preventing other resources from being acted upon) is a few hundred
105-
// milliseconds to at most a second or two. In a typical 60s period, many hundreds or even thousands of resources
106-
// can be managed with this set to 1.
105+
// milliseconds to at most a second or two. In a typical 60s period, hundreds of resources
106+
// for a given resource type can be managed with this set to 1.
107107
// MaxConcurrentReconciles applies to every registered resource type being watched/managed by ASO.
108108
MaxConcurrentReconciles int
109109

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
/*
2+
Copyright (c) Microsoft Corporation.
3+
Licensed under the MIT license.
4+
*/
5+
6+
package podmetrics
7+
8+
import (
9+
"context"
10+
"fmt"
11+
"slices"
12+
"strings"
13+
"sync"
14+
"time"
15+
16+
"github.com/go-logr/logr"
17+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18+
"k8s.io/client-go/rest"
19+
metricsv1beta1 "k8s.io/metrics/pkg/apis/metrics/v1beta1"
20+
metricsclient "k8s.io/metrics/pkg/client/clientset/versioned"
21+
)
22+
23+
// Fallback settings used by NewMetricsCollector for any CollectorConfig field left at its zero value.
const (
	// DefaultNamespace names the namespace in which the ASO controller pods run.
	DefaultNamespace = "azureserviceoperator-system"

	// DefaultPodPrefix is the leading part of the ASO controller pod name.
	DefaultPodPrefix = "azureserviceoperator-controller-manager"

	// DefaultInterval is the period between successive polls of the pod metrics API.
	DefaultInterval = 5 * time.Second
)
33+
34+
// CollectorConfig carries the settings used to build a MetricsCollector.
// NewMetricsCollector substitutes the package defaults for any field that is left unset.
type CollectorConfig struct {
	// Namespace is the Kubernetes namespace whose pods are monitored.
	// An empty value selects DefaultNamespace.
	Namespace string

	// PodPrefix restricts collection to pods whose names begin with this prefix.
	// An empty value selects DefaultPodPrefix.
	PodPrefix string

	// Interval is the time between polls of the metrics API.
	// A zero or negative value selects DefaultInterval.
	Interval time.Duration
}
45+
46+
// MetricsCollector polls the Kubernetes metrics API at a regular interval and stores samples.
47+
type MetricsCollector struct {
48+
client metricsclient.Interface
49+
namespace string
50+
podPrefix string
51+
interval time.Duration
52+
53+
mu sync.Mutex
54+
samples []Sample
55+
start time.Time
56+
57+
cancel context.CancelFunc
58+
done chan struct{}
59+
}
60+
61+
// NewMetricsCollector creates a MetricsCollector that polls pod metrics from the given cluster.
62+
// It targets pods in the specified namespace whose names start with podPrefix.
63+
func NewMetricsCollector(cfg *rest.Config, collectorCfg CollectorConfig) (*MetricsCollector, error) {
64+
mc, err := metricsclient.NewForConfig(cfg)
65+
if err != nil {
66+
return nil, fmt.Errorf("creating metrics client: %w", err)
67+
}
68+
69+
namespace := collectorCfg.Namespace
70+
if namespace == "" {
71+
namespace = DefaultNamespace
72+
}
73+
74+
podPrefix := collectorCfg.PodPrefix
75+
if podPrefix == "" {
76+
podPrefix = DefaultPodPrefix
77+
}
78+
79+
interval := collectorCfg.Interval
80+
if interval <= 0 {
81+
interval = DefaultInterval
82+
}
83+
84+
return &MetricsCollector{
85+
client: mc,
86+
namespace: namespace,
87+
podPrefix: podPrefix,
88+
interval: interval,
89+
}, nil
90+
}
91+
92+
// CheckAvailable probes the metrics API to verify that metrics-server is reachable.
93+
// Returns an error if the metrics API is not available.
94+
func (mc *MetricsCollector) CheckAvailable(ctx context.Context) error {
95+
_, err := mc.client.MetricsV1beta1().PodMetricses(mc.namespace).List(ctx, metav1.ListOptions{Limit: 1})
96+
if err != nil {
97+
return fmt.Errorf("metrics-server not available in namespace %q: %w", mc.namespace, err)
98+
}
99+
return nil
100+
}
101+
102+
// Start begins collecting metrics in a background goroutine.
103+
// Call Stop to end collection.
104+
func (mc *MetricsCollector) Start(log logr.Logger) {
105+
mc.mu.Lock()
106+
mc.start = time.Now()
107+
mc.samples = nil
108+
mc.mu.Unlock()
109+
110+
ctx, cancel := context.WithCancel(context.Background())
111+
mc.cancel = cancel
112+
mc.done = make(chan struct{})
113+
114+
go func() {
115+
defer close(mc.done)
116+
ticker := time.NewTicker(mc.interval)
117+
defer ticker.Stop()
118+
119+
for {
120+
select {
121+
case <-ctx.Done():
122+
return
123+
case <-ticker.C:
124+
if err := mc.collect(ctx); err != nil {
125+
log.Error(err, "metrics collection error")
126+
}
127+
}
128+
}
129+
}()
130+
131+
log.Info("Metrics collector started",
132+
"namespace", mc.namespace,
133+
"podPrefix", mc.podPrefix,
134+
"interval", mc.interval)
135+
}
136+
137+
// Stop ends metrics collection and waits for the collector goroutine to exit.
138+
func (mc *MetricsCollector) Stop() {
139+
if mc.cancel != nil {
140+
mc.cancel()
141+
<-mc.done
142+
}
143+
}
144+
145+
// Samples returns a copy of all collected samples.
146+
func (mc *MetricsCollector) Samples() []Sample {
147+
mc.mu.Lock()
148+
defer mc.mu.Unlock()
149+
result := slices.Clone(mc.samples)
150+
return result
151+
}
152+
153+
// collect performs a single metrics API poll and stores the results.
154+
func (mc *MetricsCollector) collect(ctx context.Context) error {
155+
podMetricsList, err := mc.client.MetricsV1beta1().PodMetricses(mc.namespace).List(ctx, metav1.ListOptions{})
156+
if err != nil {
157+
return fmt.Errorf("listing pod metrics: %w", err)
158+
}
159+
160+
now := time.Now()
161+
mc.mu.Lock()
162+
defer mc.mu.Unlock()
163+
164+
for i := range podMetricsList.Items {
165+
pod := &podMetricsList.Items[i]
166+
if !strings.HasPrefix(pod.Name, mc.podPrefix) {
167+
continue
168+
}
169+
170+
mc.collectPodContainers(pod, now)
171+
}
172+
173+
return nil
174+
}
175+
176+
// collectPodContainers extracts metrics from each container in a pod.
177+
func (mc *MetricsCollector) collectPodContainers(pod *metricsv1beta1.PodMetrics, now time.Time) {
178+
for i := range pod.Containers {
179+
container := &pod.Containers[i]
180+
181+
cpuMillis := container.Usage.Cpu().MilliValue()
182+
memBytes := container.Usage.Memory().Value()
183+
184+
sample := Sample{
185+
Timestamp: now,
186+
Elapsed: now.Sub(mc.start),
187+
PodName: pod.Name,
188+
ContainerName: container.Name,
189+
CPUMillicores: float64(cpuMillis),
190+
MemoryBytes: memBytes,
191+
}
192+
mc.samples = append(mc.samples, sample)
193+
}
194+
}

0 commit comments

Comments
 (0)