datum-cloud · zachsmith1 · Sep 18, 2025 · Sep 18, 2025 · Sep 19, 2025 · Sep 19, 2025
diff --git a/Taskfile.yaml b/Taskfile.yaml
@@ -497,3 +497,139 @@ tasks:
             echo "🎉 All Prometheus rule tests passed."
           fi
       silent: false
+
+  # perf:run — renders the perf-runner RBAC and Job templates with sed, applies
+  # them through `task test-infra:kubectl`, waits for the Job to complete and
+  # downloads results.json / report.html into reports/perf/<test_id>/.
+  # Knobs come from the environment or from KEY=value tokens passed after `--`.
+  perf:run:
+    desc: Run Milo end-to-end performance scenario and download results
+    silent: true
+    cmds:
+      - |
+        set -euo pipefail
+        # Parse CLI key=value overrides passed after -- and export as env
+        for kv in {{.CLI_ARGS}}; do
+          case "$kv" in
+            *=*) key="${kv%%=*}"; val="${kv#*=}"; export "$key=$val" ;;
+            *) : ;; # ignore non key=value tokens
+          esac
+        done
+
+        # Resolve every knob: environment / CLI override first, then the default.
+        NS="${NS:-milo-system}"
+        MILO_NS="${MILO_NAMESPACE:-milo-system}"
+        VM_NS="${VM_NAMESPACE:-telemetry-system}"
+        VM_SVC_NAME="${VM_SERVICE_NAME:-vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack}"
+        VM_PORT="${VM_PORT:-8428}"
+        VM_BASE_URL="${VM_BASE_URL:-http://${VM_SVC_NAME}.${VM_NS}.svc.cluster.local:${VM_PORT}}"
+        APISERVER_REGEX="${APISERVER_POD_REGEX:-milo-apiserver.*}"
+        ETCD_REGEX="${ETCD_POD_REGEX:-etcd.*}"
+        MILO_KUBECONFIG_SECRET_NAME="${MILO_KUBECONFIG_SECRET_NAME:-milo-controller-manager-kubeconfig}"
+        MILO_KUBECONFIG_SECRET_KEY="${MILO_KUBECONFIG_SECRET_KEY:-kubeconfig}"
+        # NOTE(review): MILO_KUBECONFIG_PATH and OUT_DIR are resolved here but no
+        # sed expression below substitutes them into the Job template — confirm
+        # whether the template is expected to consume them.
+        MILO_KUBECONFIG_PATH="${MILO_KUBECONFIG_PATH:-/work/milo-kubeconfig}"
+        NUM_PROJECTS="${NUM_PROJECTS:-{{default "100" .NUM_PROJECTS}}}"
+        NUM_SECRETS_PER_PROJECT="${NUM_SECRETS_PER_PROJECT:-{{default "100" .NUM_SECRETS_PER_PROJECT}}}"
+        NUM_CONFIGMAPS_PER_PROJECT="${NUM_CONFIGMAPS_PER_PROJECT:-{{default "100" .NUM_CONFIGMAPS_PER_PROJECT}}}"
+        PROJECT_CONCURRENCY="${PROJECT_CONCURRENCY:-{{default "4" .PROJECT_CONCURRENCY}}}"
+        OBJECT_CONCURRENCY="${OBJECT_CONCURRENCY:-{{default "8" .OBJECT_CONCURRENCY}}}"
+        RUN_OBJECTS_PHASE="${RUN_OBJECTS_PHASE:-{{default "true" .RUN_OBJECTS_PHASE}}}"
+        OUT_DIR="${OUT_DIR:-{{default "/work/out" .OUT_DIR}}}"
+        STABILIZE_SECONDS="${STABILIZE_SECONDS:-{{default "90" .STABILIZE_SECONDS}}}"
+        MEASURE_WINDOW="${MEASURE_WINDOW:-{{default "2m" .MEASURE_WINDOW}}}"
+        ORG_NAME="${ORG_NAME:-{{default "" .ORG_NAME}}}"
+
+        echo "🔎 Checking Milo kubeconfig …"
+        if [ ! -f ".milo/kubeconfig" ]; then
+          echo "Error: .milo/kubeconfig not found. Run 'task dev:setup' first." >&2
+          exit 1
+        fi
+
+        echo "🔐 Ensuring perf-runner RBAC is applied …"
+        sed "s/NAMESPACE_PLACEHOLDER/${NS}/g" test/performance/config/perf-runner-rbac.yaml | task test-infra:kubectl -- apply -f -
+
+        echo "🗂  Publishing perf script as ConfigMap …"
+        task test-infra:kubectl -- -n "${NS}" create configmap perf-script \
+          --from-file=perf_run.py=test/performance/scripts/perf_run.py \
+          --dry-run=client -o yaml | task test-infra:kubectl -- apply -f -
+
+        # A Job's pod template is immutable, so applying over a leftover Job with
+        # different knobs is rejected; with identical knobs the apply is a no-op
+        # and the wait below would return immediately on the old Complete
+        # condition. Remove any previous run first.
+        task test-infra:kubectl -- -n "${NS}" delete job/perf-runner --ignore-not-found
+
+        echo "🚀 Launching perf runner Job …"
+        # MILO_NAMESPACE_PLACEHOLDER must be replaced before NAMESPACE_PLACEHOLDER,
+        # because the latter is a substring of the former.
+        sed \
+          -e "s/MILO_NAMESPACE_PLACEHOLDER/${MILO_NS}/g" \
+          -e "s/NAMESPACE_PLACEHOLDER/${NS}/g" \
+          -e "s#VM_BASE_URL_PLACEHOLDER#${VM_BASE_URL}#g" \
+          -e "s/APISERVER_REGEX_PLACEHOLDER/${APISERVER_REGEX}/g" \
+          -e "s/ETCD_REGEX_PLACEHOLDER/${ETCD_REGEX}/g" \
+          -e "s/NUM_PROJECTS_PLACEHOLDER/${NUM_PROJECTS}/g" \
+          -e "s/NUM_SECRETS_PLACEHOLDER/${NUM_SECRETS_PER_PROJECT}/g" \
+          -e "s/NUM_CONFIGMAPS_PLACEHOLDER/${NUM_CONFIGMAPS_PER_PROJECT}/g" \
+          -e "s/STABILIZE_SECONDS_PLACEHOLDER/${STABILIZE_SECONDS}/g" \
+          -e "s/MEASURE_WINDOW_PLACEHOLDER/${MEASURE_WINDOW}/g" \
+          -e "s/ORG_NAME_PLACEHOLDER/${ORG_NAME}/g" \
+          -e "s/PROJECT_CONCURRENCY_PLACEHOLDER/${PROJECT_CONCURRENCY}/g" \
+          -e "s/OBJECT_CONCURRENCY_PLACEHOLDER/${OBJECT_CONCURRENCY}/g" \
+          -e "s/RUN_OBJECTS_PHASE_PLACEHOLDER/${RUN_OBJECTS_PHASE}/g" \
+          -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \
+          -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \
+          test/performance/config/perf-runner-job.yaml | task test-infra:kubectl -- apply -f -
+
+        echo "⏳ Waiting for Job completion …"
+        if ! task test-infra:kubectl -- -n "${NS}" wait --for=condition=Complete job/perf-runner --timeout=45m; then
+          # Surface the runner output so a failed or timed-out run can be diagnosed.
+          echo "Error: job/perf-runner did not complete; recent logs follow." >&2
+          task test-infra:kubectl -- -n "${NS}" logs job/perf-runner --tail=100 >&2 || true
+          exit 1
+        fi
+
+        echo "⬇️  Downloading results …"
+        mkdir -p reports/perf
+        # Prefer ConfigMap (works even if pod already terminated)
+        TEST_ID=$(task test-infra:kubectl -- -n "${NS}" get cm perf-results -o jsonpath='{.data.test_id}' 2>/dev/null || true)
+        OUT_DIR_LOCAL="reports/perf/${TEST_ID:-latest}"
+        mkdir -p "$OUT_DIR_LOCAL"
+        task test-infra:kubectl -- -n "${NS}" get cm perf-results -o jsonpath='{.data.results\.json}' > "$OUT_DIR_LOCAL/results.json" || true
+        task test-infra:kubectl -- -n "${NS}" get cm perf-results -o jsonpath='{.data.report\.html}' > "$OUT_DIR_LOCAL/report.html" || true
+        # Fallback to copying from the pod if ConfigMap wasn't available
+        if [ ! -s "$OUT_DIR_LOCAL/results.json" ] || [ ! -s "$OUT_DIR_LOCAL/report.html" ]; then
+          POD=$(task test-infra:kubectl -- -n "${NS}" get pods -l job-name=perf-runner -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+          if [ -n "$POD" ]; then
+            task test-infra:kubectl -- -n "${NS}" cp "$POD:/work/out/results.json" "$OUT_DIR_LOCAL/results.json" || true
+            task test-infra:kubectl -- -n "${NS}" cp "$POD:/work/out/report.html" "$OUT_DIR_LOCAL/report.html" || true
+          fi
+        fi
+        echo "✅ Results saved to $OUT_DIR_LOCAL"
+
+  # perf:cleanup — reads test_id / org_name of the last run from the perf-results
+  # ConfigMap (or from TEST_ID / ORG_NAME overrides), launches the perf-cleanup
+  # Job, waits for it, then removes the runner Jobs and ConfigMaps from NS.
+  # Overrides come from the environment or from KEY=value tokens after `--`.
+  perf:cleanup:
+    desc: Cleanup resources created by the last perf run (org/projects/secrets/configmaps)
+    silent: true
+    cmds:
+      - |
+        set -euo pipefail
+        # Parse CLI key=value overrides passed after -- and export as env, so that
+        # e.g. `task perf:cleanup -- NS=other TEST_ID=...` targets the same
+        # namespace/run that `task perf:run -- NS=other` used.
+        for kv in {{.CLI_ARGS}}; do
+          case "$kv" in
+            *=*) key="${kv%%=*}"; val="${kv#*=}"; export "$key=$val" ;;
+            *) : ;; # ignore non key=value tokens
+          esac
+        done
+        NS="${NS:-milo-system}"
+        MILO_KUBECONFIG_SECRET_NAME="${MILO_KUBECONFIG_SECRET_NAME:-milo-controller-manager-kubeconfig}"
+        MILO_KUBECONFIG_SECRET_KEY="${MILO_KUBECONFIG_SECRET_KEY:-kubeconfig}"
+        # The in-container kubeconfig path is fixed in perf-cleanup-job.yaml, so no
+        # MILO_KUBECONFIG_PATH knob is resolved here.
+
+        if [ ! -f ".milo/kubeconfig" ]; then
+          echo "Error: .milo/kubeconfig not found. Run 'task dev:setup' first." >&2
+          exit 1
+        fi
+
+        echo "🔎 Discovering last test identifiers …"
+        # Allow override from CLI envs if ConfigMap isn't present
+        TEST_ID_CM=$(task test-infra:kubectl -- -n "${NS}" get cm perf-results -o jsonpath='{.data.test_id}' 2>/dev/null || true)
+        ORG_NAME_CM=$(task test-infra:kubectl -- -n "${NS}" get cm perf-results -o jsonpath='{.data.org_name}' 2>/dev/null || true)
+        TEST_ID="${TEST_ID:-$TEST_ID_CM}"
+        ORG_NAME="${ORG_NAME:-$ORG_NAME_CM}"
+        if [ -z "${TEST_ID}" ] || [ -z "${ORG_NAME}" ]; then
+          echo "No existing results found in namespace ${NS} (ConfigMap perf-results). Nothing to cleanup."
+          exit 0
+        fi
+
+        # A finished perf-cleanup Job lingers until its TTL expires; its pod
+        # template is immutable, so applying a new TEST_ID over it would be
+        # rejected (or silently reuse the old Complete condition). Remove it first.
+        task test-infra:kubectl -- -n "${NS}" delete job/perf-cleanup --ignore-not-found
+
+        echo "🚮 Launching cleanup Job for test ${TEST_ID} …"
+        sed \
+          -e "s/NAMESPACE_PLACEHOLDER/${NS}/g" \
+          -e "s/TEST_ID_PLACEHOLDER/${TEST_ID}/g" \
+          -e "s/ORG_NAME_PLACEHOLDER/${ORG_NAME}/g" \
+          -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \
+          -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \
+          test/performance/config/perf-cleanup-job.yaml | task test-infra:kubectl -- apply -f -
+
+        echo "⏳ Waiting for cleanup Job completion …"
+        if ! task test-infra:kubectl -- -n "${NS}" wait --for=condition=Complete job/perf-cleanup --timeout=30m; then
+          # Keep the Jobs and ConfigMaps in place so the failure can be inspected
+          # and the cleanup retried.
+          echo "Error: job/perf-cleanup did not complete; recent logs follow." >&2
+          task test-infra:kubectl -- -n "${NS}" logs job/perf-cleanup --tail=100 >&2 || true
+          exit 1
+        fi
+
+        echo "🧹 Removing runner artifacts (keeping downloaded results) …"
+        task test-infra:kubectl -- -n "${NS}" delete job/perf-runner --ignore-not-found
+        task test-infra:kubectl -- -n "${NS}" delete job/perf-cleanup --ignore-not-found
+        task test-infra:kubectl -- -n "${NS}" delete configmap perf-script --ignore-not-found
+        task test-infra:kubectl -- -n "${NS}" delete configmap perf-results --ignore-not-found
+        echo "✅ Cleanup complete."
diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml
@@ -135,7 +135,7 @@ spec:
             memory: 128Mi
           limits:
             cpu: 500m
-            memory: 512Mi
+            memory: 2G
         startupProbe:
           failureThreshold: 3
           httpGet:

diff --git a/config/components/prometheus-monitoring/servicemonitor-etcd.yaml b/config/components/prometheus-monitoring/servicemonitor-etcd.yaml
@@ -0,0 +1,18 @@
+# ServiceMonitor selecting Services in milo-system that carry the etcd
+# name/component labels, and scraping /metrics over plain HTTP on the Service
+# port named "client" every 15 seconds.
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: etcd-metrics
+  namespace: milo-system
+spec:
+  # Which Services to scrape.
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: etcd
+      app.kubernetes.io/component: etcd
+  # Restrict discovery to the namespace the monitor itself lives in.
+  namespaceSelector:
+    matchNames:
+      - milo-system
+  # How to scrape each selected Service.
+  endpoints:
+    - port: client
+      scheme: http
+      path: /metrics
+      interval: 15s
+
diff --git a/config/dependencies/etcd/helmrelease.yaml b/config/dependencies/etcd/helmrelease.yaml
@@ -35,7 +35,7 @@ spec:
     resources:
       limits:
         cpu: 500m
-        memory: 512Mi
+        memory: 2G
       requests:
         cpu: 200m
         memory: 256Mi

diff --git a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml
@@ -5,5 +5,5 @@ metadata:
 type: Opaque
 stringData:
   tokens.csv: |
-    test-admin-token,admin,1001,"system:masters"
+    test-admin-token,admin,admin,"system:masters"
     test-user-token,test-user,1002,"system:authenticated"
diff --git a/test/performance/README.md b/test/performance/README.md
@@ -0,0 +1,121 @@
+### Milo performance runner
+
+This performance suite provisions Milo/etcd service monitors and measures CPU/memory snapshots from VictoriaMetrics.
+
+Files and structure (paths relative to the repository root):
+- test/performance/scripts/perf_run.py: runner script executed inside a Kubernetes Job
+- test/performance/config/perf-runner-job.yaml: Job template for the run phase
+- test/performance/config/perf-cleanup-job.yaml: Job template for cleanup
+- test/performance/config/perf-runner-rbac.yaml: ServiceAccount/Role/RoleBinding used by the jobs
+
+#### Summary
+
+- Creates a Milo `Organization`, then N `Projects`, waits for all to be Ready, and times it.
+- Takes metrics snapshots before (baseline), after projects are ready, and optionally after per-project object creation.
+- Optionally creates M `Secrets` and K `ConfigMaps` in each Project (parallelized), then measures again.
+- Saves results to a ConfigMap and downloads a local HTML report and JSON.
+
+#### Prerequisites
+
+1) Bring up dev stack and observability:
+
+```bash
+task dev:setup && task dev:install-observability
+```
+
+2) Ensure a Milo kubeconfig secret exists in your cluster. By default the tasks mount `Secret/milo-controller-manager-kubeconfig` (key `kubeconfig`). You can override via env (see knobs below).
+
+#### How to run
+
+- Full run (org + projects + objects) with defaults:
+
+```bash
+task perf:run
+```
+
+- Projects-only (skip secrets/configmaps) and higher parallelism:
+
+```bash
+task perf:run -- RUN_OBJECTS_PHASE=false PROJECT_CONCURRENCY=10
+```
+
+- Clean up all resources from the last run:
+
+```bash
+task perf:cleanup
+```
+
+#### Outputs
+
+- In-cluster: ConfigMap `perf-results` in `NS` (default `milo-system`) with keys `results.json`, `report.html`, `test_id`, `org_name`.
+- Local: `reports/perf/<test_id>/results.json` and `report.html` downloaded by the task after the Job completes. The HTML report includes grouped bar charts (CPU cores and Memory MB) and per-project delta KPIs for apiserver and etcd.
+
+#### What the runner does
+
+1) Baseline: query VictoriaMetrics for Milo apiserver and etcd CPU/memory.
+2) Create Organization (no wait), then create N Projects, wait for all Projects Ready; record duration.
+3) Stabilize, then snapshot “after projects”.
+4) If enabled, create per-Project objects (Secrets/ConfigMaps) concurrently; stabilize, then snapshot “after secrets+configmaps”.
+
+Snapshots come from VictoriaMetrics using `container_cpu_usage_seconds_total` (rate) and `container_memory_working_set_bytes` (avg_over_time) for pods matching the configured namespace and pod name regexes.
+
+#### Configuration knobs (env vars)
+
+Pass these as `KEY=value` tokens after `--` on the command line (`task perf:run -- KEY=value ...`). Defaults are shown in parentheses.
+
+- Resource selection
+  - `NS` (milo-system): Namespace to run Job and store results ConfigMap
+  - `MILO_NAMESPACE` (milo-system): Namespace to measure apiserver/etcd pods
+  - `APISERVER_POD_REGEX` (milo-apiserver.*): Regex for apiserver pods
+  - `ETCD_POD_REGEX` (etcd.*): Regex for etcd pods
+
+- Metrics source (VictoriaMetrics)
+  - `VM_NAMESPACE` (telemetry-system)
+  - `VM_SERVICE_NAME` (vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack)
+  - `VM_PORT` (8428)
+  - `VM_BASE_URL` (optional override, e.g. http://hostname:8428). Default uses the in-cluster FQDN built from the three settings above: `http://<VM_SERVICE_NAME>.<VM_NAMESPACE>.svc.cluster.local:<VM_PORT>`.
+  - `MEASURE_WINDOW` (2m): Range window for rate/avg_over_time
+
+- Scale and workload
+  - `NUM_PROJECTS` (100)
+  - `RUN_OBJECTS_PHASE` (true): Toggle per-project Secrets/ConfigMaps phase
+  - `NUM_SECRETS_PER_PROJECT` (100)
+  - `NUM_CONFIGMAPS_PER_PROJECT` (100)
+  - `PROJECT_CONCURRENCY` (4): Number of projects processed in parallel when creating objects
+  - `OBJECT_CONCURRENCY` (8): Secrets/ConfigMaps parallelism inside each project
+
+- Stabilization windows
+  - `STABILIZE_SECONDS` (90): Sleep before snapshots after Projects and after Objects
+
+- Identity / scoping
+  - `ORG_NAME` (auto-generated): Name of Organization to create
+  - `MILO_KUBECONFIG_SECRET_NAME` (milo-controller-manager-kubeconfig): Secret containing Milo kubeconfig
+  - `MILO_KUBECONFIG_SECRET_KEY` (kubeconfig): Secret key with kubeconfig content
+  - `MILO_KUBECONFIG_PATH` (/work/milo-kubeconfig): In-container path to mount kubeconfig
+  - `AUTH_BEARER_TOKEN` (optional): Override token injected into kubeconfig user for troubleshooting
+
+#### Examples
+
+- Measure project-only impact:
+
+```bash
+task perf:run -- RUN_OBJECTS_PHASE=false STABILIZE_SECONDS=60 NUM_PROJECTS=200
+```
+
+- Heavier objects phase, more parallelism:
+
+```bash
+task perf:run -- NUM_SECRETS_PER_PROJECT=500 NUM_CONFIGMAPS_PER_PROJECT=500 PROJECT_CONCURRENCY=12 OBJECT_CONCURRENCY=24
+```
+
+- Point to a custom VictoriaMetrics endpoint:
+
+```bash
+task perf:run -- VM_BASE_URL=http://vm.my-domain.local:8428
+```
+
+- Use a specific Organization name:
+
+```bash
+task perf:run -- ORG_NAME=perf-cow
+```
diff --git a/test/performance/config/perf-cleanup-job.yaml b/test/performance/config/perf-cleanup-job.yaml
@@ -0,0 +1,52 @@
+# Job template for the perf cleanup phase. The *_PLACEHOLDER tokens are replaced
+# by sed before the manifest is applied. Every placeholder is quoted so that a
+# substituted value made only of digits, or one that looks like a boolean/null,
+# still parses as a YAML string, which is what these fields require.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: perf-cleanup
+  namespace: "NAMESPACE_PLACEHOLDER"
+spec:
+  # Finished Jobs are garbage-collected after 5 minutes; a failure is not retried.
+  ttlSecondsAfterFinished: 300
+  backoffLimit: 0
+  template:
+    spec:
+      serviceAccountName: perf-runner
+      restartPolicy: Never
+      containers:
+      - name: cleanup
+        image: python:3.11
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: TARGET_NAMESPACE
+          value: "NAMESPACE_PLACEHOLDER"
+        - name: RUN_MODE
+          value: "cleanup"
+        - name: TEST_ID
+          value: "TEST_ID_PLACEHOLDER"
+        - name: ORG_NAME
+          value: "ORG_NAME_PLACEHOLDER"
+        # Must match the mountPath of the milo-kubeconfig volume below.
+        - name: MILO_KUBECONFIG_PATH
+          value: "/work/milo-kubeconfig"
+        volumeMounts:
+        - name: script
+          mountPath: /work/perf_run.py
+          subPath: perf_run.py
+          readOnly: true
+        # Mounts a single key of the kubeconfig Secret as a file.
+        - name: milo-kubeconfig
+          mountPath: /work/milo-kubeconfig
+          subPath: "MILO_KUBECONFIG_KEY_PLACEHOLDER"
+          readOnly: true
+        command: ["bash","-lc"]
+        # Dependencies are installed at start-up; the script is then run unbuffered.
+        args:
+        - >-
+          python -m pip install --no-cache-dir kubernetes requests pyyaml &&
+          python -u /work/perf_run.py
+      volumes:
+      - name: script
+        configMap:
+          name: perf-script
+          # Octal file mode: read-only for everyone.
+          defaultMode: 0444
+      - name: milo-kubeconfig
+        secret:
+          secretName: "MILO_KUBECONFIG_SECRET_PLACEHOLDER"
+          # Octal file mode: readable by the owner only.
+          defaultMode: 0400
+
+