diff --git a/.gitignore b/.gitignore index afc947e9..059b9dba 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ test-results *.goe **/kuttl-test.json **/kubeconfig +AGENT.md +.gitlab-ci.yml \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 76b9b5ec..f4aff2b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,94 @@ handling on your side. ## Unreleased +## 2025-09-23 - Release v100.0.11 + +### Fixes/Improvements + +- health: Fail liveness on sustained port exhaustion to auto-restart pod + - /healthz returns 503 when EADDRNOTAVAIL errors exceed threshold within window + - Enabled by default; control via envs: + - `HEALTH_FAIL_ON_PORT_EXHAUSTION` (default: true) + - `PORT_EXHAUSTION_WINDOW` (default: 60s) + - `PORT_EXHAUSTION_THRESHOLD` (default: 8) +- registry: Record EADDRNOTAVAIL in HTTP transport; sliding-window tracker powers health gate + +### Notes + +- No manifest changes required; your existing livenessProbe on `/healthz` will now trigger restart when ports are exhausted. + +## 2025-09-23 - Release v100.0.10a + +### Fixes/Improvements + +- registry: Add periodic HTTP transport janitor to proactively close idle connections + - New helper `StartTransportJanitor(interval)` runs `CloseIdleConnections()` across all cached transports + - Wired into both `run` and `webhook` commands; stops gracefully on shutdown + - Default interval: 5m; can be tuned via `REGISTRY_TRANSPORT_JANITOR_INTERVAL` (set `0` to disable) + +### Rationale + +- Mitigates long-running cluster issues where outbound dials eventually fail with + `connect: cannot assign requested address` (ephemeral port/SNAT exhaustion) by + improving connection reuse and cleaning up idle sockets over time. + +### Notes + +- This complements existing changes: shared transports per registry, tuned + `MaxConnsPerHost`/idle pools/timeouts, and per‑registry in‑flight caps. + +## 2025-09-19 - Release v100.0.9a + +### Changes + +- registry: Increase default per-attempt timeouts to 60s for tag and manifest fetches +- registry: Make per-attempt timeouts env-tunable via `REGISTRY_TAG_TIMEOUT` and `REGISTRY_MANIFEST_TIMEOUT` +- http transport: Default `ResponseHeaderTimeout` raised to 60s; env-tunable via `REGISTRY_RESPONSE_HEADER_TIMEOUT` +- docs: Document new envs and updated defaults + +### Notes + +- If your registries are occasionally slow under load, you can set `REGISTRY_TAG_TIMEOUT=90s`, `REGISTRY_MANIFEST_TIMEOUT=90s`, and `REGISTRY_RESPONSE_HEADER_TIMEOUT=90s` to tolerate longer server delays. Consider also lowering concurrency and adding per‑registry rate limits. + +## 2025-09-19 - Release v100.0.8a + +### Changes + +- registry-scanner: JWT auth dedupe and retries stabilized; add metrics nil-guards to avoid panics in tests +- registry-scanner: Fix jittered exponential backoff math for retries +- tests(registry): Add JWT singleflight, different scopes, and retry/backoff tests; reset Prometheus registry per test + +### Notes + +- This release contains the JWT singleflight + authorizer transport cache improvements; ensure you update the embedded `registry-scanner` module. 
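The retries mentioned above follow a capped, jittered exponential backoff. A minimal sketch of that shape (illustrative only; function names and defaults here are not the actual registry-scanner code):

```go
package retry

import (
	"math/rand"
	"time"
)

// backoffDelay returns the pause before retrying the given 1-based attempt:
// exponential growth from base, capped at max, with random jitter so many
// clients failing at once do not retry in lockstep.
func backoffDelay(attempt int, base, max time.Duration) time.Duration {
	d := base << uint(attempt-1) // base * 2^(attempt-1)
	if d <= 0 || d > max {       // guard shift overflow and apply the cap
		d = max
	}
	// "equal jitter": sleep between 50% and 100% of the capped delay
	return d/2 + time.Duration(rand.Int63n(int64(d)/2+1))
}

// withRetries runs op up to attempts times, sleeping a jittered backoff
// between failures and returning the last error if every attempt fails.
func withRetries(attempts int, base, max time.Duration, op func() error) error {
	var err error
	for i := 1; i <= attempts; i++ {
		if err = op(); err == nil {
			return nil
		}
		if i < attempts {
			time.Sleep(backoffDelay(i, base, max))
		}
	}
	return err
}
```

With the JWT defaults above (7 attempts, 200ms base, 3s cap), this sketch pauses roughly 100–200ms, 200–400ms, 400–800ms, and so on, never exceeding 3s between attempts.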
+ +## 2025-09-19 - Release v100.0.7 + +### Fixes + +- fix(continuous): initialize Argo client when warm-up is disabled (`--warmup-cache=false`) to prevent panic in `runContinuousOnce` + +### Changes + +- scheduler: continuous tick cadence set to ~1s (from ~100ms) +- docs: clarify boolean flag usage for `--warmup-cache=false` + +### Notes + +- If you disable warm-up, continuous starts immediately; each ~1s tick lists and filters apps, then dispatches those due. Unsupported apps are skipped. + +## 2025-09-19 - Release v100.0.6a + +### Changes + +- scheduler(continuous): increase tick cadence from ~100ms to ~1s to reduce log noise and API/list pressure; no change to per-app `--interval` gating +- docs(readme): remove Mermaid diagram; add ASCII architecture; add rate limiting/backpressure section; add phase comparison table (stock vs tuned) + +### Notes + +- Behavior impact: only the scheduler’s discovery cadence changes; application dispatch still respects `--interval`, in-flight guards, fairness (LRU/fail-first, cooldown, per-repo-cap), and concurrency caps. +- Recommended: if startup delay is undesirable, run with `--warmup-cache=false`. + ### Upgrade notes (no really, you MUST read this) * **Attention**: By default, `argocd-image-updater` now uses the K8s API to retrieve applications, instead of the Argo CD API. Also, it is now recommended to install in the same namespace as Argo CD is running in (`argocd` by default). For existing installations, which are running in a dedicated namespace. @@ -29,6 +117,164 @@ handling on your side. * refactor: make argocd-image-updater-config volume mapping optional (#145) + +## 2025-09-18 - Release v100.0.5a + +### Fixes + +- fix(git): Prevent panic in batched writer when `GetCreds` is nil or write-back method is not Git + - Only enqueue batched writes when `wbc.Method == git` + - Guard in `repoWriter.commitBatch` for missing `GetCreds` (skip with log) + +### Tests + +- test(git): Strengthen batched writer test to set `Method: WriteBackGit` and provide `GetCreds` stub, so missing-GetCreds would fail tests + +### Notes + +- No flags or defaults changed; safe upgrade from v100.0.4a + +## 2025-09-18 - Release v100.0.4a + +### Changes + +- test(git): Add unit test verifying batched writer flushes per-branch (monorepo safety) +- fix(git): Guard `getWriteBackBranch` against nil Application source +- docs: Clarify `--max-concurrency=0` (auto) in README quick reference + +### Notes + +- All existing tests pass. No changes to defaults or flags. 
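For context on the v100.0.5a guard, the check amounts to enqueueing batched writes only for the Git write-back method and only when a credentials callback exists. A minimal sketch, with illustrative stand-in types rather than the project's actual structs:

```go
package writeback

import "log"

// WriteBackMethod and WriteBackConfig are simplified stand-ins for the real
// per-application write-back configuration.
type WriteBackMethod int

const (
	WriteBackApplication WriteBackMethod = iota
	WriteBackGit
)

type WriteBackConfig struct {
	Method   WriteBackMethod
	GetCreds func(repoURL string) (any, error) // nil means no credential source configured
}

// enqueueBatchedWrite reports whether the change was queued for the per-repo
// batched writer. Non-Git methods are never batched, and a missing GetCreds
// is skipped with a log instead of failing later in commitBatch.
func enqueueBatchedWrite(wbc *WriteBackConfig, app string) bool {
	if wbc.Method != WriteBackGit {
		return false
	}
	if wbc.GetCreds == nil {
		log.Printf("skipping batched git write for %s: no credentials source", app)
		return false
	}
	// ...append the parameter-update intent to the repo's queue here...
	return true
}
```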
+ +## 2025-09-18 - Release v100.0.3a + +### Highlights + +- Continuous mode: per-app scheduling with independent timers (no full-cycle waits) +- Auto concurrency: `--max-concurrency=0` computes workers from CPUs/apps +- Robust registry auth and I/O: singleflight + retries with backoff on `/jwt/auth`, tag and manifest operations +- Safer connection handling: transport reuse, tuned timeouts, per‑registry in‑flight caps +- Git efficiency: per‑repo batched writer + retries +- Deep metrics: apps, cycles, registry, JWT + +### New features + +- feat(mode): `--mode=continuous` (default remains `cycle`) +- feat(concurrency): `--max-concurrency=0` for auto sizing +- feat(schedule): LRU / fail-first with `--schedule`; fairness with `--per-repo-cap`, `--cooldown` +- feat(auth): JWT `/jwt/auth` retries with backoff (singleflight dedupe) + - Env: `REGISTRY_JWT_ATTEMPTS` (default 7), `REGISTRY_JWT_RETRY_BASE` (200ms), `REGISTRY_JWT_RETRY_MAX` (3s) +- feat(metrics): Per-application timings and state + - `argocd_image_updater_application_update_duration_seconds{application}` + - `argocd_image_updater_application_last_attempt_timestamp{application}` + - `argocd_image_updater_application_last_success_timestamp{application}` + - `argocd_image_updater_images_considered_total{application}` + - `argocd_image_updater_images_skipped_total{application}` + - `argocd_image_updater_scheduler_skipped_total{reason}` +- feat(metrics): Cycle timing + - `argocd_image_updater_update_cycle_duration_seconds` + - `argocd_image_updater_update_cycle_last_end_timestamp` +- feat(metrics): Registry visibility + - `argocd_image_updater_registry_in_flight_requests{registry}` + - `argocd_image_updater_registry_request_duration_seconds{registry}` + - `argocd_image_updater_registry_http_status_total{registry,code}` + - `argocd_image_updater_registry_request_retries_total{registry,op}` + - `argocd_image_updater_registry_errors_total{registry,kind}` +- feat(metrics): Singleflight effectiveness + - `argocd_image_updater_singleflight_leaders_total{kind}` + - `argocd_image_updater_singleflight_followers_total{kind}` +- feat(metrics): JWT visibility + - `argocd_image_updater_registry_jwt_auth_requests_total{registry,service,scope}` + - `argocd_image_updater_registry_jwt_auth_errors_total{registry,service,scope,reason}` + - `argocd_image_updater_registry_jwt_auth_duration_seconds{registry,service,scope}` + - `argocd_image_updater_registry_jwt_token_ttl_seconds{registry,service,scope}` + +### Improvements + +- perf(registry): HTTP transport reuse; tuned `MaxIdleConns`, `MaxIdleConnsPerHost`, `MaxConnsPerHost`; response and handshake timeouts +- perf(registry): Per‑registry in‑flight cap to prevent connection storms +- resiliency(registry): Jittered retries for tags/manifests; `/jwt/auth` retries with backoff +- perf(git): Batched per‑repo writer; retries for fetch/shallow-fetch/push +- sched: Fairness via LRU/fail-first, cooldown, and per-repo caps + +### Defaults enabled (no flags) + +- Transport reuse and tuned timeouts +- Per‑registry in‑flight cap (default 15) +- Authorizer cache per (registry, repo) +- Singleflight on tags, manifests, and `/jwt/auth` +- Retries: tags/manifests (3x), JWT auth (defaults above) +- Git retries (env-overridable); Batched writer (disable via `GIT_BATCH_DISABLE=true`) + +### Docs + +- docs(install): Performance flags and defaults (continuous mode, auto concurrency, JWT retry envs) +- docs(metrics): Expanded metrics section + +### Tests + +- test: Unit tests for transport caching, metrics wrappers, continuous 
scheduler basics, and end-to-end build + +### Known issues + +- Under very high concurrency and bursty load, upstream registry/SNAT limits may still cause intermittent timeouts. The new caps, retries, and singleflight significantly reduce impact; tune per‑registry limits and consider HTTP/2 where available. + +## 2025-09-17 - Release v99.9.9 - 66de072 + +### New features + +* feat: Reuse HTTP transports for registries with keep-alives and timeouts +* feat: Initialize registry refresh-token map to enable token reuse +* feat: Add Makefile `DOCKER` variable to support `podman` + +### Improvements + +* perf: Cache transports per registry+TLS mode; add sensible connection/timeouts +* resiliency: Retry/backoff for registry tag listing +* resiliency: Retry/backoff for git fetch/shallow-fetch/push during write-back + +### Tests/Docs + +* test: Add unit tests for transport caching and token map init +* docs: Requirements/notes updates + +### Upgrade notes + +* None + +### Bug fixes + +* None + +### Bugs + +* Under very high concurrency (300–500) after 2–3 hours, nodes may hit ephemeral port exhaustion causing registry dials to fail: + + Example error observed: + + `dial tcp 10.2.163.141:5000: connect: cannot assign requested address` + + Notes: + - This typically manifests across all registries simultaneously under heavy outbound connection churn. + - Root cause is excessive parallel dials combined with short‑lived connections (TIME_WAIT buildup), not a specific registry outage. + - Mitigations available in v100.0.0a: larger keep‑alive pools, lower MaxConnsPerHost, and ability to close idle on cache clear. Operational mitigations: reduce updater concurrency and/or per‑registry limits (e.g., 500→250; 50 rps→20–30 rps) while investigating. + + Details: + - Old ports are “released” only after TIME_WAIT (2MSL). With HTTP/1.1 and big bursts, you create more concurrent outbound sockets than the ephemeral range can recycle before TIME_WAIT expires, so you hit “cannot assign requested address” even though old sockets eventually close. + - Why it still happens under 250/100 RPS: + - Each new dial consumes a unique local ephemeral port to the same dst tuple. TIME_WAIT lasts ~60–120s (kernel dependent). Bursty concurrency + short interval means you outpace reuse. + - Go HTTP/1.1 doesn’t pipeline; reuse works only if there’s an idle kept‑alive socket. If many goroutines need sockets at once, you dial anyway. + - Often compounded by SNAT limits at the node (Kubernetes egress): per‑dst NAT port cap can exhaust even faster. + - How to confirm quickly: + - Check TIME_WAIT to the registry IP:port: `ss -antp | grep :5000 | grep TIME_WAIT | wc -l` + - Check ephemeral range: `sysctl net.ipv4.ip_local_port_range` + - In Kubernetes, inspect node SNAT usage (some clouds cap SNAT ports per node/destination). + - What fixes it (software‑side, regardless of kernel/NAT tuning): + - Add a hard per‑registry in‑flight cap (e.g., 10–15) so requests queue instead of dialing new sockets. + - Lower `MaxConnsPerHost` further (e.g., 15). Keep large idle pools to maximize reuse. + - Add jitter to scheduling (avoid synchronized bursts); consider 30s interval over 15s. + - If the registry supports HTTP/2 over TLS, H2 multiplexing drastically reduces sockets. 
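To make the software-side fixes listed above concrete, here is a minimal sketch of a shared, tuned transport plus a per-registry in-flight cap; the values mirror the defaults documented for this fork, but the structure is illustrative rather than the exact client code:

```go
package registryclient

import (
	"net/http"
	"time"

	"golang.org/x/sync/semaphore"
)

// newTunedTransport favors connection reuse over new dials: generous idle
// pools, a hard cap on parallel sockets per host, and bounded phase timeouts.
func newTunedTransport() *http.Transport {
	return &http.Transport{
		ForceAttemptHTTP2:     true, // multiplex over few sockets when the registry speaks HTTP/2
		MaxIdleConns:          1000,
		MaxIdleConnsPerHost:   200,
		MaxConnsPerHost:       30, // lower further if ephemeral ports are still exhausted
		IdleConnTimeout:       90 * time.Second,
		TLSHandshakeTimeout:   10 * time.Second,
		ResponseHeaderTimeout: 60 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
	}
}

// limitedClient queues requests behind a weighted semaphore so bursts wait for
// a free slot (and usually an idle keep-alive socket) instead of dialing anew.
type limitedClient struct {
	c   *http.Client
	sem *semaphore.Weighted
}

func newLimitedClient(maxInFlight int64) *limitedClient {
	return &limitedClient{
		c:   &http.Client{Transport: newTunedTransport()},
		sem: semaphore.NewWeighted(maxInFlight),
	}
}

func (l *limitedClient) Do(req *http.Request) (*http.Response, error) {
	if err := l.sem.Acquire(req.Context(), 1); err != nil {
		return nil, err
	}
	defer l.sem.Release(1)
	return l.c.Do(req)
}
```

With an in-flight cap of 10–15 per registry, excess requests block briefly in Acquire rather than opening new sockets, which keeps TIME_WAIT buildup bounded.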
+ ## 2020-12-06 - Release v0.8.0 ### Upgrade notes (no really, you MUST read this) diff --git a/Dockerfile b/Dockerfile index 39482c52..2a0ea6f5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,10 @@ FROM golang:1.24 AS builder RUN mkdir -p /src/argocd-image-updater WORKDIR /src/argocd-image-updater +# copy the entire repo first so local replaces (./registry-scanner) exist in context +COPY . . # cache dependencies as a layer for faster rebuilds -COPY go.mod go.sum ./ RUN go mod download -COPY . . RUN mkdir -p dist && \ make controller diff --git a/Makefile b/Makefile index e4d7090b..0c80921c 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,9 @@ ARCH?=$(shell go env GOARCH) OUTDIR?=dist BINNAME?=argocd-image-updater +# Container runtime (override with DOCKER=podman) +DOCKER?=docker + CURRENT_DIR=$(shell pwd) VERSION=$(shell cat ${CURRENT_DIR}/VERSION) GIT_COMMIT=$(shell git rev-parse HEAD) @@ -87,14 +90,14 @@ controller: .PHONY: image image: clean-image - docker build \ + ${DOCKER} build \ -t ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \ --pull \ . .PHONY: multiarch-image multiarch-image: - docker buildx build \ + ${DOCKER} buildx build \ -t ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \ --progress plain \ --pull \ @@ -103,7 +106,7 @@ multiarch-image: .PHONY: multiarch-image-push multiarch-image-push: - docker buildx build \ + ${DOCKER} buildx build \ -t ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \ --progress plain \ --pull \ @@ -113,7 +116,7 @@ multiarch-image-push: .PHONY: image-push image-push: image - docker push ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} + ${DOCKER} push ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} .PHONY: release-binaries release-binaries: @@ -130,10 +133,10 @@ release-binaries: .PHONY: extract-binary extract-binary: - docker rm argocd-image-updater-${IMAGE_TAG} || true - docker create --name argocd-image-updater-${IMAGE_TAG} ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} - docker cp argocd-image-updater-${IMAGE_TAG}:/usr/local/bin/argocd-image-updater /tmp/argocd-image-updater_${IMAGE_TAG}_linux-amd64 - docker rm argocd-image-updater-${IMAGE_TAG} + ${DOCKER} rm argocd-image-updater-${IMAGE_TAG} || true + ${DOCKER} create --name argocd-image-updater-${IMAGE_TAG} ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} + ${DOCKER} cp argocd-image-updater-${IMAGE_TAG}:/usr/local/bin/argocd-image-updater /tmp/argocd-image-updater_${IMAGE_TAG}_linux-amd64 + ${DOCKER} rm argocd-image-updater-${IMAGE_TAG} .PHONY: lint lint: @@ -148,7 +151,7 @@ codegen: manifests .PHONY: run-test run-test: - docker run -v $(HOME)/.kube:/kube --rm -it \ + ${DOCKER} run -v $(HOME)/.kube:/kube --rm -it \ -e ARGOCD_TOKEN \ ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \ --kubeconfig /kube/config \ @@ -157,5 +160,5 @@ run-test: .PHONY: serve-docs serve-docs: - docker run ${MKDOCS_RUN_ARGS} --rm -it -p 8000:8000 -v ${CURRENT_DIR}:/docs ${MKDOCS_DOCKER_IMAGE} serve -a 0.0.0.0:8000 + ${DOCKER} run ${MKDOCS_RUN_ARGS} --rm -it -p 8000:8000 -v ${CURRENT_DIR}:/docs ${MKDOCS_DOCKER_IMAGE} serve -a 0.0.0.0:8000 diff --git a/README.md b/README.md index 085d85f4..dc37a367 100644 --- a/README.md +++ b/README.md @@ -111,3 +111,202 @@ see how we can make this happen. There is [an open proposal](https://github.com/argoproj/argo-cd/issues/7385) to migrate this project into the `argoproj` org (out of the `argoproj-labs` org) and include it in the installation of Argo CD. 
+ +## Engineering notes (recent changes) + +- Registry client hardening + - HTTP transport reuse per registry with sane timeouts (keep-alives, capped TLS and response header timeouts) to cut connection churn under load. + - Singleflight-style deduplication and jittered retries (tags, manifests) with per-attempt deadlines to avoid thundering herds and reduce /jwt/auth pressure. + +- Git write-back throughput + - Per-repo serialization to eliminate races in monorepos, plus a batched writer that coalesces multiple intents into a single commit/push per repo/branch. + - Multi-branch grouping: intents for different write branches never mix; each branch flushes independently. + - Logs reflect queued writes: look for "Queuing N parameter update(s) … (git write pending)" and the subsequent commit/push logs. + +- Scheduling and fairness + - Optional scheduler flags to prioritize apps: `--schedule` (default|lru|fail-first), `--cooldown` (deprioritize recently successful apps), and `--per-repo-cap` (cap updates per repo per cycle). + - Goal: prevent a hot monorepo from starving others while keeping high concurrency. + +- Operational guidance + - Concurrency: set `--max-concurrency` roughly ≥ number of active repos; monorepos serialize on their writer, others proceed in parallel. + - Registry RPS: tune `limit` in `registries.conf` (e.g., 30–50 RPS) and monitor latency/429s. + - Monorepos: prefer per-app write branches or rely on batching to reduce fetch/commit/push churn. + +Flags added +- `--schedule` (env: IMAGE_UPDATER_SCHEDULE): default|lru|fail-first +- `--cooldown` (env: IMAGE_UPDATER_COOLDOWN): duration (e.g., 30s) +- `--per-repo-cap` (env: IMAGE_UPDATER_PER_REPO_CAP): integer (0 = unlimited) + +Notes +- For tests or legacy behavior, set `GIT_BATCH_DISABLE=true` to perform immediate (non-batched) write-back. + +## Runtime limits and tunables (quick reference) + +- Max concurrency: `--max-concurrency` (env `IMAGE_UPDATER_MAX_CONCURRENCY`), default 10; set `0` for auto sizing +- Interval: `--interval` (env `IMAGE_UPDATER_INTERVAL`), default 2m +- Scheduler: `--schedule` (env `IMAGE_UPDATER_SCHEDULE`), default `default` (also `lru|fail-first`) +- Cooldown: `--cooldown` (env `IMAGE_UPDATER_COOLDOWN`), default 0 +- Per-repo cap: `--per-repo-cap` (env `IMAGE_UPDATER_PER_REPO_CAP`), default 0 +- Git retries (env): `ARGOCD_GIT_ATTEMPTS_COUNT`=3, `ARGOCD_GIT_RETRY_DURATION`=500ms, `ARGOCD_GIT_RETRY_MAX_DURATION`=10s, `ARGOCD_GIT_RETRY_FACTOR`=2 +- Registry rate limit: `limit` in `registries.conf` per registry, default 20 rps if unspecified +- HTTP transport (per registry, defaults): MaxIdleConns=1000, MaxIdleConnsPerHost=200, MaxConnsPerHost=30, IdleConnTimeout=90s, TLSHandshakeTimeout=10s, ResponseHeaderTimeout=60s, ExpectContinueTimeout=1s, HTTP/2 on HTTPS via ALPN + +### What each tunable does and affects + +- max-concurrency (env: `IMAGE_UPDATER_MAX_CONCURRENCY`) + - What: Number of parallel application update workers. + - Affects: CPU, concurrent registry and Git load. Higher = faster coverage but more burst pressure. + - Guidance: 100–250 for large fleets; raise cautiously per registry/Git capacity. + +- interval (env: `IMAGE_UPDATER_INTERVAL`) + - What: Delay between full update cycles (0 = run once). + - Affects: How often apps are reconsidered; shorter intervals increase burstiness and port churn. + - Guidance: 30–60s is a good balance; 15s can cause synchronized spikes. + +- schedule (env: `IMAGE_UPDATER_SCHEDULE`) + - What: Processing order: `default` | `lru` | `fail-first`. 
+ - Affects: Fairness and recovery. `lru` prioritizes least-recently updated; `fail-first` attacks recent failures first. + +- cooldown (env: `IMAGE_UPDATER_COOLDOWN`) + - What: Deprioritizes apps updated within this duration. + - Affects: Reduces thrash on “hot” apps; spreads work evenly. + +- per-repo-cap (env: `IMAGE_UPDATER_PER_REPO_CAP`) + - What: Max apps per repository processed per cycle (0 = unlimited). + - Affects: Prevents a monorepo from monopolizing a cycle; improves fleet fairness. + +- Git retry env (`ARGOCD_GIT_ATTEMPTS_COUNT`, `ARGOCD_GIT_RETRY_DURATION`, `ARGOCD_GIT_RETRY_MAX_DURATION`, `ARGOCD_GIT_RETRY_FACTOR`) + - What: Exponential backoff for fetch/shallow-fetch/push. + - Affects: Resilience vs. latency on transient failures. + - Defaults: attempts=3; base=500ms; max=10s; factor=2. + +- registries.conf `limit` (per registry) + - What: Requests-per-second cap to a registry. + - Affects: Upstream load and rate-limit avoidance. Higher = faster metadata fetches; too high = 429/timeouts. + - Guidance: 30–80 RPS typical; tune to registry capacity. + +- HTTP transport defaults (per registry) + - MaxIdleConns / MaxIdleConnsPerHost: Size of keep-alive pools; larger pools reduce new dials/TLS handshakes. + - MaxConnsPerHost: Cap on parallel sockets to a host; lower values reduce ephemeral port exhaustion. + - Timeouts: `IdleConnTimeout`, `TLSHandshakeTimeout`, `ResponseHeaderTimeout`, `ExpectContinueTimeout` prevent hangs and free stale resources. + - HTTP/2 (HTTPS + ALPN): Multiplexes many requests over few sockets, drastically reducing socket count under load. + +- Combined effects (scheduler + limits) + - Higher `--max-concurrency` with `--per-repo-cap` and `--cooldown` improves fleet throughput and fairness while avoiding monorepo starvation. + +## Rate limiting and backpressure + +This fork adds layered controls to protect upstreams and the process under load: + +- Global worker pool + - Controlled by `--max-concurrency` (or auto with `0`). Limits total concurrent app updates. + +- Per-registry request rate (token bucket) + - Configured via `registries.conf` `limit` per registry (requests/second). + - Requests beyond the budget are delayed locally to smooth spikes; reduces 429/timeouts. + +- Per-registry in-flight cap + - Socket-level caps via HTTP transport (`MaxConnsPerHost`) plus internal semaphores where applicable. + - Prevents connection storms and ephemeral port exhaustion. + +- Singleflight de-duplication + - Tags/manifests and JWT auth are de-duplicated. One leader performs the call, followers wait for the result. + - Cuts redundant upstream traffic during bursts. + +- Jittered exponential backoff retries + - Applied to tags/manifests and JWT auth. Short, bounded retries with jitter to avoid synchronization. + +- Git backpressure (batched writer) + - Per-repo queue serializes commit/push; multiple app intents per branch coalesce into one commit. + - Retries with backoff for transient fetch/push errors. + +- Fair scheduling + - `--per-repo-cap` limits apps from one repo per cycle; `--cooldown` deprioritizes recently updated apps. + +Observability: +- Metrics expose queue lengths, in-flight counts, retry counts, singleflight leader/follower, and durations to tune the above without guesswork. + +## ASCII architecture (fork-specific) + +The same runtime, depicted in ASCII for environments without Mermaid rendering. 
+ +``` + +-----------------------------------------+ + | Scheduler | + |-----------------------------------------| + flags: --mode=continuous | per-app timers (interval) | + --max-concurrency=0 | auto concurrency sizing | + --schedule=lru|fail | LRU / Fail-first prioritization | + --cooldown=30s | cooldown to dampen hot apps | + --per-repo-cap=20 | fairness cap per Git repo per pass | + +--------------------+--------------------+ + | + v + +--------------+--------------+ + | Worker Pool | + +--------------+--------------+ + | + v + +-----------+-----------+ + | Worker (per app) | + |----------------------| + | 1) Compute images | + | 2) Registry ops | + | 3) Patch spec in mem | + | 4a) WriteBack=Git -> |----+ + | enqueue intent | | + | 4b) WriteBack=ArgoCD | | + | Update via API | | + +----------------------+ | + | + v + +-----------------------------------+ | + | Registry Client (per endpoint) | | + |-----------------------------------| | + | Transport cache (keep-alive) | | + | Sane timeouts, MaxConnsPerHost | | + | Per-reg in-flight cap (queue) | | + | Singleflight: tags/manifests | | + | JWT auth: singleflight + retries | | + | HTTP/2 over TLS when available | | + +------------------+----------------+ | + | | + v | + +------------+-----------+ | + | Remote registry/API | | + +------------------------+ | + | + v + +-------------------------------------------------------------+ + | Per-repo Batched Git Writer | + |-------------------------------------------------------------| + | intent queue (repo) -> group by branch -> commitBatch | + | fetch/checkout/commit/push (retries/backoff) | + +----------------------------+--------------------------------+ + | + v + +------+------+ + | Remote | + | Git | + +-------------+ + +Observability: +- Metrics: app timings (last attempt/success, durations), cycle duration, registry in-flight/duration/status/retries/errors, + JWT auth (requests/errors/duration/TTL), singleflight leader/follower counts. +- Logs: startup settings; per-app "continuous: start/finished"; queued write-backs; Git and registry error details. 
+``` + +### Phase comparison: stock vs our tuned configuration + +| Phase | Stock defaults (cycle mode, basic concurrency) | Tuned configuration (continuous, auto concurrency, LRU, cooldown, per-repo-cap, singleflight, retries) | +| --- | --- | --- | +| Startup | Minimal logging; default transports; limited tuning | Logs full settings; shared transports with timeouts; metrics/health; optional warmup | +| Scheduling | Global pass every `--interval`; fixed concurrency | Lightweight pass ~1s; per-app due check against `--interval`; auto concurrency sizing | +| Discovery/filter | List apps every pass; warn on unsupported each pass | Same listing; will throttle/dedupe repeated unsupported warnings; same filters | +| Prioritization | Default order | LRU or Fail-first; cooldown deprioritizes recent successes; per-repo-cap fairness | +| Dispatch | Semaphore up to `--max-concurrency` | Same guard; plus per-app in-flight guard to avoid double dispatch in continuous | +| Registry IO | Direct calls; limited retry semantics | Per-reg RPS limiter and in-flight cap; singleflight for tags/manifests and JWT; jittered backoff retries; shared transports; HTTP/2 | +| Update decision | Compare live vs candidate; may skip | Same logic, but less flap due to fairness/cooldown | +| Write-back | Immediate Git per app (can thrash in monorepos) | Per-repo batched writer; group by branch; one commit/push per batch; retries | +| Non-Git write-back | ArgoCD `UpdateSpec` | Same, with conflict-retry backoff | +| Observability | Basic metrics/logs | Expanded metrics (JWT, singleflight, durations); per-app continuous start/finish logs; queue and retry metrics | diff --git a/VERSION b/VERSION index d45dc58d..726abb83 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -99.9.9 \ No newline at end of file +100.0.11 \ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index b8750b2b..35c1d4e0 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -52,6 +52,11 @@ type ImageUpdaterConfig struct { DisableKubeEvents bool GitCreds git.CredsStore EnableWebhook bool + // Scheduler options + Schedule string + Cooldown time.Duration + PerRepoCap int + Mode string } // newRootCommand implements the root command of argocd-image-updater diff --git a/cmd/run.go b/cmd/run.go index 1113c2d6..96be2644 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -10,6 +10,8 @@ import ( "sync" "text/template" "time" + "sort" + "runtime" "github.com/argoproj-labs/argocd-image-updater/pkg/argocd" "github.com/argoproj-labs/argocd-image-updater/pkg/common" @@ -32,6 +34,42 @@ import ( v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// allow tests to stub the application update function +var updateAppFn = argocd.UpdateApplication +var contState *argocd.SyncIterationState +var contMu sync.Mutex +var contInFlight = map[string]bool{} + +// orderApplications reorders apps by schedule policy: lru (least-recent success first), +// fail-first (recent failures first), with optional cooldown to deprioritize recently +// successful apps. 
+func orderApplications(names []string, appList map[string]argocd.ApplicationImages, state *argocd.SyncIterationState, cfg *ImageUpdaterConfig) []string { + stats := state.GetStats() + type item struct{ name string; score int64 } + items := make([]item, 0, len(names)) + now := time.Now() + for _, n := range names { + s := stats[n] + score := int64(0) + switch cfg.Schedule { + case "lru": + // Older success => higher priority (lower score) + if !s.LastSuccess.IsZero() { score -= int64(now.Sub(s.LastSuccess).Milliseconds()) } + case "fail-first": + score += int64(s.FailCount) * 1_000_000 // dominate by failures + if !s.LastAttempt.IsZero() { score -= int64(now.Sub(s.LastAttempt).Milliseconds()) } + } + if cfg.Cooldown > 0 && !s.LastSuccess.IsZero() && now.Sub(s.LastSuccess) < cfg.Cooldown { + score -= 1 // slight deprioritization + } + items = append(items, item{name: n, score: score}) + } + sort.Slice(items, func(i, j int) bool { return items[i].score > items[j].score }) + out := make([]string, len(items)) + for i := range items { out[i] = items[i].name } + return out +} + // newRunCommand implements "run" command func newRunCommand() *cobra.Command { var cfg *ImageUpdaterConfig = &ImageUpdaterConfig{} @@ -50,17 +88,17 @@ func newRunCommand() *cobra.Command { return err } - if once { + if once { cfg.CheckInterval = 0 cfg.HealthPort = 0 } - // Enforce sane --max-concurrency values - if cfg.MaxConcurrency < 1 { - return fmt.Errorf("--max-concurrency must be greater than 1") - } + // Enforce sane --max-concurrency values (0=auto) + if cfg.MaxConcurrency < 0 { + return fmt.Errorf("--max-concurrency cannot be negative (0=auto)") + } - log.Infof("%s %s starting [loglevel:%s, interval:%s, healthport:%s]", + log.Infof("%s %s starting [loglevel:%s, interval:%s, healthport:%s]", version.BinaryName(), version.Version(), strings.ToUpper(cfg.LogLevel), @@ -110,7 +148,12 @@ func newRunCommand() *cobra.Command { } } - if cfg.CheckInterval > 0 && cfg.CheckInterval < 60*time.Second { + // Start HTTP transport janitor to close idle connections periodically + // Interval is tunable via REGISTRY_TRANSPORT_JANITOR_INTERVAL (0 disables) + janitorInterval := env.ParseDurationFromEnv("REGISTRY_TRANSPORT_JANITOR_INTERVAL", 5*time.Minute, 0, 24*time.Hour) + stopJanitor := registry.StartTransportJanitor(janitorInterval) + + if cfg.CheckInterval > 0 && cfg.CheckInterval < 60*time.Second && cfg.Mode != "continuous" { log.Warnf("Check interval is very low - it is not recommended to run below 1m0s") } @@ -143,8 +186,21 @@ func newRunCommand() *cobra.Command { cfg.ClientOpts.Plaintext, ) + // Log effective runtime settings + log.Infof("Runtime settings: mode=%s interval=%s max_concurrency=%d schedule=%s cooldown=%s per_repo_cap=%d health_port=%d metrics_port=%d registries_conf=%s", + cfg.Mode, + getPrintableInterval(cfg.CheckInterval), + cfg.MaxConcurrency, + cfg.Schedule, + cfg.Cooldown.String(), + cfg.PerRepoCap, + cfg.HealthPort, + cfg.MetricsPort, + cfg.RegistriesConf, + ) + // Initialize metrics before starting the metrics server or using any counters - metrics.InitMetrics() + metrics.InitMetrics() // Health server will start in a go routine and run asynchronously var hsErrCh chan error @@ -213,6 +269,8 @@ func newRunCommand() *cobra.Command { harborHandler := webhook.NewHarborWebhook(webhookCfg.HarborSecret) handler.RegisterHandler(harborHandler) + // GitLab Container Registry webhooks are not supported upstream + quayHandler := webhook.NewQuayWebhook(webhookCfg.QuaySecret) handler.RegisterHandler(quayHandler) @@ 
-252,7 +310,24 @@ func newRunCommand() *cobra.Command { // This is our main loop. We leave it only when our health probe server // returns an error. - for { + // Ensure Argo client is initialized when skipping warmup (e.g., --warmup-cache=false) in continuous mode + if cfg.Mode == "continuous" && cfg.ArgoClient == nil { + var err error + switch cfg.ApplicationsAPIKind { + case applicationsAPIKindK8S: + cfg.ArgoClient, err = argocd.NewK8SClient(cfg.KubeClient, &argocd.K8SClientOptions{AppNamespace: cfg.AppNamespace}) + case applicationsAPIKindArgoCD: + cfg.ArgoClient, err = argocd.NewAPIClient(&cfg.ClientOpts) + default: + err = fmt.Errorf("application api '%s' is not supported", cfg.ApplicationsAPIKind) + } + if err != nil { + log.Errorf("Could not initialize Argo client: %v", err) + return err + } + } + + for { select { case err := <-hsErrCh: if err != nil { @@ -266,6 +341,8 @@ func newRunCommand() *cobra.Command { log.Errorf("Error stopping webhook server: %v", err) } } + // Stop janitor before exit + stopJanitor() return nil case err := <-msErrCh: if err != nil { @@ -279,30 +356,40 @@ func newRunCommand() *cobra.Command { log.Errorf("Error stopping webhook server: %v", err) } } + // Stop janitor before exit + stopJanitor() return nil case err := <-whErrCh: log.Errorf("Webhook server exited with error: %v", err) + // Stop janitor before exit + stopJanitor() return nil default: - if lastRun.IsZero() || time.Since(lastRun) > cfg.CheckInterval { - result, err := runImageUpdater(cfg, false) - if err != nil { - log.Errorf("Error: %v", err) - } else { - log.Infof("Processing results: applications=%d images_considered=%d images_skipped=%d images_updated=%d errors=%d", - result.NumApplicationsProcessed, - result.NumImagesConsidered, - result.NumSkipped, - result.NumImagesUpdated, - result.NumErrors) + if cfg.Mode == "continuous" { + runContinuousOnce(cfg) + // continuous scheduler loops internally; tick at ~1s + time.Sleep(1 * time.Second) + } else { + if lastRun.IsZero() || time.Since(lastRun) > cfg.CheckInterval { + result, err := runImageUpdater(cfg, false) + if err != nil { + log.Errorf("Error: %v", err) + } else { + log.Infof("Processing results: applications=%d images_considered=%d images_skipped=%d images_updated=%d errors=%d", + result.NumApplicationsProcessed, + result.NumImagesConsidered, + result.NumSkipped, + result.NumImagesUpdated, + result.NumErrors) + } + lastRun = time.Now() } - lastRun = time.Now() } } if cfg.CheckInterval == 0 { break } - time.Sleep(100 * time.Millisecond) + time.Sleep(1 * time.Second) } log.Infof("Finished.") return nil @@ -327,7 +414,7 @@ func newRunCommand() *cobra.Command { runCmd.Flags().IntVar(&cfg.MetricsPort, "metrics-port", 8081, "port to start the metrics server on, 0 to disable") runCmd.Flags().BoolVar(&once, "once", false, "run only once, same as specifying --interval=0 and --health-port=0") runCmd.Flags().StringVar(&cfg.RegistriesConf, "registries-conf-path", defaultRegistriesConfPath, "path to registries configuration file") - runCmd.Flags().IntVar(&cfg.MaxConcurrency, "max-concurrency", 10, "maximum number of update threads to run concurrently") + runCmd.Flags().IntVar(&cfg.MaxConcurrency, "max-concurrency", 10, "maximum number of update threads to run concurrently (0=auto)") runCmd.Flags().StringVar(&cfg.ArgocdNamespace, "argocd-namespace", "", "namespace where ArgoCD runs in (current namespace by default)") runCmd.Flags().StringVar(&cfg.AppNamespace, "application-namespace", v1.NamespaceAll, "namespace where Argo Image Updater will manage 
applications (all namespaces by default)") @@ -337,6 +424,10 @@ func newRunCommand() *cobra.Command { runCmd.Flags().StringVar(&cfg.AppLabel, "match-application-label", "", "label selector to match application labels against. DEPRECATED: this flag will be removed in a future version.") runCmd.Flags().BoolVar(&warmUpCache, "warmup-cache", true, "whether to perform a cache warm-up on startup") + runCmd.Flags().StringVar(&cfg.Schedule, "schedule", env.GetStringVal("IMAGE_UPDATER_SCHEDULE", "default"), "scheduling policy: default|lru|fail-first") + runCmd.Flags().DurationVar(&cfg.Cooldown, "cooldown", env.GetDurationVal("IMAGE_UPDATER_COOLDOWN", 0), "deprioritize apps updated within this duration") + runCmd.Flags().IntVar(&cfg.PerRepoCap, "per-repo-cap", env.ParseNumFromEnv("IMAGE_UPDATER_PER_REPO_CAP", 0, 0, 100000), "max updates per repo per cycle (0 = unlimited)") + runCmd.Flags().StringVar(&cfg.Mode, "mode", env.GetStringVal("IMAGE_UPDATER_MODE", "cycle"), "execution mode: cycle|continuous") runCmd.Flags().StringVar(&cfg.GitCommitUser, "git-commit-user", env.GetStringVal("GIT_COMMIT_USER", "argocd-image-updater"), "Username to use for Git commits") runCmd.Flags().StringVar(&cfg.GitCommitMail, "git-commit-email", env.GetStringVal("GIT_COMMIT_EMAIL", "noreply@argoproj.io"), "E-Mail address to use for Git commits") runCmd.Flags().StringVar(&cfg.GitCommitSigningKey, "git-commit-signing-key", env.GetStringVal("GIT_COMMIT_SIGNING_KEY", ""), "GnuPG key ID or path to Private SSH Key used to sign the commits") @@ -352,6 +443,7 @@ func newRunCommand() *cobra.Command { runCmd.Flags().StringVar(&webhookCfg.QuaySecret, "quay-webhook-secret", env.GetStringVal("QUAY_WEBHOOK_SECRET", ""), "Secret for validating Quay webhooks") runCmd.Flags().StringVar(&webhookCfg.HarborSecret, "harbor-webhook-secret", env.GetStringVal("HARBOR_WEBHOOK_SECRET", ""), "Secret for validating Harbor webhooks") runCmd.Flags().IntVar(&webhookCfg.RateLimitNumAllowedRequests, "webhook-ratelimit-allowed", env.ParseNumFromEnv("WEBHOOK_RATELIMIT_ALLOWED", 0, 0, math.MaxInt), "The number of allowed requests in an hour for webhook rate limiting, setting to 0 disables ratelimiting") + // GitLab Container Registry webhooks are not supported upstream return runCmd } @@ -360,6 +452,7 @@ func newRunCommand() *cobra.Command { func runImageUpdater(cfg *ImageUpdaterConfig, warmUp bool) (argocd.ImageUpdaterResult, error) { result := argocd.ImageUpdaterResult{} var err error + cycleStart := time.Now() var argoClient argocd.ArgoCD switch cfg.ApplicationsAPIKind { case applicationsAPIKindK8S: @@ -399,11 +492,20 @@ func runImageUpdater(cfg *ImageUpdaterConfig, warmUp bool) (argocd.ImageUpdaterR log.Infof("Starting image update cycle, considering %d annotated application(s) for update", len(appList)) } - syncState := argocd.NewSyncIterationState() + syncState := argocd.NewSyncIterationState() // Allow a maximum of MaxConcurrency number of goroutines to exist at the // same time. If in warm-up mode, set to 1 explicitly. 
- var concurrency int = cfg.MaxConcurrency + var concurrency int = cfg.MaxConcurrency + if concurrency == 0 { // auto + // simple heuristic: 8x CPUs, capped to number of apps + cpu := runtime.NumCPU() + if cpu < 1 { cpu = 1 } + concurrency = cpu * 8 + if concurrency > len(appList) { concurrency = len(appList) } + if concurrency < 1 { concurrency = 1 } + log.Infof("Auto concurrency selected: %d workers (cpus=%d apps=%d)", concurrency, cpu, len(appList)) + } if warmUp { concurrency = 1 } @@ -416,7 +518,26 @@ func runImageUpdater(cfg *ImageUpdaterConfig, warmUp bool) (argocd.ImageUpdaterR var wg sync.WaitGroup wg.Add(len(appList)) - for app, curApplication := range appList { + // Optionally reorder apps by scheduling policy + ordered := make([]string, 0, len(appList)) + for app := range appList { ordered = append(ordered, app) } + if cfg.Schedule != "default" || cfg.Cooldown > 0 || cfg.PerRepoCap > 0 { + ordered = orderApplications(ordered, appList, syncState, cfg) + } + + perRepoCounter := map[string]int{} + + for _, app := range ordered { + curApplication := appList[app] + // Per-repo cap if configured + if cfg.PerRepoCap > 0 { + repo := argocd.GetApplicationSource(&curApplication.Application).RepoURL + if perRepoCounter[repo] >= cfg.PerRepoCap { + continue + } + } + syncState.RecordAttempt(app) + metrics.Applications().SetLastAttempt(app, time.Now()) lockErr := sem.Acquire(context.Background(), 1) if lockErr != nil { log.Errorf("Could not acquire semaphore for application %s: %v", app, lockErr) @@ -428,6 +549,7 @@ func runImageUpdater(cfg *ImageUpdaterConfig, warmUp bool) (argocd.ImageUpdaterR go func(app string, curApplication argocd.ApplicationImages) { defer sem.Release(1) log.Debugf("Processing application %s", app) + appStart := time.Now() upconf := &argocd.UpdateConfiguration{ NewRegFN: registry.NewClient, ArgoClient: cfg.ArgoClient, @@ -443,7 +565,13 @@ func runImageUpdater(cfg *ImageUpdaterConfig, warmUp bool) (argocd.ImageUpdaterR DisableKubeEvents: cfg.DisableKubeEvents, GitCreds: cfg.GitCreds, } - res := argocd.UpdateApplication(upconf, syncState) + res := updateAppFn(upconf, syncState) + metrics.Applications().ObserveAppUpdateDuration(app, time.Since(appStart)) + syncState.RecordResult(app, res.NumErrors > 0) + if cfg.PerRepoCap > 0 { + repo := argocd.GetApplicationSource(&curApplication.Application).RepoURL + perRepoCounter[repo] = perRepoCounter[repo] + 1 + } result.NumApplicationsProcessed += 1 result.NumErrors += res.NumErrors result.NumImagesConsidered += res.NumImagesConsidered @@ -454,16 +582,101 @@ func runImageUpdater(cfg *ImageUpdaterConfig, warmUp bool) (argocd.ImageUpdaterR } metrics.Applications().IncreaseUpdateErrors(app, res.NumErrors) metrics.Applications().SetNumberOfImagesWatched(app, res.NumImagesConsidered) + if res.NumErrors == 0 { + metrics.Applications().SetLastSuccess(app, time.Now()) + } wg.Done() }(app, curApplication) } - // Wait for all goroutines to finish + // Wait for all goroutines to finish wg.Wait() + metrics.Applications().ObserveCycleDuration(time.Since(cycleStart)) + metrics.Applications().SetCycleLastEnd(time.Now()) return result, nil } +// runContinuousOnce runs a non-blocking scheduling pass that launches or skips +// per-app workers based on last attempt time and the configured interval. Each +// app re-schedules independently; shared limits still apply downstream. 
+func runContinuousOnce(cfg *ImageUpdaterConfig) { + apps, err := cfg.ArgoClient.ListApplications(cfg.AppLabel) + if err != nil { log.Errorf("continuous: list apps error: %v", err); return } + appList, err := argocd.FilterApplicationsForUpdate(apps, cfg.AppNamePatterns) + if err != nil { log.Errorf("continuous: filter apps error: %v", err); return } + + // Build or fetch per-process state + if contState == nil { contState = argocd.NewSyncIterationState() } + syncState := contState + ordered := make([]string, 0, len(appList)) + for a := range appList { ordered = append(ordered, a) } + if cfg.Schedule != "default" || cfg.Cooldown > 0 || cfg.PerRepoCap > 0 { + ordered = orderApplications(ordered, appList, syncState, cfg) + } + + // Use auto-concurrency when set + concurrency := cfg.MaxConcurrency + if concurrency == 0 { + cpu := runtime.NumCPU(); if cpu < 1 { cpu = 1 } + concurrency = cpu * 8 + if concurrency > len(appList) { concurrency = len(appList) } + if concurrency < 1 { concurrency = 1 } + } + sem := semaphore.NewWeighted(int64(concurrency)) + + now := time.Now() + for _, name := range ordered { + s := syncState.GetStats()[name] + if !s.LastAttempt.IsZero() && now.Sub(s.LastAttempt) < cfg.CheckInterval { + continue // not due yet + } + // don't double-dispatch same app + contMu.Lock() + if contInFlight[name] { contMu.Unlock(); continue } + contInFlight[name] = true + contMu.Unlock() + if err := sem.Acquire(context.Background(), 1); err != nil { continue } + cur := appList[name] + syncState.RecordAttempt(name) + if m := metrics.Applications(); m != nil { + m.SetLastAttempt(name, time.Now()) + } + go func(appName string, ai argocd.ApplicationImages) { + defer sem.Release(1) + defer func(){ contMu.Lock(); delete(contInFlight, appName); contMu.Unlock() }() + start := time.Now() + log.WithContext().AddField("application", appName).Infof("continuous: start processing") + upconf := &argocd.UpdateConfiguration{ + NewRegFN: registry.NewClient, + ArgoClient: cfg.ArgoClient, + KubeClient: cfg.KubeClient, + UpdateApp: &ai, + DryRun: cfg.DryRun, + GitCommitUser: cfg.GitCommitUser, + GitCommitEmail: cfg.GitCommitMail, + GitCommitMessage: cfg.GitCommitMessage, + GitCommitSigningKey: cfg.GitCommitSigningKey, + GitCommitSigningMethod: cfg.GitCommitSigningMethod, + GitCommitSignOff: cfg.GitCommitSignOff, + DisableKubeEvents: cfg.DisableKubeEvents, + GitCreds: cfg.GitCreds, + } + res := updateAppFn(upconf, syncState) + if m := metrics.Applications(); m != nil { + m.ObserveAppUpdateDuration(appName, time.Since(start)) + if res.NumErrors == 0 { m.SetLastSuccess(appName, time.Now()) } + } + dur := time.Since(start) + if res.NumErrors == 0 { + log.WithContext().AddField("application", appName).Infof("continuous: finished processing: success, duration=%s", dur) + } else { + log.WithContext().AddField("application", appName).Infof("continuous: finished processing: failed, duration=%s", dur) + } + }(name, cur) + } +} + // warmupImageCache performs a cache warm-up, which is basically one cycle of // the image update process with dryRun set to true and a maximum concurrency // of 1, i.e. sequential processing. 
diff --git a/cmd/run_test.go b/cmd/run_test.go index 11aa4ee9..9cdd4cee 100644 --- a/cmd/run_test.go +++ b/cmd/run_test.go @@ -4,10 +4,19 @@ import ( "os" "testing" "time" + "context" + "net" + "net/http" + "fmt" "github.com/stretchr/testify/assert" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/env" + "github.com/argoproj-labs/argocd-image-updater/pkg/argocd" + "github.com/argoproj-labs/argocd-image-updater/pkg/common" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1alpha1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" + "github.com/argoproj/argo-cd/v2/pkg/apiclient/application" ) // TestNewRunCommand tests various flags and their default values. @@ -55,3 +64,102 @@ func TestRootCmd(t *testing.T) { err := newRootCommand() assert.Nil(t, err) } + +// TestContinuousScheduling ensures that due apps are launched independently and respect interval. +func TestContinuousScheduling(t *testing.T) { + t.Cleanup(func(){ updateAppFn = argocd.UpdateApplication }) + // stub updater: sleep based on app name prefix to simulate slow/fast + var fastRuns, slowRuns int + updateAppFn = func(conf *argocd.UpdateConfiguration, state *argocd.SyncIterationState) argocd.ImageUpdaterResult { + app := conf.UpdateApp.Application.GetName() + if app == "slow" { + slowRuns++ + time.Sleep(250 * time.Millisecond) + } else { + fastRuns++ + time.Sleep(10 * time.Millisecond) + } + return argocd.ImageUpdaterResult{} + } + + cfg := &ImageUpdaterConfig{ + ApplicationsAPIKind: applicationsAPIKindK8S, + CheckInterval: 100 * time.Millisecond, + MaxConcurrency: 2, + Mode: "continuous", + } + cfg.ArgoClient = &fakeArgo{apps: []string{"slow", "fast"}} + + // Kick scheduler a few times within ~400ms window + deadline := time.Now().Add(400 * time.Millisecond) + for time.Now().Before(deadline) { + runContinuousOnce(cfg) + time.Sleep(20 * time.Millisecond) + } + + // Expect fast to have run more than slow + if !(fastRuns > slowRuns) { + t.Fatalf("expected fast runs > slow runs; got fast=%d slow=%d", fastRuns, slowRuns) + } +} + +// pickFreePort returns an available TCP port on localhost. +func pickFreePort(t *testing.T) int { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { t.Fatalf("pickFreePort: %v", err) } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port +} + +// TestRun_StartsWebhookWhenEnabled verifies run mode starts webhook server when enabled via env. 
+func TestRun_StartsWebhookWhenEnabled(t *testing.T) { + // choose a free port + port := pickFreePort(t) + + // Prepare command with flags that avoid external deps and exit quickly + cmd := newRunCommand() + cmd.SetArgs([]string{ + "--disable-kubernetes", + "--warmup-cache=false", + "--once", + "--loglevel", "debug", + }) + + // Set envs to enable webhook + t.Setenv("ENABLE_WEBHOOK", "true") + t.Setenv("WEBHOOK_PORT", fmt.Sprintf("%d", port)) + + // Run the command; it should return (once-mode) while webhook goroutine keeps listening + if err := cmd.Execute(); err != nil { + t.Fatalf("run command returned error: %v", err) + } + + // Give the server a moment to bind + deadline := time.Now().Add(500 * time.Millisecond) + var lastErr error + for time.Now().Before(deadline) { + resp, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/healthz", port)) + if err == nil && resp != nil { + resp.Body.Close() + if resp.StatusCode == http.StatusOK { return } + } + lastErr = err + time.Sleep(50 * time.Millisecond) + } + t.Fatalf("webhook did not start on port %d: lastErr=%v", port, lastErr) +} + +type fakeArgo struct{ apps []string } +func (f *fakeArgo) GetApplication(ctx context.Context, name string) (*v1alpha1.Application, error) { return nil, nil } +func (f *fakeArgo) ListApplications(_ string) ([]v1alpha1.Application, error) { + out := make([]v1alpha1.Application, 0, len(f.apps)) + for _, n := range f.apps { + out = append(out, v1alpha1.Application{ + ObjectMeta: v1.ObjectMeta{Name: n, Annotations: map[string]string{common.ImageUpdaterAnnotation: ""}}, + Spec: v1alpha1.ApplicationSpec{Source: &v1alpha1.ApplicationSource{Kustomize: &v1alpha1.ApplicationSourceKustomize{}}}, + Status: v1alpha1.ApplicationStatus{SourceType: v1alpha1.ApplicationSourceTypeKustomize}, + }) + } + return out, nil +} +func (f *fakeArgo) UpdateSpec(ctx context.Context, _ *application.ApplicationUpdateSpecRequest) (*v1alpha1.ApplicationSpec, error) { return nil, nil } diff --git a/cmd/webhook.go b/cmd/webhook.go index f4641098..5b994fbb 100644 --- a/cmd/webhook.go +++ b/cmd/webhook.go @@ -201,7 +201,7 @@ Supported registries: webhookCmd.Flags().StringVar(&webhookCfg.GHCRSecret, "ghcr-webhook-secret", env.GetStringVal("GHCR_WEBHOOK_SECRET", ""), "Secret for validating GitHub Container Registry webhooks") webhookCmd.Flags().StringVar(&webhookCfg.QuaySecret, "quay-webhook-secret", env.GetStringVal("QUAY_WEBHOOK_SECRET", ""), "Secret for validating Quay webhooks") webhookCmd.Flags().StringVar(&webhookCfg.HarborSecret, "harbor-webhook-secret", env.GetStringVal("HARBOR_WEBHOOK_SECRET", ""), "Secret for validating Harbor webhooks") - webhookCmd.Flags().IntVar(&webhookCfg.RateLimitNumAllowedRequests, "webhook-ratelimit-allowed", env.ParseNumFromEnv("WEBHOOK_RATELIMIT_ALLOWED", 0, 0, math.MaxInt), "The number of allowed requests in an hour for webhook rate limiting, setting to 0 disables ratelimiting") + webhookCmd.Flags().IntVar(&webhookCfg.RateLimitNumAllowedRequests, "webhook-ratelimit-allowed", env.ParseNumFromEnv("WEBHOOK_RATELIMIT_ALLOWED", 0, 0, math.MaxInt), "The number of allowed requests in an hour for webhook rate limiting, setting to 0 disables ratelimiting") return webhookCmd } @@ -241,11 +241,11 @@ func runWebhook(cfg *ImageUpdaterConfig, webhookCfg *WebhookConfig) error { ghcrHandler := webhook.NewGHCRWebhook(webhookCfg.GHCRSecret) handler.RegisterHandler(ghcrHandler) - quayHandler := webhook.NewQuayWebhook(webhookCfg.QuaySecret) - handler.RegisterHandler(quayHandler) + quayHandler := 
webhook.NewQuayWebhook(webhookCfg.QuaySecret) + handler.RegisterHandler(quayHandler) - harborHandler := webhook.NewHarborWebhook(webhookCfg.HarborSecret) - handler.RegisterHandler(harborHandler) + harborHandler := webhook.NewHarborWebhook(webhookCfg.HarborSecret) + handler.RegisterHandler(harborHandler) // Create webhook server server := webhook.NewWebhookServer(webhookCfg.Port, handler, cfg.KubeClient, cfg.ArgoClient) diff --git a/config/example-grafana-dashboard.json b/config/example-grafana-dashboard.json index 65246ca5..51f8cd68 100644 --- a/config/example-grafana-dashboard.json +++ b/config/example-grafana-dashboard.json @@ -33,6 +33,37 @@ "title": "Configuration", "type": "row" }, + { + "collapsed": false, + "datasource": null, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 28}, + "id": 30, + "panels": [], + "title": "Lagging applications", + "type": "row" + }, + { + "datasource": null, + "description": "Minutes since last successful update per application; shows only those > 5 minutes.", + "fieldConfig": {"defaults": {"unit": "m"}, "overrides": []}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 29}, + "id": 32, + "options": {"showHeader": true}, + "pluginVersion": "10.x", + "targets": [ + { + "expr": "((time() - max by (application) (argocd_image_updater_application_last_success_timestamp)) / 60) > 5", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Apps lagging > 5 minutes (minutes)", + "transformations": [ + {"id": "labelsToFields", "options": {"mode": "columns"}} + ], + "type": "table" + }, { "aliasColors": {}, "bars": false, diff --git a/docs/configuration/webhook.md b/docs/configuration/webhook.md index 506e7d77..56249c59 100644 --- a/docs/configuration/webhook.md +++ b/docs/configuration/webhook.md @@ -147,6 +147,97 @@ Supported Registries That Use This: Also be aware that if the container registry has a built-in secrets method you will not be able to use this method. +## Running the webhook as a Kubernetes sidecar + +If you prefer not to enable the webhook server inside the main `run` process, you can run +the dedicated webhook server as a sidecar container in the same Pod. This is often the +most reliable setup when fronting the server with an Ingress/Gateway and makes +troubleshooting straightforward (the sidecar will always bind its port independently +from the main loop). + +### Note on run mode vs sidecar + +When the webhook is enabled in `run` mode, the server is started after the process +initializes its Kubernetes/Argo CD clients. If that initialization is slow or fails +(e.g., API reachability, DNS, RBAC), the execution path may not reach the webhook start, +so the port will not bind and you will not see the +"Starting webhook server on port " log. + +The sidecar approach avoids that dependency and binds immediately, which is often +preferable when exposing the webhook through a Gateway/Ingress. If you do use the +`run` mode webhook, ensure cluster connectivity is healthy and verify startup logs +for the binding message. + +### Secret example (Harbor) + +If you want the server to validate Harbor webhooks using a shared secret, create or +update the existing secret as follows: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: argocd-image-updater-secret + namespace: argocd +type: Opaque +stringData: + webhook.harbor-secret: "" +``` + +Notes: +- If you set the secret, the server expects an HMAC-SHA256 signature of the request body + in the `Authorization` header (e.g., `sha256=`). 
If your registry cannot generate + that header, leave the secret empty (validation disabled) and do not send an auth header. + +### Sidecar container example + +Add this container to your Deployment. It serves the webhook and health endpoints +on port `8082`. + +```yaml +- name: argocd-image-updater-webhook + image: quay.io/argoprojlabs/argocd-image-updater:latest + args: ["webhook","--webhook-port","8082"] + env: + - name: HARBOR_WEBHOOK_SECRET + valueFrom: + secretKeyRef: + name: argocd-image-updater-secret + key: webhook.harbor-secret + ports: + - containerPort: 8082 + name: webhook + # Optional: silence warnings by mounting config (if you already mount it for the main container) + volumeMounts: + - mountPath: /app/config + name: image-updater-conf +``` + +Expose the webhook port via your Service and route traffic to `/webhook`: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: argocd-image-updater + namespace: argocd +spec: + selector: + app.kubernetes.io/name: argocd-image-updater + ports: + - name: webhook + port: 8082 + targetPort: 8082 +``` + +In Harbor, configure the endpoint URL to: + +``` +https:///webhook?type=harbor +``` + +and select the “Artifact pushed” event. + ## Exposing the Server To expose the webhook server we have provided a service and ingress to get diff --git a/docs/install/cmd/run.md b/docs/install/cmd/run.md index 99cabd1d..2253563c 100644 --- a/docs/install/cmd/run.md +++ b/docs/install/cmd/run.md @@ -263,3 +263,49 @@ means that the rate limiting is disabled. Can also be set with the `WEBHOOK_RATELIMIT_ALLOWED` environment variable. [label selector syntax]: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + +### Example: Kubernetes deployment args and env + +The following example shows a common way to run the updater in continuous mode with LRU scheduling, limited concurrency, a label selector, and tuned registry HTTP settings. + +```yaml +spec: + template: + spec: + containers: + - name: argocd-image-updater + args: + - run + - "--interval" + - 60s + - "--max-concurrency" + - "8" + - "--match-application-label" + - argocd-project=my-project + - "--mode" + - continuous + - "--schedule" + - lru + - "--warmup-cache=false" + env: + - name: REGISTRY_TLS_HANDSHAKE_TIMEOUT + value: 30s + - name: REGISTRY_JWT_ATTEMPTS + value: "10" + - name: REGISTRY_JWT_RETRY_BASE + value: 500ms + - name: REGISTRY_JWT_RETRY_MAX + value: 5s + - name: REGISTRY_TAG_TIMEOUT + value: 120s + - name: REGISTRY_MANIFEST_TIMEOUT + value: 120s + - name: REGISTRY_RESPONSE_HEADER_TIMEOUT + value: 120s + - name: REGISTRY_MAX_CONNS_PER_HOST + value: "10" +``` + +Notes: +- Replace `my-project` with your actual label value. +- The registry timeouts and limits are safe defaults for busy registries; tune for your environment. 
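For clarity on what the `REGISTRY_*_TIMEOUT` values above mean: each bounds a single attempt, and the retry loop then makes several such attempts. A small sketch of reading such a duration and applying it as a per-attempt deadline; `fetchTagsOnce` is a hypothetical stand-in, not the project's API:

```go
package main

import (
	"context"
	"fmt"
	"os"
	"time"
)

// durationFromEnv reads a Go-style duration (e.g. "120s") from the environment,
// falling back to def when the variable is unset or unparsable.
func durationFromEnv(key string, def time.Duration) time.Duration {
	if v := os.Getenv(key); v != "" {
		if d, err := time.ParseDuration(v); err == nil {
			return d
		}
	}
	return def
}

// fetchTagsOnce stands in for one tag-list attempt; the deadline covers only
// this attempt, so a retrying caller gets a fresh budget each time.
func fetchTagsOnce(ctx context.Context, image string) ([]string, error) {
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	default:
		return []string{"v1.2.3"}, nil // placeholder result
	}
}

func main() {
	timeout := durationFromEnv("REGISTRY_TAG_TIMEOUT", 60*time.Second)
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	tags, err := fetchTagsOnce(ctx, "example/app")
	fmt.Println(tags, err)
}
```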
diff --git a/docs/install/installation.md b/docs/install/installation.md index 72986243..efc2a3d6 100644 --- a/docs/install/installation.md +++ b/docs/install/installation.md @@ -271,5 +271,63 @@ The following metrics are being made available: * `argocd_image_updater_registry_requests_total` * `argocd_image_updater_registry_requests_failed_total` +### Additional metrics (performance and troubleshooting) + +* Per-application timings and state + * `argocd_image_updater_application_update_duration_seconds{application}` + * `argocd_image_updater_application_last_attempt_timestamp{application}` + * `argocd_image_updater_application_last_success_timestamp{application}` + * `argocd_image_updater_images_considered_total{application}` + * `argocd_image_updater_images_skipped_total{application}` + * `argocd_image_updater_scheduler_skipped_total{reason}` (e.g., cooldown, per-repo-cap) + +* Update cycle timing + * `argocd_image_updater_update_cycle_duration_seconds` + * `argocd_image_updater_update_cycle_last_end_timestamp` + +* Registry request health + * `argocd_image_updater_registry_in_flight_requests{registry}` + * `argocd_image_updater_registry_request_duration_seconds{registry}` + * `argocd_image_updater_registry_http_status_total{registry,code}` + * `argocd_image_updater_registry_request_retries_total{registry,op}` (auth|tags|manifest) + * `argocd_image_updater_registry_errors_total{registry,kind}` (timeout, dial_error, auth_error, 429, 5xx, ctx_deadline) + +* Singleflight (deduplication effectiveness) + * `argocd_image_updater_singleflight_leaders_total{kind}` (tags|manifest) + * `argocd_image_updater_singleflight_followers_total{kind}` + +Notes: +* Metrics are exposed at `/metrics` (see `--metrics-port`, default 8081). +* Labels like `{application}`, `{registry}`, `{repo}`, `{op}`, `{kind}` enable fine-grained dashboards. + A (very) rudimentary example dashboard definition for Grafana is provided [here](https://github.com/argoproj-labs/argocd-image-updater/tree/master/config) + +## Performance flags (recommended) + +For large fleets and monorepos, enable continuous scheduling and auto concurrency: + +For a complete example of args and environment variables, see the run command reference. We keep one canonical example there to avoid duplication. + +See: [Run command examples](./cmd/run.md#example-kubernetes-deployment-args-and-env) + +Notes: +- Continuous mode preserves all shared protections (per‑registry in‑flight cap, retries, singleflight, git batching). +- Keep per‑registry rate limits tuned in `registries.conf` to match registry capacity. +- `--per-repo-cap`: maximum apps from the same Git repository processed in one pass. Prevents a single monorepo from monopolizing workers; improves fleet fairness. +- `--cooldown`: deprioritizes apps successfully updated within this duration so other apps get slots first. Reduces thrash on hot apps. 
+ + +### Defaults enabled without flags + +The following performance features are ON by default (no flags required): + +- HTTP transport reuse with tuned timeouts (keep‑alive pools, sane phase timeouts) +- Per‑registry in‑flight cap (default 15 concurrent requests per registry) +- Authorizer cache per (registry, repo) for bearer/JWT reuse +- Singleflight deduplication for tags, manifests, and /jwt/auth +- Retries: tags/manifests (3 attempts with jitter) +- JWT auth retries (defaults: 7 attempts; see REGISTRY_JWT_* above to tune) +- Git retries (fetch/shallow/push) with sane defaults (env overridable) +- Batched Git writer (coalesces per‑repo writes; disable via GIT_BATCH_DISABLE=true) +- Expanded Prometheus metrics for apps, cycles, registry, JWT diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 00000000..d6daed0d --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,100 @@ +# Metrics and useful PromQL + +This page lists the most useful Prometheus metrics exported by `argocd-image-updater` and ready‑to‑use PromQL queries. + +## Core counters and gauges + +- Applications watched: `argocd_image_updater_applications_watched_total` +- Images watched: `argocd_image_updater_images_watched_total` +- Images updated: `argocd_image_updater_images_updated_total` +- Image update errors: `argocd_image_updater_images_updated_error_total` +- Registry requests by status: `argocd_image_updater_registry_http_status_total{registry,code}` +- Registry request duration: `argocd_image_updater_registry_request_duration_seconds{registry}` (histogram) +- JWT auth metrics: `argocd_image_updater_registry_jwt_*` +- Per‑app timestamps: `argocd_image_updater_application_last_attempt_timestamp{application}` and `argocd_image_updater_application_last_success_timestamp{application}` + +## Which apps are lagging more than 5 minutes? 
+ +Minutes since last success (collapsed across replicas): + +```promql +((time() - max by (application) (argocd_image_updater_application_last_success_timestamp)) / 60) +``` + +Filter for > 5 minutes: + +```promql +((time() - max by (application) (argocd_image_updater_application_last_success_timestamp)) / 60) > 5 +``` + +Fallback to last attempt if there has never been a success: + +```promql +( + (time() - max by (application) (argocd_image_updater_application_last_success_timestamp)) + or on (application) + (time() - max by (application) (argocd_image_updater_application_last_attempt_timestamp)) +) / 60 > 5 +``` + +Top 20 most lagging apps (minutes): + +```promql +topk(20, (time() - max by (application) (argocd_image_updater_application_last_success_timestamp)) / 60) +``` + +## Update throughput + +Total updates and errors per minute: + +```promql +sum(increase(argocd_image_updater_images_updated_total[1m])) +sum(increase(argocd_image_updater_images_updated_error_total[1m])) +``` + +Per‑application updates (5m window): + +```promql +sum by (application) (increase(argocd_image_updater_images_updated_total[5m])) +``` + +## Registry health + +Requests vs errors per registry (1m window): + +```promql +sum by (registry) (increase(argocd_image_updater_registry_requests_total[1m])) +sum by (registry) (increase(argocd_image_updater_registry_requests_failed_total[1m])) +``` + +Latency (p50/p90/p99): + +```promql +histogram_quantile(0.5, sum by (le, registry) (rate(argocd_image_updater_registry_request_duration_seconds_bucket[5m]))) +histogram_quantile(0.9, sum by (le, registry) (rate(argocd_image_updater_registry_request_duration_seconds_bucket[5m]))) +histogram_quantile(0.99, sum by (le, registry) (rate(argocd_image_updater_registry_request_duration_seconds_bucket[5m]))) +``` + +HTTP status breakdown per registry: + +```promql +sum by (registry, code) (increase(argocd_image_updater_registry_http_status_total[5m])) +``` + +## JWT auth health + +Requests, errors, and durations (5m window): + +```promql +sum by (registry, service, scope) (increase(argocd_image_updater_registry_jwt_auth_requests_total[5m])) +sum by (registry, service, scope) (increase(argocd_image_updater_registry_jwt_auth_errors_total[5m])) +``` + +## Tips + +- Use `max by (application)` to deduplicate across multiple replicas. +- Switch Grafana to Table view to list lagging apps with values. +- Consider adding thresholds/alerts for lag > 10–15 minutes. 
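Building on the lag query and the alerting tip above, a minimal Prometheus alerting rule could look like the following sketch. The 15-minute threshold, `for:` duration, and severity label are placeholders — pick values that match your fleet's update cadence.

```yaml
groups:
- name: argocd-image-updater
  rules:
  - alert: ImageUpdaterApplicationLagging
    expr: |
      ((time() - max by (application) (argocd_image_updater_application_last_success_timestamp)) / 60) > 15
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "argocd-image-updater has not successfully updated {{ $labels.application }} for more than 15 minutes"
```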
+ + + diff --git a/ext/git/client.go b/ext/git/client.go index 622bcf1d..a6ae12b3 100644 --- a/ext/git/client.go +++ b/ext/git/client.go @@ -118,7 +118,7 @@ type runOpts struct { } var ( - maxAttemptsCount = 1 + maxAttemptsCount = 3 maxRetryDuration time.Duration retryDuration time.Duration factor int64 @@ -133,9 +133,10 @@ func init() { } } - maxRetryDuration = env.ParseDurationFromEnv(common.EnvGitRetryMaxDuration, common.DefaultGitRetryMaxDuration, 0, math.MaxInt64) - retryDuration = env.ParseDurationFromEnv(common.EnvGitRetryDuration, common.DefaultGitRetryDuration, 0, math.MaxInt64) - factor = env.ParseInt64FromEnv(common.EnvGitRetryFactor, common.DefaultGitRetryFactor, 0, math.MaxInt64) + // Defaults if env not set: maxRetries backoff settings per request + maxRetryDuration = env.ParseDurationFromEnv(common.EnvGitRetryMaxDuration, 10*time.Second, 0, math.MaxInt64) + retryDuration = env.ParseDurationFromEnv(common.EnvGitRetryDuration, 500*time.Millisecond, 0, math.MaxInt64) + factor = env.ParseInt64FromEnv(common.EnvGitRetryFactor, 2, 0, math.MaxInt64) } @@ -369,12 +370,24 @@ func (m *nativeGitClient) Fetch(revision string) error { defer done() } - err := m.fetch(revision) + var err error + for attempt := 0; attempt < maxAttemptsCount; attempt++ { + err = m.fetch(revision) + if err == nil { + break + } + // exponential backoff before retrying + timeToWait := float64(retryDuration) * (math.Pow(float64(factor), float64(attempt))) + if maxRetryDuration > 0 { + timeToWait = math.Min(float64(maxRetryDuration), timeToWait) + } + time.Sleep(time.Duration(timeToWait)) + } // When we have LFS support enabled, check for large files and fetch them too. if err == nil && m.IsLFSEnabled() { - largeFiles, err := m.LsLargeFiles() - if err == nil && len(largeFiles) > 0 { + largeFiles, lerr := m.LsLargeFiles() + if lerr == nil && len(largeFiles) > 0 { err = m.runCredentialedCmd("lfs", "fetch", "--all") if err != nil { return err diff --git a/ext/git/writer.go b/ext/git/writer.go index 79534474..69d66b3b 100644 --- a/ext/git/writer.go +++ b/ext/git/writer.go @@ -5,6 +5,8 @@ import ( "os/exec" "strconv" "strings" + "math" + "time" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" ) @@ -93,7 +95,20 @@ func (m *nativeGitClient) Push(remote string, branch string, force bool) error { args = append(args, "-f") } args = append(args, remote, branch) - err := m.runCredentialedCmd(args...) + + var err error + for attempt := 0; attempt < maxAttemptsCount; attempt++ { + err = m.runCredentialedCmd(args...) 
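+		// On failure, fall through to the capped exponential backoff below before the next attempt.
+		// With the defaults from client.go (3 attempts, 500ms base, factor 2, 10s cap) the waits
+		// work out to roughly 0.5s, 1s and 2s; all of them are overridable via the
+		// common.EnvGitRetry* environment variables read in init().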
+ if err == nil { + return nil + } + // exponential backoff before retrying + timeToWait := float64(retryDuration) * (math.Pow(float64(factor), float64(attempt))) + if maxRetryDuration > 0 { + timeToWait = math.Min(float64(maxRetryDuration), timeToWait) + } + time.Sleep(time.Duration(timeToWait)) + } if err != nil { return fmt.Errorf("could not push %s to %s: %v", branch, remote, err) } diff --git a/go.mod b/go.mod index c49565ea..1f49eb82 100644 --- a/go.mod +++ b/go.mod @@ -37,7 +37,7 @@ require ( dario.cat/mergo v1.0.1 // indirect github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/MakeNowJust/heredoc v1.0.0 // indirect - github.com/Masterminds/semver/v3 v3.3.1 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/ProtonMail/go-crypto v1.1.6 // indirect github.com/benbjohnson/clock v1.3.0 // indirect @@ -174,10 +174,10 @@ require ( ) replace ( - github.com/cyphar/filepath-securejoin => github.com/cyphar/filepath-securejoin v0.3.6 - // Uncomment for local testing - // github.com/argoproj-labs/argocd-image-updater/registry-scanner => ./registry-scanner/ - github.com/golang/protobuf => github.com/golang/protobuf v1.5.4 + github.com/cyphar/filepath-securejoin => github.com/cyphar/filepath-securejoin v0.3.6 + // Use local registry-scanner during build (Dockerfile copies whole repo before mod download) + github.com/argoproj-labs/argocd-image-updater/registry-scanner => ./registry-scanner/ + github.com/golang/protobuf => github.com/golang/protobuf v1.5.4 k8s.io/api => k8s.io/api v0.32.2 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.32.2 diff --git a/go.sum b/go.sum index ec8b1e6b..2dc0ec5a 100644 --- a/go.sum +++ b/go.sum @@ -10,8 +10,8 @@ github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg6 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ= github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= -github.com/Masterminds/semver/v3 v3.3.1 h1:QtNSWtVZ3nBfk8mAOu/B6v7FMJ+NHTIgUPi7rj+4nv4= -github.com/Masterminds/semver/v3 v3.3.1/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= diff --git a/pkg/argocd/git.go b/pkg/argocd/git.go index 53cc6b94..2a41f371 100644 --- a/pkg/argocd/git.go +++ b/pkg/argocd/git.go @@ -10,6 +10,8 @@ import ( "path" "path/filepath" "text/template" + "sync" + "time" "sigs.k8s.io/kustomize/api/konfig" "sigs.k8s.io/kustomize/api/types" @@ -19,12 +21,15 @@ import ( "github.com/argoproj-labs/argocd-image-updater/pkg/common" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/image" + "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/env" "github.com/argoproj-labs/argocd-image-updater/ext/git" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" ) +// allow tests to replace git client factory +var 
newGitClient = git.NewClientExt // templateCommitMessage renders a commit message template and returns it as // as a string. If the template could not be rendered, returns a default @@ -129,6 +134,175 @@ func TemplateBranchName(branchName string, changeList []ChangeEntry) string { type changeWriter func(app *v1alpha1.Application, wbc *WriteBackConfig, gitC git.Client) (err error, skip bool) +// repoMu serializes git operations per repository URL to reduce contention in monorepos +var repoMu sync.Map // map[string]*sync.Mutex + +func getRepoMutex(repo string) *sync.Mutex { + if v, ok := repoMu.Load(repo); ok { + return v.(*sync.Mutex) + } + m := &sync.Mutex{} + actual, _ := repoMu.LoadOrStore(repo, m) + return actual.(*sync.Mutex) +} + +// ----------------------- +// Batched repo writer +// ----------------------- + +type writeIntent struct { + app *v1alpha1.Application + wbc *WriteBackConfig + changeList []ChangeEntry + writeFn changeWriter +} + +type repoWriter struct { + repoURL string + intentsCh chan writeIntent + flushEvery time.Duration + maxBatch int + stopCh chan struct{} +} + +var writers sync.Map // map[string]*repoWriter + +func getOrCreateWriter(repo string) *repoWriter { + if v, ok := writers.Load(repo); ok { + return v.(*repoWriter) + } + rw := &repoWriter{ + repoURL: repo, + intentsCh: make(chan writeIntent, 1024), + flushEvery: env.GetDurationVal("GIT_BATCH_FLUSH_INTERVAL", 2*time.Second), + maxBatch: env.ParseNumFromEnv("GIT_BATCH_MAX", 10, 1, 1000), + stopCh: make(chan struct{}), + } + go rw.loop() + actual, _ := writers.LoadOrStore(repo, rw) + return actual.(*repoWriter) +} + +func (rw *repoWriter) loop() { + ticker := time.NewTicker(rw.flushEvery) + defer ticker.Stop() + batch := make([]writeIntent, 0, rw.maxBatch) + flush := func() { if len(batch) > 0 { rw.flushBatch(batch); batch = batch[:0] } } + for { + select { + case wi := <-rw.intentsCh: + batch = append(batch, wi) + if len(batch) >= rw.maxBatch { flush() } + case <-ticker.C: + flush() + case <-rw.stopCh: + flush(); return + } + } +} + +func (rw *repoWriter) flushBatch(batch []writeIntent) { + // Group intents by resolved push branch to avoid mixing branches + byBranch := groupIntentsByBranch(batch) + for branch, intents := range byBranch { + rw.commitBatch(branch, intents) + } +} + +// groupIntentsByBranch groups write intents by their resolved push branch. +// This is a pure function intended for testing and reuse. 
+func groupIntentsByBranch(batch []writeIntent) map[string][]writeIntent { + byBranch := map[string][]writeIntent{} + for _, wi := range batch { + branch := wi.wbc.GitWriteBranch + if branch == "" { + branch = getWriteBackBranch(wi.app) + } + byBranch[branch] = append(byBranch[branch], wi) + } + return byBranch +} + +func (rw *repoWriter) commitBatch(branch string, intents []writeIntent) { + if len(intents) == 0 { return } + // Use creds and identity from first intent + first := intents[0] + logCtx := log.WithContext().AddField("repository", rw.repoURL) + + // Only Git write-back is supported by the batched writer + if first.wbc == nil || first.wbc.Method != WriteBackGit { + logCtx.Warnf("skipping batched commit: non-git write-back method") + return + } + + if first.wbc.GetCreds == nil { + logCtx.Errorf("batched commit: missing GetCreds resolver; skipping") + return + } + creds, err := first.wbc.GetCreds(first.app) + if err != nil { logCtx.Errorf("could not get creds: %v", err); return } + + tempRoot, err := os.MkdirTemp(os.TempDir(), "git-batch-") + if err != nil { logCtx.Errorf("temp dir: %v", err); return } + defer func(){ _ = os.RemoveAll(tempRoot) }() + + gitC, err := newGitClient(rw.repoURL, tempRoot, creds, false, false, "") + if err != nil { logCtx.Errorf("git client: %v", err); return } + if err = gitC.Init(); err != nil { logCtx.Errorf("git init: %v", err); return } + + // Resolve checkout and push branch similarly to commitChangesGit + checkOutBranch := getWriteBackBranch(first.app) + if first.wbc.GitBranch != "" { checkOutBranch = first.wbc.GitBranch } + if checkOutBranch == "" || checkOutBranch == "HEAD" { + b, err := gitC.SymRefToBranch(checkOutBranch) + if err != nil { logCtx.Errorf("resolve branch: %v", err); return } + checkOutBranch = b + } + pushBranch := branch + + // Ensure the branch exists locally + if pushBranch != checkOutBranch { + if err := gitC.ShallowFetch(pushBranch, 1); err != nil { + if err2 := gitC.ShallowFetch(checkOutBranch, 1); err2 != nil { logCtx.Errorf("fetch: %v", err2); return } + if err := gitC.Branch(checkOutBranch, pushBranch); err != nil { logCtx.Errorf("branch: %v", err); return } + } + } else { + if err := gitC.ShallowFetch(checkOutBranch, 1); err != nil { logCtx.Errorf("fetch: %v", err); return } + } + if err := gitC.Checkout(pushBranch, false); err != nil { logCtx.Errorf("checkout: %v", err); return } + + // Apply writes for each intent using shared repo + combinedChanges := 0 + for _, wi := range intents { + if wi.wbc.GitCommitUser != "" && wi.wbc.GitCommitEmail != "" { + _ = gitC.Config(wi.wbc.GitCommitUser, wi.wbc.GitCommitEmail) + } + if err, skip := wi.writeFn(wi.app, wi.wbc, gitC); err != nil { + logCtx.Errorf("write failed for app %s: %v", wi.app.GetName(), err) + continue + } else if skip { + continue + } + combinedChanges += len(wi.changeList) + } + if combinedChanges == 0 { return } + + // Compose a commit message summarizing apps + msg := "Update parameters for " + for i, wi := range intents { + if i > 0 { msg += ", " } + msg += wi.app.GetName() + } + + commitOpts := &git.CommitOptions{ CommitMessageText: msg, SigningKey: first.wbc.GitCommitSigningKey, SigningMethod: first.wbc.GitCommitSigningMethod, SignOff: first.wbc.GitCommitSignOff } + if err := gitC.Commit("", commitOpts); err != nil { logCtx.Errorf("commit: %v", err); return } + if err := gitC.Push("origin", pushBranch, pushBranch != checkOutBranch); err != nil { logCtx.Errorf("push: %v", err); return } +} + +func enqueueWriteIntent(wi writeIntent) { + 
getOrCreateWriter(wi.wbc.GitRepo).intentsCh <- wi +} + // getWriteBackBranch returns the branch to use for write-back operations. // It first checks for a branch specified in annotations, then uses the // targetRevision from the matching git source, falling back to getApplicationSource. @@ -151,15 +325,23 @@ func getWriteBackBranch(app *v1alpha1.Application) string { } } - // Fall back to getApplicationSource's targetRevision - // This maintains consistency with how other parts of the code select the source - return getApplicationSource(app).TargetRevision + // Fall back to getApplicationSource's targetRevision + // This maintains consistency with how other parts of the code select the source + src := getApplicationSource(app) + if src == nil { + return "" + } + return src.TargetRevision } // commitChanges commits any changes required for updating one or more images // after the UpdateApplication cycle has finished. func commitChangesGit(app *v1alpha1.Application, wbc *WriteBackConfig, changeList []ChangeEntry, write changeWriter) error { - logCtx := log.WithContext().AddField("application", app.GetName()) + logCtx := log.WithContext().AddField("application", app.GetName()) + // Serialize per repo to avoid many workers hammering the same monorepo + repoLock := getRepoMutex(wbc.GitRepo) + repoLock.Lock() + defer repoLock.Unlock() creds, err := wbc.GetCreds(app) if err != nil { return fmt.Errorf("could not get creds for repo '%s': %v", wbc.GitRepo, err) @@ -176,7 +358,7 @@ func commitChangesGit(app *v1alpha1.Application, wbc *WriteBackConfig, changeLis logCtx.Errorf("could not remove temp dir: %v", err) } }() - gitC, err = git.NewClientExt(wbc.GitRepo, tempRoot, creds, false, false, "") + gitC, err = newGitClient(wbc.GitRepo, tempRoot, creds, false, false, "") if err != nil { return err } diff --git a/pkg/argocd/git_test.go b/pkg/argocd/git_test.go index 655285fa..d83785ef 100644 --- a/pkg/argocd/git_test.go +++ b/pkg/argocd/git_test.go @@ -1,462 +1,80 @@ package argocd import ( - "os" - "testing" - "text/template" - "time" - - "github.com/argoproj-labs/argocd-image-updater/pkg/common" - "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/image" - "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/tag" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "sigs.k8s.io/kustomize/api/types" - kyaml "sigs.k8s.io/kustomize/kyaml/yaml" - - "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" - "github.com/stretchr/testify/assert" + "testing" + "sync/atomic" + "github.com/argoproj-labs/argocd-image-updater/pkg/common" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1alpha1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" + extgit "github.com/argoproj-labs/argocd-image-updater/ext/git" ) -func Test_TemplateCommitMessage(t *testing.T) { - t.Run("Template default commit message", func(t *testing.T) { - exp := `build: automatic update of foobar - -updates image foo/bar tag '1.0' to '1.1' -updates image bar/baz tag '2.0' to '2.1' -` - tpl := template.Must(template.New("sometemplate").Parse(common.DefaultGitCommitMessage)) - cl := []ChangeEntry{ - { - Image: image.NewFromIdentifier("foo/bar"), - OldTag: tag.NewImageTag("1.0", time.Now(), ""), - NewTag: tag.NewImageTag("1.1", time.Now(), ""), - }, - { - Image: image.NewFromIdentifier("bar/baz"), - OldTag: tag.NewImageTag("2.0", time.Now(), ""), - NewTag: tag.NewImageTag("2.1", time.Now(), ""), - }, - } - r := TemplateCommitMessage(tpl, "foobar", cl) - assert.NotEmpty(t, r) - assert.Equal(t, exp, r) - }) 
-} - -func Test_TemplateBranchName(t *testing.T) { - t.Run("Template branch name with image name", func(t *testing.T) { - exp := `image-updater-foo/bar-1.1-bar/baz-2.1` - tpl := "image-updater{{range .Images}}-{{.Name}}-{{.NewTag}}{{end}}" - cl := []ChangeEntry{ - { - Image: image.NewFromIdentifier("foo/bar"), - OldTag: tag.NewImageTag("1.0", time.Now(), ""), - NewTag: tag.NewImageTag("1.1", time.Now(), ""), - }, - { - Image: image.NewFromIdentifier("bar/baz"), - OldTag: tag.NewImageTag("2.0", time.Now(), ""), - NewTag: tag.NewImageTag("2.1", time.Now(), ""), - }, - } - r := TemplateBranchName(tpl, cl) - assert.NotEmpty(t, r) - assert.Equal(t, exp, r) - }) - t.Run("Template branch name with alias", func(t *testing.T) { - exp := `image-updater-bar-1.1` - tpl := "image-updater{{range .Images}}-{{.Alias}}-{{.NewTag}}{{end}}" - cl := []ChangeEntry{ - { - Image: image.NewFromIdentifier("bar=0001.dkr.ecr.us-east-1.amazonaws.com/bar"), - OldTag: tag.NewImageTag("1.0", time.Now(), ""), - NewTag: tag.NewImageTag("1.1", time.Now(), ""), - }, - } - r := TemplateBranchName(tpl, cl) - assert.NotEmpty(t, r) - assert.Equal(t, exp, r) - }) - t.Run("Template branch name with hash", func(t *testing.T) { - // Expected value generated from https://emn178.github.io/online-tools/sha256.html - exp := `image-updater-0fcc2782543e4bb067c174c21bf44eb947f3e55c0d62c403e359c1c209cbd041` - tpl := "image-updater-{{.SHA256}}" - cl := []ChangeEntry{ - { - Image: image.NewFromIdentifier("foo/bar"), - OldTag: tag.NewImageTag("1.0", time.Now(), ""), - NewTag: tag.NewImageTag("1.1", time.Now(), ""), - }, - } - r := TemplateBranchName(tpl, cl) - assert.NotEmpty(t, r) - assert.Equal(t, exp, r) - }) - t.Run("Template branch over 255 chars", func(t *testing.T) { - tpl := "image-updater-lorem-ipsum-dolor-sit-amet-consectetur-" + - "adipiscing-elit-phasellus-imperdiet-vitae-elit-quis-pulvinar-" + - "suspendisse-pulvinar-lacus-vel-semper-congue-enim-purus-posuere-" + - "orci-ut-vulputate-mi-ipsum-quis-ipsum-quisque-elit-arcu-lobortis-" + - "in-blandit-vel-pharetra-vel-urna-aliquam-euismod-elit-vel-mi" - exp := tpl[:255] - cl := []ChangeEntry{} - r := TemplateBranchName(tpl, cl) - assert.NotEmpty(t, r) - assert.Equal(t, exp, r) - assert.Len(t, r, 255) - }) -} - -func Test_parseImageOverride(t *testing.T) { - cases := []struct { - name string - override v1alpha1.KustomizeImage - expected types.Image - }{ - {"tag update", "ghcr.io:1234/foo/foo:123", types.Image{ - Name: "ghcr.io:1234/foo/foo", - NewTag: "123", - }}, - {"image update", "ghcr.io:1234/foo/foo=ghcr.io:1234/bar", types.Image{ - Name: "ghcr.io:1234/foo/foo", - NewName: "ghcr.io:1234/bar", - }}, - {"update everything", "ghcr.io:1234/foo/foo=1234.foo.com:9876/bar:123", types.Image{ - Name: "ghcr.io:1234/foo/foo", - NewName: "1234.foo.com:9876/bar", - NewTag: "123", - }}, - {"change registry and tag", "ghcr.io:1234/foo/foo=1234.dkr.ecr.us-east-1.amazonaws.com/bar:123", types.Image{ - Name: "ghcr.io:1234/foo/foo", - NewName: "1234.dkr.ecr.us-east-1.amazonaws.com/bar", - NewTag: "123", - }}, - {"change only registry", "0001.dkr.ecr.us-east-1.amazonaws.com/bar=1234.dkr.ecr.us-east-1.amazonaws.com/bar", types.Image{ - Name: "0001.dkr.ecr.us-east-1.amazonaws.com/bar", - NewName: "1234.dkr.ecr.us-east-1.amazonaws.com/bar", - }}, - {"change image and set digest", "foo=acme/app@sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", types.Image{ - Name: "foo", - NewName: "acme/app", - Digest: "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - 
}}, - {"set digest", "acme/app@sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", types.Image{ - Name: "acme/app", - Digest: "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - }}, - } - - for _, tt := range cases { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expected, parseImageOverride(tt.override)) - }) - } - -} - -func Test_imagesFilter(t *testing.T) { - for _, tt := range []struct { - name string - images v1alpha1.KustomizeImages - expected string - }{ - {name: "simple", images: v1alpha1.KustomizeImages{"foo"}, expected: ` -images: -- name: foo -`}, - {name: "tagged", images: v1alpha1.KustomizeImages{"foo:bar"}, expected: ` -images: -- name: foo - newTag: bar -`}, - {name: "rename", images: v1alpha1.KustomizeImages{"baz=foo:bar"}, expected: ` -images: -- name: baz - newName: foo - newTag: bar -`}, - {name: "digest", images: v1alpha1.KustomizeImages{"baz=foo@sha12345"}, expected: ` -images: -- name: baz - newName: foo - digest: sha12345 -`}, - {name: "digest simple", images: v1alpha1.KustomizeImages{"foo@sha12345"}, expected: ` -images: -- name: foo - digest: sha12345 -`}, - {name: "all", images: v1alpha1.KustomizeImages{ - "foo", - "foo=bar", // merges with above - "baz@sha12345", - "bar:123", - "foo=bar:123", // merges and overwrites the first two - }, expected: ` -images: -- name: foo - newName: bar - newTag: "123" -- name: baz - digest: sha12345 -- name: bar - newTag: "123" -`}, - } { - t.Run(tt.name, func(t *testing.T) { - filter, err := imagesFilter(tt.images) - assert.NoError(t, err) - - node := kyaml.NewRNode(&kyaml.Node{Kind: kyaml.DocumentNode, Content: []*kyaml.Node{ - kyaml.NewMapRNode(nil).YNode(), - }}) - node, err = filter.Filter(node) - assert.NoError(t, err) - assert.YAMLEq(t, tt.expected, node.MustString()) - }) - } -} - -func Test_updateKustomizeFile(t *testing.T) { - makeTmpKustomization := func(t *testing.T, content []byte) string { - f, err := os.CreateTemp("", "kustomization-*.yaml") - if err != nil { - t.Fatal(err) - } - _, err = f.Write(content) - if err != nil { - t.Fatal(err) - } - f.Close() - t.Cleanup(func() { - os.Remove(f.Name()) - }) - return f.Name() - } - - filter, err := imagesFilter(v1alpha1.KustomizeImages{"foo@sha23456"}) - if err != nil { - t.Fatal(err) - } - - tests := []struct { - name string - content string - wantContent string - filter kyaml.Filter - wantErr bool - }{ - { - name: "sorted", - content: `images: -- digest: sha12345 - name: foo -`, - wantContent: `images: -- digest: sha23456 - name: foo -`, - filter: filter, - }, - { - name: "not-sorted", - content: `images: -- name: foo - digest: sha12345 -`, - wantContent: `images: -- name: foo - digest: sha23456 -`, - filter: filter, - }, - { - name: "indented", - content: `images: - - name: foo - digest: sha12345 -`, - wantContent: `images: - - name: foo - digest: sha23456 -`, - filter: filter, - }, - { - name: "no-change", - content: `images: -- name: foo - digest: sha23456 -`, - wantContent: "", - filter: filter, - }, - { - name: "invalid-path", - content: `images: -- name: foo - digest: sha12345 -`, - wantContent: "", - filter: filter, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var path string - if tt.wantErr { - path = "/invalid-path" - } else { - path = makeTmpKustomization(t, []byte(tt.content)) - } - - err, skip := updateKustomizeFile(tt.filter, path) - if tt.wantErr { - assert.Error(t, err) - assert.False(t, skip) - } else if tt.name == "no-change" { - assert.Nil(t, err) 
- assert.True(t, skip) - } else { - got, err := os.ReadFile(path) - if err != nil { - t.Fatal(err) - } - assert.Equal(t, tt.wantContent, string(got)) - assert.False(t, skip) - } - }) - } +// Test that grouping does not mix different branches in one commit/push +func Test_groupIntentsByBranch(t *testing.T) { + appA := &v1alpha1.Application{ObjectMeta: v1.ObjectMeta{Annotations: map[string]string{common.GitBranchAnnotation: "main:appA-branch"}}} + appB := &v1alpha1.Application{ObjectMeta: v1.ObjectMeta{Annotations: map[string]string{common.GitBranchAnnotation: "main:appB-branch"}}} + wbcA := &WriteBackConfig{GitRepo: "https://example/repo.git", GitWriteBranch: "appA-branch"} + wbcB := &WriteBackConfig{GitRepo: "https://example/repo.git", GitWriteBranch: "appB-branch"} + + by := groupIntentsByBranch([]writeIntent{ + {app: appA, wbc: wbcA, changeList: []ChangeEntry{{}}, writeFn: writeOverrides}, + {app: appB, wbc: wbcB, changeList: []ChangeEntry{{}}, writeFn: writeOverrides}, + {app: appA, wbc: wbcA, changeList: []ChangeEntry{{}}, writeFn: writeOverrides}, + }) + + if len(by["appA-branch"]) != 2 { + t.Fatalf("expected 2 intents for appA-branch, got %d", len(by["appA-branch"])) + } + if len(by["appB-branch"]) != 1 { + t.Fatalf("expected 1 intent for appB-branch, got %d", len(by["appB-branch"])) + } } -func Test_getApplicationSource(t *testing.T) { - t.Run("multi-source without git repo annotation", func(t *testing.T) { - app := &v1alpha1.Application{ - ObjectMeta: v1.ObjectMeta{ - Name: "test-app", - }, - Spec: v1alpha1.ApplicationSpec{ - Sources: v1alpha1.ApplicationSources{ - { - RepoURL: "https://charts.bitnami.com/bitnami", - TargetRevision: "18.2.3", - Chart: "nginx", - Helm: &v1alpha1.ApplicationSourceHelm{}, - }, - { - RepoURL: "https://github.com/chengfang/image-updater-examples.git", - TargetRevision: "main", - }, - }, - }, - } - - source := getApplicationSource(app) - assert.Equal(t, "18.2.3", source.TargetRevision) - assert.Equal(t, "https://charts.bitnami.com/bitnami", source.RepoURL) - }) - - t.Run("single source application", func(t *testing.T) { - app := &v1alpha1.Application{ - ObjectMeta: v1.ObjectMeta{ - Name: "test-app", - }, - Spec: v1alpha1.ApplicationSpec{ - Source: &v1alpha1.ApplicationSource{ - RepoURL: "https://github.com/example/repo.git", - TargetRevision: "main", - }, - }, - } - - source := getApplicationSource(app) - assert.Equal(t, "main", source.TargetRevision) - assert.Equal(t, "https://github.com/example/repo.git", source.RepoURL) - }) -} - -func Test_getWriteBackBranch(t *testing.T) { - t.Run("nil application", func(t *testing.T) { - branch := getWriteBackBranch(nil) - assert.Equal(t, "", branch) - }) - - t.Run("matching git-repository annotation", func(t *testing.T) { - app := &v1alpha1.Application{ - ObjectMeta: v1.ObjectMeta{ - Name: "test-app", - Annotations: map[string]string{ - "argocd-image-updater.argoproj.io/git-repository": "https://github.com/chengfang/image-updater-examples.git", - }, - }, - Spec: v1alpha1.ApplicationSpec{ - Sources: v1alpha1.ApplicationSources{ - { - RepoURL: "https://charts.bitnami.com/bitnami", - TargetRevision: "18.2.3", - Chart: "nginx", - }, - { - RepoURL: "https://github.com/chengfang/image-updater-examples.git", - TargetRevision: "main", - }, - }, - }, - } - - branch := getWriteBackBranch(app) - assert.Equal(t, "main", branch) - }) - - t.Run("fallback to primary source when no match", func(t *testing.T) { - app := &v1alpha1.Application{ - ObjectMeta: v1.ObjectMeta{ - Name: "test-app", - }, - Spec: v1alpha1.ApplicationSpec{ - 
Sources: v1alpha1.ApplicationSources{ - { - RepoURL: "https://charts.bitnami.com/bitnami", - TargetRevision: "18.2.3", - Chart: "nginx", - Helm: &v1alpha1.ApplicationSourceHelm{}, - }, - { - RepoURL: "https://github.com/chengfang/image-updater-examples.git", - TargetRevision: "main", - }, - }, - }, - } - - branch := getWriteBackBranch(app) - assert.Equal(t, "18.2.3", branch) - }) - - t.Run("git-repository annotation with non-matching URL", func(t *testing.T) { - app := &v1alpha1.Application{ - ObjectMeta: v1.ObjectMeta{ - Name: "test-app", - Annotations: map[string]string{ - "argocd-image-updater.argoproj.io/git-repository": "https://github.com/different/repo.git", - }, - }, - Spec: v1alpha1.ApplicationSpec{ - Sources: v1alpha1.ApplicationSources{ - { - RepoURL: "https://charts.bitnami.com/bitnami", - TargetRevision: "18.2.3", - Chart: "nginx", - Helm: &v1alpha1.ApplicationSourceHelm{}, - }, - }, - }, - } - - branch := getWriteBackBranch(app) - assert.Equal(t, "18.2.3", branch) - }) +type fakeGitClient struct{ pushes int32 } +func (f *fakeGitClient) Root() string { return "/tmp" } +func (f *fakeGitClient) Init() error { return nil } +func (f *fakeGitClient) Fetch(revision string) error { return nil } +func (f *fakeGitClient) ShallowFetch(revision string, depth int) error { return nil } +func (f *fakeGitClient) Submodule() error { return nil } +func (f *fakeGitClient) Checkout(revision string, submoduleEnabled bool) error { return nil } +func (f *fakeGitClient) LsRefs() (*extgit.Refs, error) { return &extgit.Refs{}, nil } +func (f *fakeGitClient) LsRemote(revision string) (string, error) { return "", nil } +func (f *fakeGitClient) LsFiles(path string, enableNewGitFileGlobbing bool) ([]string, error) { return nil, nil } +func (f *fakeGitClient) LsLargeFiles() ([]string, error) { return nil, nil } +func (f *fakeGitClient) CommitSHA() (string, error) { return "", nil } +func (f *fakeGitClient) RevisionMetadata(revision string) (*extgit.RevisionMetadata, error) { return nil, nil } +func (f *fakeGitClient) VerifyCommitSignature(string) (string, error) { return "", nil } +func (f *fakeGitClient) IsAnnotatedTag(string) bool { return false } +func (f *fakeGitClient) ChangedFiles(revision string, targetRevision string) ([]string, error) { return nil, nil } +func (f *fakeGitClient) Commit(path string, opts *extgit.CommitOptions) error { return nil } +func (f *fakeGitClient) Branch(from, to string) error { return nil } +func (f *fakeGitClient) Push(remote, branch string, force bool) error { atomic.AddInt32(&f.pushes, 1); return nil } +func (f *fakeGitClient) Add(path string) error { return nil } +func (f *fakeGitClient) SymRefToBranch(symRef string) (string, error) { return "main", nil } +func (f *fakeGitClient) Config(username string, email string) error { return nil } + +func Test_repoWriter_BatchesPerBranch(t *testing.T) { + // stub git client factory + old := newGitClient + defer func(){ newGitClient = old }() + fg := &fakeGitClient{} + newGitClient = func(rawRepoURL string, root string, creds extgit.Creds, insecure bool, enableLfs bool, proxy string, opts ...extgit.ClientOpts) (extgit.Client, error) { + return fg, nil + } + + appMain := &v1alpha1.Application{} + appDev := &v1alpha1.Application{ObjectMeta: appMain.ObjectMeta} + wbcMain := &WriteBackConfig{GitRepo: "https://example/repo.git", GitWriteBranch: "main", Method: WriteBackGit, GetCreds: func(a *v1alpha1.Application) (extgit.Creds, error) { return extgit.NopCreds{}, nil }} + wbcDev := &WriteBackConfig{GitRepo: "https://example/repo.git", 
GitWriteBranch: "dev", Method: WriteBackGit, GetCreds: func(a *v1alpha1.Application) (extgit.Creds, error) { return extgit.NopCreds{}, nil }} + + rw := &repoWriter{repoURL: wbcMain.GitRepo, intentsCh: make(chan writeIntent, 10), flushEvery: 0, maxBatch: 100, stopCh: make(chan struct{})} + // Directly call flushBatch to avoid goroutine timing + rw.flushBatch([]writeIntent{ + {app: appMain, wbc: wbcMain, changeList: []ChangeEntry{{}}, writeFn: func(a *v1alpha1.Application, w *WriteBackConfig, c extgit.Client) (error, bool) { return nil, false }}, + {app: appDev, wbc: wbcDev, changeList: []ChangeEntry{{}}, writeFn: func(a *v1alpha1.Application, w *WriteBackConfig, c extgit.Client) (error, bool) { return nil, false }}, + }) + + if fg.pushes != 2 { + t.Fatalf("expected 2 pushes (one per branch), got %d", fg.pushes) + } } diff --git a/pkg/argocd/update.go b/pkg/argocd/update.go index a095821f..6c37290b 100644 --- a/pkg/argocd/update.go +++ b/pkg/argocd/update.go @@ -23,6 +23,7 @@ import ( "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/registry" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/tag" + "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/env" "github.com/argoproj/argo-cd/v2/pkg/apiclient/application" "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" @@ -122,15 +123,65 @@ type ChangeEntry struct { type SyncIterationState struct { lock sync.Mutex repositoryLocks map[string]*sync.Mutex + lastSuccess map[string]time.Time + lastAttempt map[string]time.Time + failCount map[string]int } // NewSyncIterationState returns a new instance of SyncIterationState func NewSyncIterationState() *SyncIterationState { return &SyncIterationState{ - repositoryLocks: make(map[string]*sync.Mutex), + repositoryLocks: make(map[string]*sync.Mutex), + lastSuccess: make(map[string]time.Time), + lastAttempt: make(map[string]time.Time), + failCount: make(map[string]int), } } +// AppStats is a snapshot of per-application scheduling stats +type AppStats struct { + LastSuccess time.Time + LastAttempt time.Time + FailCount int +} + +// RecordAttempt notes that an attempt was made to process the application +func (state *SyncIterationState) RecordAttempt(app string) { + state.lock.Lock() + state.lastAttempt[app] = time.Now() + state.lock.Unlock() +} + +// RecordResult records the outcome for an application +func (state *SyncIterationState) RecordResult(app string, hadErrors bool) { + state.lock.Lock() + defer state.lock.Unlock() + if hadErrors { + state.failCount[app] = state.failCount[app] + 1 + } else { + state.failCount[app] = 0 + state.lastSuccess[app] = time.Now() + } +} + +// GetStats returns a copy of current stats for scheduling decisions +func (state *SyncIterationState) GetStats() map[string]AppStats { + state.lock.Lock() + defer state.lock.Unlock() + out := make(map[string]AppStats, len(state.lastAttempt)) + // Union of keys + for k := range state.lastAttempt { out[k] = AppStats{} } + for k := range state.lastSuccess { out[k] = AppStats{} } + for k := range state.failCount { out[k] = AppStats{} } + for k, v := range out { + v.LastAttempt = state.lastAttempt[k] + v.LastSuccess = state.lastSuccess[k] + v.FailCount = state.failCount[k] + out[k] = v + } + return out +} + // GetRepositoryLock returns the lock for a specified repository func (state *SyncIterationState) GetRepositoryLock(repository string) *sync.Mutex { state.lock.Lock() @@ -168,7 +219,7 @@ func 
UpdateApplication(updateConf *UpdateConfiguration, state *SyncIterationStat result.NumApplicationsProcessed += 1 - // Loop through all images of current application, and check whether one of + // Loop through all images of current application, and check whether one of // its images is eligible for updating. // // Whether an image qualifies for update is dependent on semantic version @@ -249,7 +300,7 @@ func UpdateApplication(updateConf *UpdateConfiguration, state *SyncIterationStat imgCtx.Warnf("Could not fetch credentials: %v", err) result.NumErrors += 1 continue - } + } } regClient, err := updateConf.NewRegFN(rep, creds.Username, creds.Password) @@ -356,18 +407,32 @@ func UpdateApplication(updateConf *UpdateConfiguration, state *SyncIterationStat wbc.GitCommitSignOff = updateConf.GitCommitSignOff } - if needUpdate { + if needUpdate { logCtx := log.WithContext().AddField("application", app) - log.Debugf("Using commit message: %s", wbc.GitCommitMessage) - if !updateConf.DryRun { - logCtx.Infof("Committing %d parameter update(s) for application %s", result.NumImagesUpdated, app) - err := commitChangesLocked(&updateConf.UpdateApp.Application, wbc, state, changeList) + log.Debugf("Using commit message: %s", wbc.GitCommitMessage) + if !updateConf.DryRun { + // If write-back is Git, we can batch; otherwise commit directly via Argo CD API + var err error + if wbc.Method != WriteBackGit || env.GetBoolVal("GIT_BATCH_DISABLE", false) { + err = commitChangesLocked(&updateConf.UpdateApp.Application, wbc, state, changeList) + } else { + logCtx.Infof("Queuing %d parameter update(s) for application %s (git write pending)", result.NumImagesUpdated, app) + // Ensure Git credentials resolver is set for batched writer + if wbc.GetCreds == nil { + if perr := parseGitConfig(&updateConf.UpdateApp.Application, updateConf.KubeClient, wbc, "repocreds"); perr != nil { + logCtx.Warnf("could not prepare git credentials for batched write: %v", perr) + } + } + wi := writeIntent{app: &updateConf.UpdateApp.Application, wbc: wbc, changeList: changeList, writeFn: writeOverrides} + if wbc.KustomizeBase != "" { wi.writeFn = writeKustomization } + enqueueWriteIntent(wi) + } if err != nil { logCtx.Errorf("Could not update application spec: %v", err) result.NumErrors += 1 result.NumImagesUpdated = 0 - } else { - logCtx.Infof("Successfully updated the live application spec") + } else { + logCtx.Infof("Application spec updated in-memory; write-back scheduled") if !updateConf.DisableKubeEvents && updateConf.KubeClient != nil { annotations := map[string]string{} for i, c := range changeList { diff --git a/pkg/argocd/update_test.go b/pkg/argocd/update_test.go index 6a115fb9..60fab637 100644 --- a/pkg/argocd/update_test.go +++ b/pkg/argocd/update_test.go @@ -43,8 +43,8 @@ func Test_UpdateApplication(t *testing.T) { return ®Mock, nil } - argoClient := argomock.ArgoCD{} - argoClient.On("UpdateSpec", mock.Anything, mock.Anything).Return(nil, nil) + argoClient := argomock.ArgoCD{} + argoClient.On("UpdateSpec", mock.Anything, mock.Anything).Return(nil, nil) kubeClient := kube.ImageUpdaterKubernetesClient{ KubeClient: ®istryKube.KubernetesClient{ @@ -83,15 +83,15 @@ func Test_UpdateApplication(t *testing.T) { }, Images: *parseImageList(annotations), } - res := UpdateApplication(&UpdateConfiguration{ + res := UpdateApplication(&UpdateConfiguration{ NewRegFN: mockClientFn, ArgoClient: &argoClient, KubeClient: &kubeClient, UpdateApp: appImages, DryRun: false, }, NewSyncIterationState()) - assert.Equal(t, 
v1alpha1.KustomizeImage("gcr.io/jannfis/foobar:1.0.3"), appImages.Application.Spec.Source.Kustomize.Images[0]) - assert.Equal(t, v1alpha1.KustomizeImage("gcr.io/jannfis/barbar:1.0.3"), appImages.Application.Spec.Source.Kustomize.Images[1]) + assert.Equal(t, v1alpha1.KustomizeImage("gcr.io/jannfis/foobar:1.0.3"), appImages.Application.Spec.Source.Kustomize.Images[0]) + assert.Equal(t, v1alpha1.KustomizeImage("gcr.io/jannfis/barbar:1.0.3"), appImages.Application.Spec.Source.Kustomize.Images[1]) assert.Equal(t, 0, res.NumErrors) assert.Equal(t, 0, res.NumSkipped) assert.Equal(t, 1, res.NumApplicationsProcessed) @@ -154,6 +154,9 @@ func Test_UpdateApplication(t *testing.T) { }, Images: *parseImageList(annotations), } + // Disable batching to preserve legacy immediate write-back semantics for this test + _ = os.Setenv("GIT_BATCH_DISABLE", "true") + defer os.Unsetenv("GIT_BATCH_DISABLE") res := UpdateApplication(&UpdateConfiguration{ NewRegFN: mockClientFn, ArgoClient: &argoClient, diff --git a/pkg/health/health.go b/pkg/health/health.go index cbc4977c..c7c0ac58 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -3,10 +3,13 @@ package health // Most simple health check probe to see whether our server is still alive import ( - "fmt" - "net/http" + "fmt" + "net/http" + "os" + "strings" - "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" + "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" + "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/registry" ) func StartHealthServer(port int) chan error { @@ -20,6 +23,23 @@ func StartHealthServer(port int) chan error { } func HealthProbe(w http.ResponseWriter, r *http.Request) { - log.Tracef("/healthz ping request received, replying with pong") - fmt.Fprintf(w, "OK\n") + log.Tracef("/healthz ping request received, replying with pong") + // optional fail-open behavior on detected port exhaustion to trigger pod restart + if shouldFailOnPortExhaustion() && registry.IsPortExhaustionDegraded() { + w.WriteHeader(http.StatusServiceUnavailable) + if _, err := w.Write([]byte("PORT-EXHAUSTION")); err != nil { + log.Errorf("/healthz write failed: %v", err) + } + return + } + fmt.Fprintf(w, "OK\n") +} + +func shouldFailOnPortExhaustion() bool { + v := strings.ToLower(os.Getenv("HEALTH_FAIL_ON_PORT_EXHAUSTION")) + if v == "" { + // Default: enabled (fail liveness on sustained port exhaustion) + return true + } + return v == "1" || v == "true" || v == "yes" } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index be47bfb2..6f7fe0e1 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -1,18 +1,20 @@ package metrics import ( - "fmt" - "net/http" + "fmt" + "net/http" + "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/promhttp" ) type Metrics struct { Endpoint *EndpointMetrics Applications *ApplicationMetrics Clients *ClientMetrics + Singleflight *SingleflightMetrics } var defaultMetrics *Metrics @@ -21,6 +23,15 @@ var defaultMetrics *Metrics type EndpointMetrics struct { requestsTotal *prometheus.CounterVec requestsFailed *prometheus.CounterVec + inFlight *prometheus.GaugeVec + requestDur *prometheus.HistogramVec + httpStatus *prometheus.CounterVec + errorKind *prometheus.CounterVec + 
retriesTotal *prometheus.CounterVec + jwtAuthRequests *prometheus.CounterVec + jwtAuthErrors *prometheus.CounterVec + jwtAuthDur *prometheus.HistogramVec + jwtTokenTTL *prometheus.HistogramVec } // ApplicationMetrics stores metrics for applications @@ -29,6 +40,20 @@ type ApplicationMetrics struct { imagesWatchedTotal *prometheus.GaugeVec imagesUpdatedTotal *prometheus.CounterVec imagesUpdatedErrorsTotal *prometheus.CounterVec + appUpdateDuration *prometheus.HistogramVec + lastAttemptTs *prometheus.GaugeVec + lastSuccessTs *prometheus.GaugeVec + cycleDuration prometheus.Histogram + cycleLastEndTs prometheus.Gauge + imagesConsideredTotal *prometheus.CounterVec + imagesSkippedTotal *prometheus.CounterVec + schedulerSkippedTotal *prometheus.CounterVec +} + +// SingleflightMetrics captures dedup effectiveness +type SingleflightMetrics struct { + leadersTotal *prometheus.CounterVec + followersTotal *prometheus.CounterVec } // ClientMetrics stores metrics for K8s and ArgoCD clients @@ -63,6 +88,54 @@ func NewEndpointMetrics() *EndpointMetrics { Help: "The number of failed requests to this endpoint", }, []string{"registry"}) + metrics.inFlight = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "argocd_image_updater_registry_in_flight_requests", + Help: "Current number of in-flight registry requests", + }, []string{"registry"}) + + metrics.requestDur = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "argocd_image_updater_registry_request_duration_seconds", + Help: "Registry request duration", + Buckets: prometheus.DefBuckets, + }, []string{"registry"}) + + metrics.httpStatus = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_registry_http_status_total", + Help: "HTTP status codes returned by registry", + }, []string{"registry", "code"}) + + metrics.errorKind = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_registry_errors_total", + Help: "Categorized registry request errors", + }, []string{"registry", "kind"}) + + metrics.retriesTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_registry_request_retries_total", + Help: "Number of retries performed for registry operations", + }, []string{"registry", "op"}) + + metrics.jwtAuthRequests = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_registry_jwt_auth_requests_total", + Help: "Number of JWT auth requests", + }, []string{"registry", "service", "scope"}) + + metrics.jwtAuthErrors = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_registry_jwt_auth_errors_total", + Help: "JWT auth errors by reason", + }, []string{"registry", "service", "scope", "reason"}) + + metrics.jwtAuthDur = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "argocd_image_updater_registry_jwt_auth_duration_seconds", + Help: "JWT auth request duration", + Buckets: prometheus.DefBuckets, + }, []string{"registry", "service", "scope"}) + + metrics.jwtTokenTTL = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "argocd_image_updater_registry_jwt_token_ttl_seconds", + Help: "JWT token TTL as reported by registry", + Buckets: prometheus.DefBuckets, + }, []string{"registry", "service", "scope"}) + return metrics } @@ -90,6 +163,48 @@ func NewApplicationsMetrics() *ApplicationMetrics { Help: "Number of errors reported by Argo CD Image Updater", }, []string{"application"}) + metrics.appUpdateDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: 
"argocd_image_updater_application_update_duration_seconds", + Help: "Time to process a single application", + Buckets: prometheus.DefBuckets, + }, []string{"application"}) + + metrics.lastAttemptTs = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "argocd_image_updater_application_last_attempt_timestamp", + Help: "Unix timestamp of the last attempt for an application", + }, []string{"application"}) + + metrics.lastSuccessTs = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "argocd_image_updater_application_last_success_timestamp", + Help: "Unix timestamp of the last successful attempt for an application", + }, []string{"application"}) + + metrics.cycleDuration = promauto.NewHistogram(prometheus.HistogramOpts{ + Name: "argocd_image_updater_update_cycle_duration_seconds", + Help: "Time to complete a full update cycle across applications", + Buckets: prometheus.DefBuckets, + }) + + metrics.cycleLastEndTs = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "argocd_image_updater_update_cycle_last_end_timestamp", + Help: "Unix timestamp of the end of the most recent update cycle", + }) + + metrics.imagesConsideredTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_images_considered_total", + Help: "Images considered per application", + }, []string{"application"}) + + metrics.imagesSkippedTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_images_skipped_total", + Help: "Images skipped per application", + }, []string{"application"}) + + metrics.schedulerSkippedTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_scheduler_skipped_total", + Help: "Applications skipped by scheduler and reason", + }, []string{"reason"}) + return metrics } @@ -120,11 +235,26 @@ func NewClientMetrics() *ClientMetrics { return metrics } +// NewSingleflightMetrics returns new singleflight metrics +func NewSingleflightMetrics() *SingleflightMetrics { + m := &SingleflightMetrics{} + m.leadersTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_singleflight_leaders_total", + Help: "Number of leader executions per kind", + }, []string{"kind"}) + m.followersTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "argocd_image_updater_singleflight_followers_total", + Help: "Number of follower coalesced calls per kind", + }, []string{"kind"}) + return m +} + func NewMetrics() *Metrics { return &Metrics{ Endpoint: NewEndpointMetrics(), Applications: NewApplicationsMetrics(), - Clients: NewClientMetrics(), + Clients: NewClientMetrics(), + Singleflight: NewSingleflightMetrics(), } } @@ -152,6 +282,14 @@ func Clients() *ClientMetrics { return defaultMetrics.Clients } +// Singleflight returns singleflight metrics +func Singleflight() *SingleflightMetrics { + if defaultMetrics == nil { + return nil + } + return defaultMetrics.Singleflight +} + // IncreaseRequest increases the request counter of EndpointMetrics object func (epm *EndpointMetrics) IncreaseRequest(registryURL string, isFailed bool) { epm.requestsTotal.WithLabelValues(registryURL).Inc() @@ -160,6 +298,56 @@ func (epm *EndpointMetrics) IncreaseRequest(registryURL string, isFailed bool) { } } +// IncInFlight increments in-flight gauge for a registry +func (epm *EndpointMetrics) IncInFlight(registryURL string) { + epm.inFlight.WithLabelValues(registryURL).Inc() +} + +// DecInFlight decrements in-flight gauge for a registry +func (epm *EndpointMetrics) DecInFlight(registryURL string) { + epm.inFlight.WithLabelValues(registryURL).Dec() 
+} + +// ObserveRequestDuration observes a request duration for a registry +func (epm *EndpointMetrics) ObserveRequestDuration(registryURL string, d time.Duration) { + epm.requestDur.WithLabelValues(registryURL).Observe(d.Seconds()) +} + +// ObserveHTTPStatus increments per-status counters +func (epm *EndpointMetrics) ObserveHTTPStatus(registryURL string, code int) { + epm.httpStatus.WithLabelValues(registryURL, fmt.Sprintf("%d", code)).Inc() +} + +// IncreaseRetry increases retry counter for an operation +func (epm *EndpointMetrics) IncreaseRetry(registryURL, op string) { + epm.retriesTotal.WithLabelValues(registryURL, op).Inc() +} + +// IncreaseErrorKind categorizes and counts errors +func (epm *EndpointMetrics) IncreaseErrorKind(registryURL, kind string) { + epm.errorKind.WithLabelValues(registryURL, kind).Inc() +} + +// IncreaseJWTAuthRequest increments JWT auth request counter +func (epm *EndpointMetrics) IncreaseJWTAuthRequest(registryURL, service, scope string) { + epm.jwtAuthRequests.WithLabelValues(registryURL, service, scope).Inc() +} + +// IncreaseJWTAuthError increments JWT auth error counter with a reason +func (epm *EndpointMetrics) IncreaseJWTAuthError(registryURL, service, scope, reason string) { + epm.jwtAuthErrors.WithLabelValues(registryURL, service, scope, reason).Inc() +} + +// ObserveJWTAuthDuration records JWT auth request duration +func (epm *EndpointMetrics) ObserveJWTAuthDuration(registryURL, service, scope string, d time.Duration) { + epm.jwtAuthDur.WithLabelValues(registryURL, service, scope).Observe(d.Seconds()) +} + +// ObserveJWTTokenTTL records reported JWT token TTL in seconds +func (epm *EndpointMetrics) ObserveJWTTokenTTL(registryURL, service, scope string, ttlSeconds float64) { + epm.jwtTokenTTL.WithLabelValues(registryURL, service, scope).Observe(ttlSeconds) +} + // SetNumberOfApplications sets the total number of currently watched applications func (apm *ApplicationMetrics) SetNumberOfApplications(num int) { apm.applicationsTotal.Set(float64(num)) @@ -180,6 +368,55 @@ func (apm *ApplicationMetrics) IncreaseUpdateErrors(application string, by int) apm.imagesUpdatedErrorsTotal.WithLabelValues(application).Add(float64(by)) } +// ObserveAppUpdateDuration observes duration for processing an application +func (apm *ApplicationMetrics) ObserveAppUpdateDuration(application string, d time.Duration) { + apm.appUpdateDuration.WithLabelValues(application).Observe(d.Seconds()) +} + +// SetLastAttempt records the last attempt timestamp for an application +func (apm *ApplicationMetrics) SetLastAttempt(application string, ts time.Time) { + apm.lastAttemptTs.WithLabelValues(application).Set(float64(ts.Unix())) +} + +// SetLastSuccess records the last success timestamp for an application +func (apm *ApplicationMetrics) SetLastSuccess(application string, ts time.Time) { + apm.lastSuccessTs.WithLabelValues(application).Set(float64(ts.Unix())) +} + +// ObserveCycleDuration observes the duration of a full update cycle +func (apm *ApplicationMetrics) ObserveCycleDuration(d time.Duration) { + apm.cycleDuration.Observe(d.Seconds()) +} + +// SetCycleLastEnd sets the timestamp of the end of the most recent cycle +func (apm *ApplicationMetrics) SetCycleLastEnd(ts time.Time) { + apm.cycleLastEndTs.Set(float64(ts.Unix())) +} + +// IncreaseImagesConsidered increases considered counter per app +func (apm *ApplicationMetrics) IncreaseImagesConsidered(application string, by int) { + apm.imagesConsideredTotal.WithLabelValues(application).Add(float64(by)) +} + +// IncreaseImagesSkipped 
increases skipped counter per app +func (apm *ApplicationMetrics) IncreaseImagesSkipped(application string, by int) { + apm.imagesSkippedTotal.WithLabelValues(application).Add(float64(by)) +} + +// SchedulerSkipped increments skip reasons +func (apm *ApplicationMetrics) SchedulerSkipped(reason string, by int) { + apm.schedulerSkippedTotal.WithLabelValues(reason).Add(float64(by)) +} + +// Singleflight helpers +func (sfm *SingleflightMetrics) IncreaseLeaders(kind string) { + sfm.leadersTotal.WithLabelValues(kind).Inc() +} + +func (sfm *SingleflightMetrics) IncreaseFollowers(kind string) { + sfm.followersTotal.WithLabelValues(kind).Inc() +} + // IncreaseArgoCDClientRequest increases the number of Argo CD API requests for given server func (cpm *ClientMetrics) IncreaseArgoCDClientRequest(server string, by int) { cpm.argoCDRequestsTotal.WithLabelValues(server).Add(float64(by)) diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 05af83f1..86fed3b7 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -1,10 +1,16 @@ package metrics import ( - "testing" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/stretchr/testify/assert" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/stretchr/testify/assert" ) func TestMetricsInitialization(t *testing.T) { @@ -68,3 +74,55 @@ func TestMetricsOperations(t *testing.T) { apm.SetNumberOfApplications(3) apm.SetNumberOfImagesWatched("app1", 4) } + +func TestMetricWrappers_NoPanic(t *testing.T) { + prometheus.DefaultRegisterer = prometheus.NewRegistry() + InitMetrics() + + epm := Endpoint() + epm.IncInFlight("reg") + epm.DecInFlight("reg") + epm.ObserveRequestDuration("reg", 5*time.Millisecond) + epm.ObserveHTTPStatus("reg", 200) + epm.IncreaseRetry("reg", "tags") + epm.IncreaseErrorKind("reg", "timeout") + + apm := Applications() + apm.ObserveAppUpdateDuration("app", 7*time.Millisecond) + apm.SetLastAttempt("app", time.Now()) + apm.SetLastSuccess("app", time.Now()) + apm.ObserveCycleDuration(15 * time.Millisecond) + apm.SetCycleLastEnd(time.Now()) + apm.IncreaseImagesConsidered("app", 2) + apm.IncreaseImagesSkipped("app", 1) + apm.SchedulerSkipped("cooldown", 1) + + sf := Singleflight() + sf.IncreaseLeaders("tags") + sf.IncreaseFollowers("tags") +} + +func TestMetricsEndpoint_Serves(t *testing.T) { + reg := prometheus.NewRegistry() + prometheus.DefaultRegisterer = reg + InitMetrics() + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + srv := httptest.NewServer(mux) + defer srv.Close() + + resp, err := http.Get(srv.URL + "/metrics") + if err != nil { + t.Fatalf("GET /metrics failed: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("unexpected status: %d", resp.StatusCode) + } + body, _ := io.ReadAll(resp.Body) + // Check a couple of metric names are present + b := string(body) + if !strings.Contains(b, "argocd_image_updater_applications_watched_total") { + t.Fatalf("expected applications_watched_total metric in scrape") + } +} diff --git a/pkg/version/version.go b/pkg/version/version.go index 4bcdccdb..523ef009 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -6,7 +6,7 @@ import ( ) var ( - version = "9.9.99" + version = "100.0.8" buildDate = "1970-01-01T00:00:00Z" gitCommit = "unknown" binaryName = "argocd-image-updater" diff --git 
a/pkg/webhook/server_test.go b/pkg/webhook/server_test.go index 2d75075a..a3b052c4 100644 --- a/pkg/webhook/server_test.go +++ b/pkg/webhook/server_test.go @@ -6,6 +6,10 @@ import ( "io" "net/http" "net/http/httptest" + "net/url" + "path" + "strings" + "net" "sync" "testing" "time" @@ -98,12 +102,12 @@ func createMockServer(t *testing.T, port int) *WebhookServer { } // Helper function to wait till server is started -func waitForServerToStart(url string, timeout time.Duration) error { +func waitForServerToStart(u string, timeout time.Duration) error { client := &http.Client{Timeout: 1 * time.Second} duration := time.Now().Add(timeout) for time.Now().Before(duration) { - resp, err := client.Get(url) + resp, err := client.Get(cleanURL(u)) if err == nil { resp.Body.Close() return nil @@ -114,18 +118,36 @@ func waitForServerToStart(url string, timeout time.Duration) error { } // Helper function to test connectivity of an endpoint -func testEndpointConnectivity(t *testing.T, url string, expectedStatus int) { +func testEndpointConnectivity(t *testing.T, u string, expectedStatus int) { client := http.Client{Timeout: 5 * time.Second} - res, err := client.Get(url) - if res != nil { - assert.Equal(t, res.StatusCode, expectedStatus, "Did not receive the expected status of %d got: %d", expectedStatus, res.StatusCode) + res, err := client.Get(cleanURL(u)) + if res != nil { + assert.Equal(t, expectedStatus, res.StatusCode, "Did not receive the expected status of %d got: %d", expectedStatus, res.StatusCode) defer res.Body.Close() } assert.NotNil(t, res, "No body received so server is not alive") assert.NoError(t, err) } +// cleanURL normalizes path to avoid double-slashes like "//webhook" +func cleanURL(raw string) string { + pu, err := url.Parse(raw) + if err != nil { return raw } + pu.Path = path.Clean(pu.Path) + // Ensure we keep trailing slash semantics only for root + if pu.Path == "/" && !strings.HasSuffix(raw, "/") { pu.Path = "" } + return pu.String() +} + +// freePort asks the OS for an available TCP port for localhost and returns it +func freePort() int { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { return 8080 } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port +} + // TestNewWebhookServer ensures that WebhookServer struct is inited properly func TestNewWebhookServer(t *testing.T) { handler := NewWebhookHandler() @@ -150,7 +172,8 @@ func TestNewWebhookServer(t *testing.T) { // TestWebhookServerStart ensures that the server is created with the correct endpoints func TestWebhookServerStart(t *testing.T) { - server := createMockServer(t, 8080) + port := freePort() + server := createMockServer(t, port) go func() { err := server.Start() if err != http.ErrServerClosed { @@ -158,29 +181,30 @@ func TestWebhookServerStart(t *testing.T) { } }() - address := fmt.Sprintf("http://localhost:%d/", server.Port) + address := fmt.Sprintf("http://localhost:%d/", server.Port) err := waitForServerToStart(address+"webhook", 5*time.Second) assert.NoError(t, err, "Server failed to start") defer server.Server.Close() - testEndpointConnectivity(t, address+"webhook", http.StatusBadRequest) + testEndpointConnectivity(t, address+"webhook", http.StatusBadRequest) testEndpointConnectivity(t, address+"healthz", http.StatusOK) } // TestWebhookServerStop ensures that the server is stopped properly func TestWebhookServerStop(t *testing.T) { - server := createMockServer(t, 8080) + port := freePort() + server := createMockServer(t, port) errorChannel := make(chan error) go func() { err := server.Start() 
errorChannel <- err }() - address := fmt.Sprintf("http://localhost:%d/", server.Port) + address := fmt.Sprintf("http://localhost:%d/", server.Port) err := waitForServerToStart(address, 5*time.Second) assert.NoError(t, err, "Server failed to start") - testEndpointConnectivity(t, address+"webhook", http.StatusBadRequest) + testEndpointConnectivity(t, address+"webhook", http.StatusBadRequest) err = server.Stop() select { @@ -191,9 +215,9 @@ func TestWebhookServerStop(t *testing.T) { } assert.NoError(t, err) - client := http.Client{Timeout: 5 * time.Second} - _, err = client.Get("http://localhost:8080/webhook") - assert.NotNil(t, err, "Connecting to endpoint did not return error, server did not shut down properly") + client := http.Client{Timeout: 500 * time.Millisecond} + _, err = client.Get(address + "webhook") + assert.NotNil(t, err, "Connecting to endpoint did not return error, server did not shut down properly") } // TestWebhookServerHandleHealth tests the health handler @@ -216,7 +240,8 @@ func TestWebhookServerHandleHealth(t *testing.T) { // TestWebhookServerHealthEndpoint ensures that the health endpoint of the server is working properly func TestWebhookServerHealthEndpoint(t *testing.T) { - server := createMockServer(t, 8080) + port := freePort() + server := createMockServer(t, port) go func() { err := server.Start() if err != http.ErrServerClosed { @@ -224,7 +249,7 @@ func TestWebhookServerHealthEndpoint(t *testing.T) { } }() - address := fmt.Sprintf("http://localhost:%d/", server.Port) + address := fmt.Sprintf("http://localhost:%d/", server.Port) err := waitForServerToStart(address, 5*time.Second) assert.NoError(t, err, "Server failed to start") defer server.Server.Close() @@ -326,7 +351,8 @@ func TestProcessWebhookEvent(t *testing.T) { // TestWebhookServerWebhookEndpoint ensures that the webhook endpoint of the server is working properly func TestWebhookServerWebhookEndpoint(t *testing.T) { - server := createMockServer(t, 8080) + port := freePort() + server := createMockServer(t, port) mockArgoClient := server.ArgoClient.(*mocks.ArgoCD) mockArgoClient.On("ListApplications", mock.Anything).Return(mockApps, nil).Once() @@ -342,7 +368,7 @@ func TestWebhookServerWebhookEndpoint(t *testing.T) { } }() - address := fmt.Sprintf("http://localhost:%d/", server.Port) + address := fmt.Sprintf("http://localhost:%d/", server.Port) err := waitForServerToStart(address, 5*time.Second) assert.NoError(t, err, "Server failed to start") defer server.Server.Close() @@ -359,7 +385,7 @@ func TestWebhookServerWebhookEndpoint(t *testing.T) { }` client := http.Client{Timeout: 3 * time.Second} - res, err := client.Post(address+"webhook?type=docker", "application/json", bytes.NewReader([]byte(body))) + res, err := client.Post(address+"webhook?type=docker.io", "application/json", bytes.NewReader([]byte(body))) assert.NoError(t, err) assert.NotNil(t, res, "Response received was nil") if res != nil { @@ -371,7 +397,7 @@ func TestWebhookServerWebhookEndpoint(t *testing.T) { body2 := `{}` - res2, err := client.Post(address+"webhook?type=notarealregistry", "application/json", bytes.NewReader([]byte(body2))) + res2, err := client.Post(address+"webhook?type=notarealregistry", "application/json", bytes.NewReader([]byte(body2))) assert.NoError(t, err) assert.NotNil(t, res2, "Response received was nil") if res2 != nil { diff --git a/registry-scanner/go.mod b/registry-scanner/go.mod index bc578e7b..61b41e42 100644 --- a/registry-scanner/go.mod +++ b/registry-scanner/go.mod @@ -1,9 +1,10 @@ module 
github.com/argoproj-labs/argocd-image-updater/registry-scanner -go 1.23.5 +go 1.24.6 require ( github.com/Masterminds/semver/v3 v3.4.0 + github.com/argoproj-labs/argocd-image-updater v0.0.0-00010101000000-000000000000 github.com/argoproj/pkg v0.13.7-0.20230627120311-a4dd357b057e github.com/distribution/distribution/v3 v3.0.0-20230722181636-7b502560cad4 github.com/opencontainers/go-digest v1.0.0 @@ -12,7 +13,7 @@ require ( github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.10.0 go.uber.org/ratelimit v0.3.1 - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 + golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f golang.org/x/sync v0.16.0 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.32.2 @@ -31,51 +32,53 @@ require ( github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect - github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/go-cmp v0.6.0 // indirect + github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.0 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.16.5 // indirect + github.com/klauspost/compress v1.18.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.19.1 // indirect - github.com/prometheus/client_model v0.5.0 // indirect - github.com/prometheus/common v0.48.0 // indirect - github.com/prometheus/procfs v0.12.0 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/prometheus/client_golang v1.23.0 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.65.0 // indirect + github.com/prometheus/procfs v0.16.1 // indirect + github.com/spf13/pflag v1.0.7 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.38.0 // indirect - golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sys v0.31.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect - golang.org/x/time v0.7.0 // indirect - google.golang.org/protobuf v1.35.1 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + golang.org/x/net v0.41.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sys v0.34.0 // indirect + golang.org/x/term v0.33.0 // indirect + golang.org/x/text v0.27.0 // indirect + golang.org/x/time v0.8.0 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect + k8s.io/kube-openapi 
v0.0.0-20241212222426-2c72e554b1e7 // indirect k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.4-0.20241211184406-7bf59b3d70ee // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) replace ( + github.com/argoproj-labs/argocd-image-updater => ../ k8s.io/api => k8s.io/api v0.32.2 k8s.io/apimachinery => k8s.io/apimachinery v0.32.2 k8s.io/client-go => k8s.io/client-go v0.32.2 diff --git a/registry-scanner/go.sum b/registry-scanner/go.sum index 74f3a94e..4f23930a 100644 --- a/registry-scanner/go.sum +++ b/registry-scanner/go.sum @@ -73,8 +73,9 @@ github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaL github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= github.com/go-openapi/jsonreference v0.20.1/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= @@ -109,8 +110,9 @@ github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiu github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA= -github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= +github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -118,8 +120,9 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -155,8 +158,9 @@ github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHm github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= github.com/klauspost/compress v1.16.5/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.3/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY= github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY= @@ -170,6 +174,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= @@ -259,22 +265,22 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g= -github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= -github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= +github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= -github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common 
v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+Zk0j9GMYc= -github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE= -github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= +github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= +github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ= -github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= -github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= @@ -286,8 +292,9 @@ github.com/sirupsen/logrus v1.9.2/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= +github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= @@ -314,9 +321,14 @@ github.com/yuin/goldmark v1.4.1/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/ratelimit v0.3.1 h1:K4qVE+byfv/B3tC+4nYWP7v/6SimcO7HzHekoMNBma0= go.uber.org/ratelimit v0.3.1/go.mod h1:6euWsTB6U/Nb3X++xEUXA8ciPJvr19Q/0h1+oDcJhRk= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= +go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod 
h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -339,8 +351,9 @@ golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/ golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= +golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= +golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= @@ -402,13 +415,13 @@ golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= -golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= -golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -474,8 +487,8 @@ golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= -golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= +golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/telemetry 
v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/telemetry v0.0.0-20240521205824-bda55230c457/go.mod h1:pRgIJT+bRLFKnoM1ldnzKoxTIn14Yxz928LQRYYgIN0= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -501,8 +514,8 @@ golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/term v0.33.0 h1:NuFncQrRcaRvVmgRkvM3j/F00gWIAlcmlB8ACEKmGIg= +golang.org/x/term v0.33.0/go.mod h1:s18+ql9tYWp1IfpV9DmCtQDDSRBUjKaw9M1eAv5UeF0= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -522,10 +535,11 @@ golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= -golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= +golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= +golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.8.0 h1:9i3RxcPv3PZnitoVGMPDKZSq1xW1gK1Xy3ArNOGZfEg= +golang.org/x/time v0.8.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -552,8 +566,9 @@ golang.org/x/tools v0.21.0/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= -golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= +golang.org/x/tools v0.34.0 h1:qIpSLOxeCYGg9TrcJokLBG4KFA6d795g0xkBkiESGlo= +golang.org/x/tools v0.34.0/go.mod h1:pAP9OwEaY1CAW3HOmg3hLZC5Z0CCmzjAF2UQMSqNARg= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -581,8 +596,9 @@ 
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqw google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= -google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -621,15 +637,18 @@ k8s.io/klog/v2 v2.5.0/go.mod h1:hy9LJ/NvuK+iVyP4Ehqva4HxZG/oXyIS3n3Jmire4Ec= k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= +k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 h1:hcha5B1kVACrLujCKLbr8XWMxCxzQx42DY8QKYJrDLg= +k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7/go.mod h1:GewRfANuJ70iYzvn+i4lezLDAFzvjxZYK1gn1lWcfas= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= -sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA= sigs.k8s.io/structured-merge-diff/v4 v4.4.2/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= -sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/structured-merge-diff/v4 v4.4.4-0.20241211184406-7bf59b3d70ee h1:ipT2c6nEOdAfBwiwW1oI0mkrlPabbXEFmJBrg6B+OR8= +sigs.k8s.io/structured-merge-diff/v4 v4.4.4-0.20241211184406-7bf59b3d70ee/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/registry-scanner/pkg/env/env.go b/registry-scanner/pkg/env/env.go index ceaead79..5f8b4b47 100644 --- a/registry-scanner/pkg/env/env.go +++ b/registry-scanner/pkg/env/env.go @@ -50,6 +50,18 @@ func GetDurationVal(envVar string, defaultValue time.Duration) time.Duration { return defaultValue } +// ParseDurationFromEnv retrieves a time.Duration from env with bounds; returns default on error +func 
ParseDurationFromEnv(envVar string, defaultValue time.Duration, min, max time.Duration) time.Duration { + if val := os.Getenv(envVar); val != "" { + d, err := time.ParseDuration(val) + if err != nil { return defaultValue } + if min > 0 && d < min { return defaultValue } + if max > 0 && d > max { return defaultValue } + return d + } + return defaultValue +} + // Helper function to parse a number from an environment variable. Returns a // default if env is not set, is not parseable to a number, exceeds max (if // max is greater than 0) or is less than min. diff --git a/registry-scanner/pkg/registry/client.go b/registry-scanner/pkg/registry/client.go index 7a269460..6dc36a9c 100644 --- a/registry-scanner/pkg/registry/client.go +++ b/registry-scanner/pkg/registry/client.go @@ -9,6 +9,7 @@ import ( "github.com/argoproj/pkg/json" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" + "github.com/argoproj-labs/argocd-image-updater/pkg/metrics" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/options" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/tag" @@ -28,9 +29,16 @@ import ( "go.uber.org/ratelimit" + "bytes" "net/http" "net/url" + "io" + "strconv" "strings" + "sync" + "math/rand" + sf "golang.org/x/sync/singleflight" + "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/env" ) // TODO: Check image's architecture and OS @@ -60,6 +68,7 @@ type registryClient struct { regClient distribution.Repository endpoint *RegistryEndpoint creds credentials + repoName string } // credentials is an implementation of distribution/V3/session struct @@ -92,12 +101,144 @@ type rateLimitTransport struct { endpoint *RegistryEndpoint } +// jwtObservingTransport wraps the underlying transport for token fetches +// and records JWT auth metrics (duration, TTL, errors). 
+type jwtObservingTransport struct { + endpoint *RegistryEndpoint + base http.RoundTripper + singleflight *sf.Group +} + +func getJWTRetrySettings() (int, time.Duration, time.Duration) { + attempts := env.ParseNumFromEnv("REGISTRY_JWT_ATTEMPTS", 7, 1, 100) + base := env.ParseDurationFromEnv("REGISTRY_JWT_RETRY_BASE", 200*time.Millisecond, 0, time.Hour) + max := env.ParseDurationFromEnv("REGISTRY_JWT_RETRY_MAX", 3*time.Second, 0, time.Hour) + return attempts, base, max +} + +func (j *jwtObservingTransport) doAuthWithRetry(req *http.Request, reg, service, scope string) (*http.Response, error) { + attempts, base, maxDelay := getJWTRetrySettings() + var lastResp *http.Response + var lastErr error + for attempt := 0; attempt < attempts; attempt++ { + start := time.Now() + resp, err := j.base.RoundTrip(req) + if epm := metrics.Endpoint(); epm != nil { + epm.IncreaseJWTAuthRequest(reg, service, scope) + epm.ObserveJWTAuthDuration(reg, service, scope, time.Since(start)) + } + lastResp, lastErr = resp, err + if err == nil && resp != nil && resp.StatusCode >= 200 && resp.StatusCode < 300 { + if resp.Body != nil { + body, rerr := io.ReadAll(resp.Body) + if rerr == nil { + resp.Body = io.NopCloser(bytes.NewBuffer(body)) + type tokenResp struct { ExpiresIn int `json:"expires_in"` } + var tr tokenResp + if jerr := json.Unmarshal(body, &tr); jerr == nil && tr.ExpiresIn > 0 { + if epm := metrics.Endpoint(); epm != nil { + epm.ObserveJWTTokenTTL(reg, service, scope, float64(tr.ExpiresIn)) + } + } else if jerr != nil { + if epm := metrics.Endpoint(); epm != nil { + epm.IncreaseJWTAuthError(reg, service, scope, "parse_json_error") + } + } + } else { + if epm := metrics.Endpoint(); epm != nil { + epm.IncreaseJWTAuthError(reg, service, scope, "read_body_error") + } + } + } + return resp, nil + } + if epm := metrics.Endpoint(); epm != nil { + if err != nil { + epm.IncreaseJWTAuthError(reg, service, scope, "roundtrip_error") + } else if resp != nil { + epm.IncreaseJWTAuthError(reg, service, scope, "http_"+strconv.Itoa(resp.StatusCode)) + } + } + if resp != nil { + if resp.Body != nil { io.Copy(io.Discard, resp.Body); resp.Body.Close() } + } + if epm := metrics.Endpoint(); epm != nil { epm.IncreaseRetry(reg, "auth") } + // exponential backoff with jitter before the next attempt + d := base * time.Duration(1<<attempt) + if base > 0 { d += time.Duration(rand.Int63n(int64(base))) } + if d > maxDelay { d = maxDelay } + time.Sleep(d) + } + return lastResp, lastErr +} + +func (j *jwtObservingTransport) RoundTrip(req *http.Request) (*http.Response, error) { + // Deduplicate only /jwt/auth token requests; other requests, or a nil singleflight group, go through the retried fetch directly + if !strings.Contains(req.URL.Path, "/jwt/auth") || j.singleflight == nil { + reg := j.endpoint.RegistryAPI + q := req.URL.Query() + service := q.Get("service") + scope := q.Get("scope") + return j.doAuthWithRetry(req, reg, service, scope) + } + // Deduplicate concurrent token fetches for the same (service,scope) + reg := j.endpoint.RegistryAPI + q := req.URL.Query() + service := q.Get("service") + scope := q.Get("scope") + key := reg + "|auth|" + service + "|" + scope + v, err, _ := j.singleflight.Do(key, func() (any, error) { + return j.doAuthWithRetry(req, reg, service, scope) + }) + if err != nil { + return v.(*http.Response), err + } + return v.(*http.Response), nil +} + // RoundTrip is a custom RoundTrip method with rate-limiter func (rlt *rateLimitTransport) RoundTrip(r *http.Request) (*http.Response, error) { - rlt.limiter.Take() - log.Tracef("Performing HTTP %s %s", r.Method, r.URL) - resp, err := rlt.transport.RoundTrip(r) - return resp, err + rlt.limiter.Take() + reg := rlt.endpoint.RegistryAPI + // per-registry inflight cap + inflight :=
rlt.endpoint.getInflightChan() + select { case inflight <- struct{}{}: default: inflight <- struct{}{} } + start := time.Now() + log.Tracef("Performing HTTP %s %s", r.Method, r.URL) + // Detect JWT auth endpoint query params if any + service := r.URL.Query().Get("service") + scope := r.URL.Query().Get("scope") + isJWT := strings.Contains(r.URL.Path, "/jwt/auth") + if isJWT { + if epm := metrics.Endpoint(); epm != nil { epm.IncreaseJWTAuthRequest(reg, service, scope) } + } + resp, err := rlt.transport.RoundTrip(r) + if err != nil { + // detect possible port exhaustion dial failures and record for health + MaybeRecordPortExhaustion(err) + } + // metrics + if epm := metrics.Endpoint(); epm != nil { + epm.IncInFlight(reg) + defer func() { epm.DecInFlight(reg) }() + } + d := time.Since(start) + if epm := metrics.Endpoint(); epm != nil { + epm.ObserveRequestDuration(reg, d) + if isJWT { + epm.ObserveJWTAuthDuration(reg, service, scope, d) + if err != nil { epm.IncreaseJWTAuthError(reg, service, scope, "roundtrip_error") } + } + } + // classify /jwt/auth for auth metrics later (placeholder) + // increase request counters + if epm := metrics.Endpoint(); epm != nil { epm.IncreaseRequest(reg, err != nil) } + // record status if available + if resp != nil { + if epm := metrics.Endpoint(); epm != nil { epm.ObserveHTTPStatus(reg, resp.StatusCode) } + } + <-inflight + return resp, err } // NewRepository is a wrapper for creating a registry client that is possibly @@ -110,18 +251,39 @@ func (clt *registryClient) NewRepository(nameInRepository string) error { return err } - authTransport := transport.NewTransport( - clt.endpoint.GetTransport(), auth.NewAuthorizer( - challengeManager1, - auth.NewTokenHandler(clt.endpoint.GetTransport(), clt.creds, nameInRepository, "pull"), - auth.NewBasicHandler(clt.creds))) - - rlt := &rateLimitTransport{ - limiter: clt.endpoint.Limiter, - transport: authTransport, - endpoint: clt.endpoint, + // Normalize repo key to improve cache hits + repo := strings.TrimPrefix(nameInRepository, "/") + cacheKey := clt.endpoint.RegistryAPI + "|" + repo + // Check for cached auth transport to reuse bearer tokens + repoAuthTransportCacheLock.RLock() + cached, ok := repoAuthTransportCache[cacheKey] + repoAuthTransportCacheLock.RUnlock() + var baseRT http.RoundTripper + if ok { + log.Debugf("authorizer cache HIT key=%s", cacheKey) + baseRT = cached + } else { + log.Debugf("authorizer cache MISS key=%s", cacheKey) + // Wrap the underlying transport to observe JWT auth responses + base := clt.endpoint.GetTransport() + // tokenTransport is used by the token handler to fetch tokens + tokenTransport := &jwtObservingTransport{endpoint: clt.endpoint, base: base, singleflight: &jwtAuthSingleflight} + baseRT = transport.NewTransport( + base, auth.NewAuthorizer( + challengeManager1, + auth.NewTokenHandler(tokenTransport, clt.creds, nameInRepository, "pull"), + auth.NewBasicHandler(clt.creds))) + repoAuthTransportCacheLock.Lock() + repoAuthTransportCache[cacheKey] = baseRT + repoAuthTransportCacheLock.Unlock() } + rlt := &rateLimitTransport{ + limiter: clt.endpoint.Limiter, + transport: baseRT, + endpoint: clt.endpoint, + } + named, err := reference.WithName(nameInRepository) if err != nil { return err @@ -130,6 +292,7 @@ func (clt *registryClient) NewRepository(nameInRepository string) error { if err != nil { return err } + clt.repoName = repo return nil } @@ -141,9 +304,12 @@ func NewClient(endpoint *RegistryEndpoint, username, password string) (RegistryC if password == "" && endpoint.Password != "" { password = endpoint.Password } + // Initialize refreshTokens to enable reusing registry-issued refresh tokens + // across requests for the same service (e.g., container_registry). creds := credentials{ - username: username, - password: password, + username: username, + password: password, + refreshTokens: make(map[string]string), } return &registryClient{ creds: creds, @@ -151,46 +317,143 @@ func NewClient(endpoint *RegistryEndpoint, username, password string) (RegistryC }, nil } +// cache for per-repository auth transports so we reuse bearer tokens across runtime +var repoAuthTransportCache = make(map[string]http.RoundTripper) +var repoAuthTransportCacheLock sync.RWMutex +var jwtAuthSingleflight sf.Group + +// singleflight-style maps for deduping concurrent identical calls +var tagsInFlight sync.Map // key string -> chan result +var manifestInFlight sync.Map // key string -> chan result + +type tagsResult struct { + tags []string + err error +} + // Tags returns a list of tags for given name in repository func (clt *registryClient) Tags() ([]string, error) { - tagService := clt.regClient.Tags(context.Background()) - tTags, err := tagService.All(context.Background()) - if err != nil { - return nil, err - } - return tTags, nil + regURL := "" + if clt.endpoint != nil { regURL = clt.endpoint.RegistryAPI } + key := regURL + "|tags|" + clt.repoName + if ch, loaded := tagsInFlight.Load(key); loaded { + // wait for the leader's result + res := (<-ch.(chan tagsResult)) + return res.tags, res.err + } + ch := make(chan tagsResult, 1) + actual, loaded := tagsInFlight.LoadOrStore(key, ch) + if loaded { + res := (<-actual.(chan tagsResult)) + return res.tags, res.err + } + + // leader path + defer func() { + tagsInFlight.Delete(key) + close(ch) + }() + + tagService := clt.regClient.Tags(context.Background()) + var tTags []string + var err error + // jittered exponential backoff with per-attempt deadline (env-tunable) + base := 200 * time.Millisecond + maxDelay := 3 * time.Second + // Attempts and per-attempt deadline are configurable via env for high-latency registries + attempts := env.ParseNumFromEnv("REGISTRY_TAG_ATTEMPTS", 3, 1, 100) + perAttempt := env.ParseDurationFromEnv("REGISTRY_TAG_TIMEOUT", 60*time.Second, 1*time.Second, time.Hour) + for attempt := 0; attempt < attempts; attempt++ { + ctx, cancel := context.WithTimeout(context.Background(), perAttempt) + tTags, err = tagService.All(ctx) + cancel() + if err == nil { + break + } + // jittered backoff + d := base * time.Duration(1<<attempt) + d += time.Duration(rand.Int63n(int64(base))) + if d > maxDelay { d = maxDelay } + time.Sleep(d) + } + ch <- tagsResult{tags: tTags, err: err} + if err != nil { return nil, err } + return tTags, nil } // Manifest returns a Manifest for a given tag in repository func (clt *registryClient) ManifestForTag(tagStr string) (distribution.Manifest, error) { - manService, err := clt.regClient.Manifests(context.Background()) - if err != nil { - return nil, err - } - manifest, err := manService.Get( - context.Background(), - digest.FromString(tagStr), - distribution.WithTag(tagStr), distribution.WithManifestMediaTypes(knownMediaTypes)) - if err != nil { - return nil, err - } - return manifest, nil + regURL := "" + if clt.endpoint != nil { regURL = clt.endpoint.RegistryAPI } + key := regURL + "|manifest|" + clt.repoName + "|tag=" + tagStr + if ch, loaded := manifestInFlight.Load(key); loaded { + res := (<-ch.(chan struct{m distribution.Manifest; e error})) + return res.m, res.e + } + ch := make(chan struct{m distribution.Manifest; e error}, 1) + actual, loaded := manifestInFlight.LoadOrStore(key, ch) + if loaded { res := (<-actual.(chan struct{m distribution.Manifest; e error})); return res.m, res.e } + defer func(){ manifestInFlight.Delete(key); close(ch) }() + + manService, err := clt.regClient.Manifests(context.Background()) + if err != nil { ch <- struct{m distribution.Manifest; e error}{nil, err}; return nil, err } + var manifest distribution.Manifest + base := 200 * time.Millisecond + maxDelay := 3 * time.Second + manAttempts := env.ParseNumFromEnv("REGISTRY_MANIFEST_ATTEMPTS", 3, 1, 100) + manPerAttempt := env.ParseDurationFromEnv("REGISTRY_MANIFEST_TIMEOUT", 60*time.Second, 1*time.Second, time.Hour) + for attempt := 0; attempt < manAttempts; attempt++ { + ctx, cancel := context.WithTimeout(context.Background(), manPerAttempt) + manifest, err = manService.Get(ctx, digest.FromString(tagStr), distribution.WithTag(tagStr), distribution.WithManifestMediaTypes(knownMediaTypes)) + cancel() + if err == nil { break } + d := base * time.Duration(1<<attempt) + d += time.Duration(rand.Int63n(int64(base))) + if d > maxDelay { d = maxDelay } + time.Sleep(d) + } + ch <- struct{m distribution.Manifest; e error}{manifest, err} + if err != nil { return nil, err } + return manifest, nil } // ManifestForDigest returns a Manifest for a given digest in repository func (clt *registryClient) ManifestForDigest(dgst digest.Digest) (distribution.Manifest, error) { - manService, err := clt.regClient.Manifests(context.Background()) - if err != nil { - return nil, err - } - manifest, err := manService.Get( - context.Background(), - dgst, - distribution.WithManifestMediaTypes(knownMediaTypes)) - if err != nil { - return nil, err - } - return manifest, nil + regURL := "" + if clt.endpoint != nil { regURL = clt.endpoint.RegistryAPI } + key := regURL + "|manifest|" + clt.repoName + "|dgst=" + dgst.String() + if ch, loaded := manifestInFlight.Load(key); loaded { + res := (<-ch.(chan struct{m distribution.Manifest; e error})) + return res.m, res.e + } + ch := make(chan struct{m distribution.Manifest; e error}, 1) + actual, loaded := manifestInFlight.LoadOrStore(key, ch) + if loaded { res := (<-actual.(chan struct{m distribution.Manifest; e error})); return res.m, res.e } + defer func(){ manifestInFlight.Delete(key); close(ch) }() + + manService, err := clt.regClient.Manifests(context.Background()) + if err != nil { ch <- struct{m distribution.Manifest; e error}{nil, err}; return nil, err } + var manifest distribution.Manifest + base := 200 * time.Millisecond + maxDelay := 3 * time.Second + dgstAttempts := env.ParseNumFromEnv("REGISTRY_MANIFEST_ATTEMPTS", 3, 1, 100) + dgstPerAttempt := env.ParseDurationFromEnv("REGISTRY_MANIFEST_TIMEOUT", 60*time.Second, 1*time.Second, time.Hour) + for attempt := 0; attempt < dgstAttempts; attempt++ { + ctx, cancel := context.WithTimeout(context.Background(), dgstPerAttempt) + manifest, err = manService.Get(ctx, dgst, distribution.WithManifestMediaTypes(knownMediaTypes)) + cancel() + if err == nil { break } + d := base * time.Duration(1<<attempt) + d += time.Duration(rand.Int63n(int64(base))) + if d > maxDelay { d = maxDelay } + time.Sleep(d) + } + ch <- struct{m distribution.Manifest; e error}{manifest, err} + if err != nil { return nil, err } + return manifest, nil } // TagMetadata retrieves metadata for a given manifest of given repository diff --git a/registry-scanner/pkg/registry/client_test.go b/registry-scanner/pkg/registry/client_test.go index 074a3cde..176c8e1c 100644 --- a/registry-scanner/pkg/registry/client_test.go +++ b/registry-scanner/pkg/registry/client_test.go @@ -20,6 +20,7 @@ import ( "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + // singleflight not used in simplified token reuse approach "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/options" @@ -168,12 +169,25 @@ func TestNewClient(t *testing.T) { }) } +// removed singleflight test + +func TestNewClient_InitializesRefreshTokenMap(t *testing.T) { + ep := &RegistryEndpoint{} + rc, err := NewClient(ep, "", "") + require.NoError(t, err) + c := rc.(*registryClient) + // Ensure the map is not nil so SetRefreshToken can store values + require.NotNil(t, c.creds.refreshTokens) +} + func TestTags(t *testing.T) { t.Run("success", func(t *testing.T) { mockRegClient := new(mocks.Repository) - client := registryClient{ - regClient: mockRegClient, - } + client := registryClient{ + regClient: mockRegClient, + endpoint: &RegistryEndpoint{RegistryAPI: "https://example.com"}, + repoName: "org/repo", + } mockTagService := new(mocks.TagService) mockTagService.On("All", mock.Anything).Return([]string{"testTag-1", "testTag-2"}, nil) mockRegClient.On("Tags", mock.Anything).Return(mockTagService) @@ -184,9 +198,11 @@ func TestTags(t *testing.T) { }) t.Run("Fail", func(t *testing.T) { mockRegClient := new(mocks.Repository) - client := registryClient{ - regClient: mockRegClient, - } + client := registryClient{ + regClient: mockRegClient, + endpoint: &RegistryEndpoint{RegistryAPI: "https://example.com"}, + repoName: "org/repo", + } mockTagService := new(mocks.TagService) mockTagService.On("All", mock.Anything).Return([]string{}, errors.New("Error on caling func All")) mockRegClient.On("Tags", mock.Anything).Return(mockTagService) diff --git a/registry-scanner/pkg/registry/config.go b/registry-scanner/pkg/registry/config.go index 641c5987..53376b27 100644 --- a/registry-scanner/pkg/registry/config.go +++ b/registry-scanner/pkg/registry/config.go @@ -35,6 +35,10 @@ func clearRegistries() { registryLock.Lock() registries = make(map[string]*RegistryEndpoint) registryLock.Unlock() + + // Also clear transport cache when registries are cleared + // This ensures that when registry configuration changes, we use fresh transports + ClearTransportCache() } // LoadRegistryConfiguration loads a YAML-formatted registry configuration from diff --git a/registry-scanner/pkg/registry/degrade.go b/registry-scanner/pkg/registry/degrade.go new file mode 100644 index 00000000..b526b37b --- /dev/null +++ b/registry-scanner/pkg/registry/degrade.go @@ -0,0 +1,97 @@ +package registry + +import ( + "errors" + "net" + "strings" + "sync" + "syscall" + "time" + + "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/env" +) + +// port exhaustion detector: tracks recent EADDRNOTAVAIL dial errors in a sliding window +var peState struct { + mu sync.Mutex + events []time.Time + window time.Duration + threshold int + inited bool +} + +func initPortExhaustion() { + if peState.inited { + return + } + peState.window = env.ParseDurationFromEnv("PORT_EXHAUSTION_WINDOW", 60*time.Second, 1*time.Second, 24*time.Hour) + peState.threshold = env.ParseNumFromEnv("PORT_EXHAUSTION_THRESHOLD", 8, 1, 100000) + peState.inited = true +} + +func recordPortExhaustionEvent() { + initPortExhaustion() + now := time.Now() + peState.mu.Lock() + defer peState.mu.Unlock() + peState.events = append(peState.events, now) + cutoff := now.Add(-peState.window) + // drop old events + i := 0 + for ; i < len(peState.events); i++ { + if peState.events[i].After(cutoff) { + break + } 
+ } + if i > 0 && i <= len(peState.events) { + peState.events = append([]time.Time(nil), peState.events[i:]...) + } +} + +// IsPortExhaustionDegraded returns true if the number of recent EADDRNOTAVAIL +// events within the configured window exceeds the threshold +func IsPortExhaustionDegraded() bool { + initPortExhaustion() + now := time.Now() + cutoff := now.Add(-peState.window) + peState.mu.Lock() + defer peState.mu.Unlock() + // prune + i := 0 + for ; i < len(peState.events); i++ { + if peState.events[i].After(cutoff) { + break + } + } + if i > 0 && i <= len(peState.events) { + peState.events = append([]time.Time(nil), peState.events[i:]...) + } + return len(peState.events) >= peState.threshold +} + +// MaybeRecordPortExhaustion checks an error for EADDRNOTAVAIL and records it +func MaybeRecordPortExhaustion(err error) { + if err == nil { + return + } + // unwrap errors to find syscall codes + unwrapped := err + for unwrapped != nil { + var opErr *net.OpError + if errors.As(unwrapped, &opErr) { + // on some systems, opErr.Err can be syscall.Errno + if errno, ok := opErr.Err.(syscall.Errno); ok && errno == syscall.EADDRNOTAVAIL { + recordPortExhaustionEvent() + return + } + } + unwrapped = errors.Unwrap(unwrapped) + } + // fallback to substring match + msg := err.Error() + if strings.Contains(strings.ToLower(msg), "cannot assign requested address") { + recordPortExhaustionEvent() + } +} + + diff --git a/registry-scanner/pkg/registry/endpoints.go b/registry-scanner/pkg/registry/endpoints.go index f0de871c..790885ad 100644 --- a/registry-scanner/pkg/registry/endpoints.go +++ b/registry-scanner/pkg/registry/endpoints.go @@ -8,8 +8,10 @@ import ( "strings" "sync" "time" + "strconv" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/cache" + "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/env" "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/log" "go.uber.org/ratelimit" @@ -89,6 +91,11 @@ type RegistryEndpoint struct { IsDefault bool lock sync.RWMutex limit int + // in-flight limiter channel (nil until first use). Controls concurrent HTTP + // requests per registry to prevent socket/port exhaustion under bursts. + inflightCh chan struct{} + // desired capacity for inflight channel + inflightCap int } // registryTweaks should contain a list of registries whose settings cannot be @@ -121,6 +128,12 @@ var registryLock sync.RWMutex // credentialGroup ensures only one credential refresh happens per registry var credentialGroup singleflight.Group +// transportCache stores reusable HTTP transports per registry API URL. +// Reusing transports enables HTTP keep-alives/connection pooling and avoids +// excessive TLS handshakes when the updater queries registries frequently. 
+var transportCache = make(map[string]*http.Transport) +var transportCacheLock sync.RWMutex + func AddRegistryEndpointFromConfig(epc RegistryConfiguration) error { ep := NewRegistryEndpoint(epc.Prefix, epc.Name, epc.ApiURL, epc.Credentials, epc.DefaultNS, epc.Insecure, TagListSortFromString(epc.TagSortMode), epc.Limit, epc.CredsExpire) return AddRegistryEndpoint(ep) @@ -144,6 +157,7 @@ func NewRegistryEndpoint(prefix, name, apiUrl, credentials, defaultNS string, in TagListSort: tagListSort, Limiter: ratelimit.New(limit), limit: limit, + inflightCap: 15, } return ep } @@ -278,20 +292,108 @@ func (ep *RegistryEndpoint) DeepCopy() *RegistryEndpoint { newEp.CredsUpdated = ep.CredsUpdated newEp.IsDefault = ep.IsDefault newEp.limit = ep.limit + newEp.inflightCap = ep.inflightCap ep.lock.RUnlock() return newEp } -// GetTransport returns a transport object for this endpoint +// ClearTransportCache clears cached transports (e.g., after registry config reload) +func ClearTransportCache() { + transportCacheLock.Lock() + defer transportCacheLock.Unlock() + // Proactively close idle connections on existing transports before clearing + for _, tr := range transportCache { + tr.CloseIdleConnections() + } + transportCache = make(map[string]*http.Transport) + log.Debugf("Transport cache cleared.") +} + +// StartTransportJanitor periodically closes idle connections on all cached +// transports to prevent idle socket accumulation. Returns a stop function. +func StartTransportJanitor(interval time.Duration) func() { + if interval <= 0 { + return func() {} + } + stopCh := make(chan struct{}) + ticker := time.NewTicker(interval) + go func() { + for { + select { + case <-ticker.C: + transportCacheLock.RLock() + for _, tr := range transportCache { + tr.CloseIdleConnections() + } + transportCacheLock.RUnlock() + case <-stopCh: + ticker.Stop() + return + } + } + }() + return func() { close(stopCh) } +} + +// GetTransport returns a cached transport configured with sane defaults. +// The transport is keyed by the endpoint's RegistryAPI and shared by callers +// to maximize connection reuse and apply timeouts consistently. 
func (ep *RegistryEndpoint) GetTransport() *http.Transport { + // Cache key must account for TLS mode to avoid reusing a secure transport for insecure endpoints + key := ep.RegistryAPI + "|insecure=" + strconv.FormatBool(ep.Insecure) + // Fast path: return cached transport if present + transportCacheLock.RLock() + if tr, ok := transportCache[key]; ok { + transportCacheLock.RUnlock() + return tr + } + transportCacheLock.RUnlock() + tlsC := &tls.Config{} if ep.Insecure { tlsC.InsecureSkipVerify = true } - return &http.Transport{ - Proxy: http.ProxyFromEnvironment, - TLSClientConfig: tlsC, + + // Create and cache a transport with sane defaults + // Allow overriding key HTTP transport timeouts via environment + respHdrTimeout := env.ParseDurationFromEnv("REGISTRY_RESPONSE_HEADER_TIMEOUT", 60*time.Second, 1*time.Second, time.Hour) + tlsHsTimeout := env.ParseDurationFromEnv("REGISTRY_TLS_HANDSHAKE_TIMEOUT", 10*time.Second, 1*time.Second, time.Hour) + idleConnTimeout := env.ParseDurationFromEnv("REGISTRY_IDLE_CONN_TIMEOUT", 90*time.Second, 0, 24*time.Hour) + maxConnsPerHost := env.ParseNumFromEnv("REGISTRY_MAX_CONNS_PER_HOST", 30, 1, 10000) + maxIdleConns := env.ParseNumFromEnv("REGISTRY_MAX_IDLE_CONNS", 1000, 1, 100000) + maxIdleConnsPerHost := env.ParseNumFromEnv("REGISTRY_MAX_IDLE_CONNS_PER_HOST", 200, 1, 100000) + + tr := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + TLSClientConfig: tlsC, + // Prefer reuse of a larger idle pool to minimize new dials under load + MaxIdleConns: maxIdleConns, + MaxIdleConnsPerHost: maxIdleConnsPerHost, + // Cap parallel dials per host to avoid ephemeral port exhaustion + MaxConnsPerHost: maxConnsPerHost, + + IdleConnTimeout: idleConnTimeout, + TLSHandshakeTimeout: tlsHsTimeout, + ExpectContinueTimeout: 1 * time.Second, + ForceAttemptHTTP2: true, + ResponseHeaderTimeout: respHdrTimeout, } + transportCacheLock.Lock() + transportCache[key] = tr + transportCacheLock.Unlock() + return tr +} + +// getInflightChan returns the per-registry inflight channel, creating it on first use. 
+func (ep *RegistryEndpoint) getInflightChan() chan struct{} {
+    ep.lock.Lock()
+    defer ep.lock.Unlock()
+    if ep.inflightCh == nil {
+        capacity := ep.inflightCap
+        if capacity <= 0 {
+            capacity = 15
+        }
+        ep.inflightCh = make(chan struct{}, capacity)
+    }
+    return ep.inflightCh
 }
 
 // init initializes the registry configuration
diff --git a/registry-scanner/pkg/registry/endpoints_test.go b/registry-scanner/pkg/registry/endpoints_test.go
index d2dae1f7..9d483281 100644
--- a/registry-scanner/pkg/registry/endpoints_test.go
+++ b/registry-scanner/pkg/registry/endpoints_test.go
@@ -2,8 +2,12 @@ package registry
 
 import (
     "fmt"
+    "net"
+    "net/http"
+    "net/http/httptest"
     "strings"
     "sync"
+    "sync/atomic"
     "testing"
     "time"
 
@@ -310,6 +314,79 @@ func TestGetTransport(t *testing.T) {
     })
 }
 
+func TestGetTransport_CachesPerAPI(t *testing.T) {
+    ep1 := &RegistryEndpoint{RegistryAPI: "https://r1.example", Insecure: false}
+    ep2 := &RegistryEndpoint{RegistryAPI: "https://r1.example", Insecure: false}
+    ep3 := &RegistryEndpoint{RegistryAPI: "https://r2.example", Insecure: true}
+
+    // Clear any previous cache state
+    ClearTransportCache()
+
+    tr1 := ep1.GetTransport()
+    require.NotNil(t, tr1)
+
+    tr2 := ep2.GetTransport()
+    require.NotNil(t, tr2)
+
+    // Same API URL should reuse the same transport instance
+    assert.Same(t, tr1, tr2)
+
+    tr3 := ep3.GetTransport()
+    require.NotNil(t, tr3)
+
+    // Different API URL (and TLS mode) should create a different transport
+    assert.NotSame(t, tr1, tr3)
+}
+
+// Test that StartTransportJanitor closes idle client connections so that the
+// next request establishes a new TCP connection instead of reusing the old one.
+func TestStartTransportJanitor_ClosesIdleConns(t *testing.T) {
+    // Create a custom server with a ConnState hook to count new TCP connections
+    var newConnCount int32
+    h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+        w.WriteHeader(http.StatusOK)
+        _, _ = w.Write([]byte("ok"))
+    })
+    srv := httptest.NewUnstartedServer(h)
+    srv.Config.ConnState = func(c net.Conn, s http.ConnState) {
+        if s == http.StateNew {
+            atomic.AddInt32(&newConnCount, 1)
+        }
+    }
+    srv.Start()
+    defer srv.Close()
+
+    // Map the server URL to a registry endpoint and its shared transport
+    ep := &RegistryEndpoint{RegistryAPI: srv.URL, Insecure: false}
+    ClearTransportCache()
+    tr := ep.GetTransport()
+    client := &http.Client{Transport: tr}
+
+    // First request creates a new TCP connection
+    resp1, err := client.Get(srv.URL + "/first")
+    require.NoError(t, err)
+    require.NotNil(t, resp1)
+    _ = resp1.Body.Close()
+
+    // Start the janitor with a very short interval so it runs promptly
+    stop := StartTransportJanitor(25 * time.Millisecond)
+    defer stop()
+
+    // Wait for at least one janitor tick
+    time.Sleep(60 * time.Millisecond)
+
+    // The second request should not be able to reuse the old idle connection after the janitor ran
+    resp2, err := client.Get(srv.URL + "/second")
+    require.NoError(t, err)
+    _ = resp2.Body.Close()
+
+    // We expect at least two TCP connections: one per request.
+    // If the idle connection had been reused, this would be 1.
+    if atomic.LoadInt32(&newConnCount) < 2 {
+        t.Fatalf("expected janitor to force new TCP connection, got newConnCount=%d", newConnCount)
+    }
+}
+
 func Test_RestoreDefaultRegistryConfiguration(t *testing.T) {
     // Call the function to restore default configuration
     RestoreDefaultRegistryConfiguration()
diff --git a/registry-scanner/pkg/registry/jwt_test.go b/registry-scanner/pkg/registry/jwt_test.go
new file mode 100644
index 00000000..ee5534db
--- /dev/null
+++ b/registry-scanner/pkg/registry/jwt_test.go
@@ -0,0 +1,118 @@
+package registry
+
+import (
+    "net/http"
+    "net/url"
+    "os"
+    "sync"
+    "testing"
+
+    sf "golang.org/x/sync/singleflight"
+
+    "github.com/stretchr/testify/require"
+    "github.com/stretchr/testify/assert"
+    "github.com/stretchr/testify/mock"
+
+    "github.com/argoproj-labs/argocd-image-updater/registry-scanner/pkg/registry/mocks"
+    "github.com/argoproj-labs/argocd-image-updater/pkg/metrics"
+    "github.com/prometheus/client_golang/prometheus"
+)
+
+// helper to build a /jwt/auth request with given service/scope
+func newJWTReq(t *testing.T, service, scope string) *http.Request {
+    t.Helper()
+    u, err := url.Parse("https://gitlab.example.com/jwt/auth")
+    require.NoError(t, err)
+    q := u.Query()
+    q.Set("service", service)
+    q.Set("scope", scope)
+    u.RawQuery = q.Encode()
+    req, err := http.NewRequest(http.MethodGet, u.String(), nil)
+    require.NoError(t, err)
+    return req
+}
+
+// reset globals to avoid cross-test interference and duplicate collectors
+func resetMetricsAndFlights() {
+    reg := prometheus.NewRegistry()
+    prometheus.DefaultRegisterer = reg
+    prometheus.DefaultGatherer = reg
+    metrics.InitMetrics()
+    jwtAuthSingleflight = sf.Group{}
+    tagsInFlight = sync.Map{}
+    manifestInFlight = sync.Map{}
+}
+
+func TestJWT_Singleflight_DeduplicatesSameScope(t *testing.T) {
+    resetMetricsAndFlights()
+    mockRT := new(mocks.RoundTripper)
+    // First (and only) underlying call returns 200
+    mockRT.On("RoundTrip", mock.AnythingOfType("*http.Request")).Return(&http.Response{StatusCode: http.StatusOK}, nil).Once()
+
+    e := &RegistryEndpoint{RegistryAPI: "https://registry.example.com"}
+    j := &jwtObservingTransport{endpoint: e, base: mockRT, singleflight: &sf.Group{}}
+
+    // Two concurrent requests for the same (service,scope)
+    req1 := newJWTReq(t, "container_registry", "repository:org/repo:pull")
+    req2 := newJWTReq(t, "container_registry", "repository:org/repo:pull")
+
+    var wg sync.WaitGroup
+    wg.Add(2)
+    go func() {
+        defer wg.Done()
+        _, _ = j.RoundTrip(req1)
+    }()
+    go func() {
+        defer wg.Done()
+        _, _ = j.RoundTrip(req2)
+    }()
+    wg.Wait()
+
+    mockRT.AssertNumberOfCalls(t, "RoundTrip", 1)
+}
+
+func TestJWT_Singleflight_AllowsDifferentScopes(t *testing.T) {
+    resetMetricsAndFlights()
+    mockRT := new(mocks.RoundTripper)
+    mockRT.On("RoundTrip", mock.AnythingOfType("*http.Request")).Return(&http.Response{StatusCode: http.StatusOK}, nil).Twice()
+
+    e := &RegistryEndpoint{RegistryAPI: "https://registry.example.com"}
+    j := &jwtObservingTransport{endpoint: e, base: mockRT, singleflight: &sf.Group{}}
+
+    req1 := newJWTReq(t, "container_registry", "repository:org/repoA:pull")
+    req2 := newJWTReq(t, "container_registry", "repository:org/repoB:pull")
+
+    var wg sync.WaitGroup
+    wg.Add(2)
+    go func() {
+        defer wg.Done()
+        _, _ = j.RoundTrip(req1)
+    }()
+    go func() {
+        defer wg.Done()
+        _, _ = j.RoundTrip(req2)
+    }()
+    wg.Wait()
+
+    mockRT.AssertNumberOfCalls(t, "RoundTrip", 2)
+}
+
+func TestJWT_Retry_Backoff_AttemptsHonored(t *testing.T) {
+    resetMetricsAndFlights()
+    // Make the underlying transport fail N-1 times, then succeed
+    attempts := 4
+    os.Setenv("REGISTRY_JWT_ATTEMPTS", "4")
+    t.Cleanup(func() { os.Unsetenv("REGISTRY_JWT_ATTEMPTS") })
+
+    callCount := 0
+    mockRT := new(mocks.RoundTripper)
+    mockRT.Mock.Test(t)
+    mockRT.On("RoundTrip", mock.AnythingOfType("*http.Request")).Return(func(_ *http.Request) *http.Response {
+        callCount++
+        if callCount < attempts {
+            return nil
+        }
+        return &http.Response{StatusCode: http.StatusOK}
+    }, func(_ *http.Request) error {
+        if callCount < attempts {
+            return assert.AnError
+        }
+        return nil
+    }).Times(attempts)
+
+    e := &RegistryEndpoint{RegistryAPI: "https://registry.example.com"}
+    j := &jwtObservingTransport{endpoint: e, base: mockRT, singleflight: &sf.Group{}}
+    req := newJWTReq(t, "container_registry", "repository:org/repo:pull")
+    resp, err := j.RoundTrip(req)
+    require.NoError(t, err)
+    require.Equal(t, http.StatusOK, resp.StatusCode)
+    assert.Equal(t, attempts, callCount)
+}