argoproj-labs
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 170 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 170 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 13 additions & 10 deletions b/‎Makefile‎
Lines changed: 13 additions & 10 deletions
@@ -15,3 +15,5 @@ test-results
 *.goe
 **/kuttl-test.json
 **/kubeconfig
+AGENT.md
+.gitlab-ci.yml
@@ -6,6 +6,18 @@ handling on your side.
 
 ## Unreleased
 
+## 2025-09-19 - Release v100.0.6a
+
+### Changes
+
+- scheduler(continuous): slow the tick cadence from every ~100ms to every ~1s (less frequent ticks) to reduce log noise and API/list pressure; no change to per-app `--interval` gating
+- docs(readme): remove Mermaid diagram; add ASCII architecture; add rate limiting/backpressure section; add phase comparison table (stock vs tuned)
+
+### Notes
+
+- Behavior impact: only the scheduler’s discovery cadence changes; application dispatch still respects `--interval`, in-flight guards, fairness (LRU/fail-first, cooldown, per-repo-cap), and concurrency caps.
+- Recommended: if startup delay is undesirable, run with `--warmup-cache=false`.
+
 ### Upgrade notes (no really, you MUST read this)
 
 * **Attention**: By default, `argocd-image-updater` now uses the K8s API to retrieve applications, instead of the Argo CD API. Also, it is now recommended to install in the same namespace as Argo CD is running in (`argocd` by default). For existing installations, which are running in a dedicated namespace.
@@ -29,6 +41,164 @@ handling on your side.
 
 * refactor: make argocd-image-updater-config volume mapping optional (#145)
 
+
+## 2025-09-18 - Release v100.0.5a
+
+### Fixes
+
+- fix(git): Prevent panic in batched writer when `GetCreds` is nil or write-back method is not Git
+  - Only enqueue batched writes when `wbc.Method == git`
+  - Guard in `repoWriter.commitBatch` for missing `GetCreds` (skip with log)
+
+### Tests
+
+- test(git): Strengthen batched writer test to set `Method: WriteBackGit` and provide a `GetCreds` stub, so that a missing `GetCreds` would fail the tests
+
+### Notes
+
+- No flags or defaults changed; safe upgrade from v100.0.4a
+
+## 2025-09-18 - Release v100.0.4a
+
+### Changes
+
+- test(git): Add unit test verifying batched writer flushes per-branch (monorepo safety)
+- fix(git): Guard `getWriteBackBranch` against nil Application source
+- docs: Clarify `--max-concurrency=0` (auto) in README quick reference
+
+### Notes
+
+- All existing tests pass. No changes to defaults or flags.
+
+## 2025-09-18 - Release v100.0.3a
+
+### Highlights
+
+- Continuous mode: per-app scheduling with independent timers (no full-cycle waits)
+- Auto concurrency: `--max-concurrency=0` computes workers from CPUs/apps
+- Robust registry auth and I/O: singleflight + retries with backoff on `/jwt/auth`, tag and manifest operations
+- Safer connection handling: transport reuse, tuned timeouts, per‑registry in‑flight caps
+- Git efficiency: per‑repo batched writer + retries
+- Deep metrics: apps, cycles, registry, JWT
+
+### New features
+
+- feat(mode): `--mode=continuous` (default remains `cycle`)
+- feat(concurrency): `--max-concurrency=0` for auto sizing
+- feat(schedule): LRU / fail-first with `--schedule`; fairness with `--per-repo-cap`, `--cooldown`
+- feat(auth): JWT `/jwt/auth` retries with backoff (singleflight dedupe)
+  - Env: `REGISTRY_JWT_ATTEMPTS` (default 7), `REGISTRY_JWT_RETRY_BASE` (200ms), `REGISTRY_JWT_RETRY_MAX` (3s)
+- feat(metrics): Per-application timings and state
+  - `argocd_image_updater_application_update_duration_seconds{application}`
+  - `argocd_image_updater_application_last_attempt_timestamp{application}`
+  - `argocd_image_updater_application_last_success_timestamp{application}`
+  - `argocd_image_updater_images_considered_total{application}`
+  - `argocd_image_updater_images_skipped_total{application}`
+  - `argocd_image_updater_scheduler_skipped_total{reason}`
+- feat(metrics): Cycle timing
+  - `argocd_image_updater_update_cycle_duration_seconds`
+  - `argocd_image_updater_update_cycle_last_end_timestamp`
+- feat(metrics): Registry visibility
+  - `argocd_image_updater_registry_in_flight_requests{registry}`
+  - `argocd_image_updater_registry_request_duration_seconds{registry}`
+  - `argocd_image_updater_registry_http_status_total{registry,code}`
+  - `argocd_image_updater_registry_request_retries_total{registry,op}`
+  - `argocd_image_updater_registry_errors_total{registry,kind}`
+- feat(metrics): Singleflight effectiveness
+  - `argocd_image_updater_singleflight_leaders_total{kind}`
+  - `argocd_image_updater_singleflight_followers_total{kind}`
+- feat(metrics): JWT visibility
+  - `argocd_image_updater_registry_jwt_auth_requests_total{registry,service,scope}`
+  - `argocd_image_updater_registry_jwt_auth_errors_total{registry,service,scope,reason}`
+  - `argocd_image_updater_registry_jwt_auth_duration_seconds{registry,service,scope}`
+  - `argocd_image_updater_registry_jwt_token_ttl_seconds{registry,service,scope}`
+
+### Improvements
+
+- perf(registry): HTTP transport reuse; tuned `MaxIdleConns`, `MaxIdleConnsPerHost`, `MaxConnsPerHost`; response and handshake timeouts
+- perf(registry): Per‑registry in‑flight cap to prevent connection storms
+- resiliency(registry): Jittered retries for tags/manifests; `/jwt/auth` retries with backoff
+- perf(git): Batched per‑repo writer; retries for fetch/shallow-fetch/push
+- sched: Fairness via LRU/fail-first, cooldown, and per-repo caps
+
+### Defaults enabled (no flags)
+
+- Transport reuse and tuned timeouts
+- Per‑registry in‑flight cap (default 15)
+- Authorizer cache per (registry, repo)
+- Singleflight on tags, manifests, and `/jwt/auth`
+- Retries: tags/manifests (3x), JWT auth (defaults above)
+- Git retries (env-overridable); Batched writer (disable via `GIT_BATCH_DISABLE=true`)
+
+### Docs
+
+- docs(install): Performance flags and defaults (continuous mode, auto concurrency, JWT retry envs)
+- docs(metrics): Expanded metrics section
+
+### Tests
+
+- test: Unit tests for transport caching, metrics wrappers, continuous scheduler basics, and end-to-end build
+
+### Known issues
+
+- Under very high concurrency and bursty load, upstream registry/SNAT limits may still cause intermittent timeouts. The new caps, retries, and singleflight significantly reduce impact; tune per‑registry limits and consider HTTP/2 where available.
+
+## 2025-09-17 - Release v99.9.9 - 66de072
+
+### New features
+
+* feat: Reuse HTTP transports for registries with keep-alives and timeouts
+* feat: Initialize registry refresh-token map to enable token reuse
+* feat: Add Makefile `DOCKER` variable to support `podman`
+
+### Improvements
+
+* perf: Cache transports per registry+TLS mode; add sensible connection limits and timeouts
+* resiliency: Retry/backoff for registry tag listing
+* resiliency: Retry/backoff for git fetch/shallow-fetch/push during write-back
+
+### Tests/Docs
+
+* test: Add unit tests for transport caching and token map init
+* docs: Requirements/notes updates
+
+### Upgrade notes
+
+* None
+
+### Bug fixes
+
+* None
+
+### Known issues
+
+* Under very high concurrency (300–500) after 2–3 hours, nodes may hit ephemeral port exhaustion causing registry dials to fail:
+
+    Example error observed:
+
+    `dial tcp 10.2.163.141:5000: connect: cannot assign requested address`
+
+    Notes:
+    - This typically manifests across all registries simultaneously under heavy outbound connection churn.
+    - Root cause is excessive parallel dials combined with short‑lived connections (TIME_WAIT buildup), not a specific registry outage.
+    - Mitigations available in v100.0.0a: larger keep‑alive pools, lower MaxConnsPerHost, and ability to close idle on cache clear. Operational mitigations: reduce updater concurrency and/or per‑registry limits (e.g., 500→250; 50 rps→20–30 rps) while investigating.
+
+    Details:
+    - Old ports are “released” only after TIME_WAIT (2MSL). With HTTP/1.1 and big bursts, you create more concurrent outbound sockets than the ephemeral range can recycle before TIME_WAIT expires, so you hit “cannot assign requested address” even though old sockets eventually close.
+    - Why it still happens under 250/100 RPS:
+      - Each new dial consumes a unique local ephemeral port to the same dst tuple. TIME_WAIT lasts ~60–120s (kernel dependent). Bursty concurrency + short interval means you outpace reuse.
+      - Go HTTP/1.1 doesn’t pipeline; reuse works only if there’s an idle kept‑alive socket. If many goroutines need sockets at once, you dial anyway.
+      - Often compounded by SNAT limits at the node (Kubernetes egress): per‑dst NAT port cap can exhaust even faster.
+    - How to confirm quickly:
+      - Check TIME_WAIT to the registry IP:port: `ss -antp | grep :5000 | grep TIME_WAIT | wc -l`
+      - Check ephemeral range: `sysctl net.ipv4.ip_local_port_range`
+      - In Kubernetes, inspect node SNAT usage (some clouds cap SNAT ports per node/destination).
+    - What fixes it (software‑side, regardless of kernel/NAT tuning):
+      - Add a hard per‑registry in‑flight cap (e.g., 10–15) so requests queue instead of dialing new sockets.
+      - Lower `MaxConnsPerHost` further (e.g., 15). Keep large idle pools to maximize reuse.
+      - Add jitter to scheduling (avoid synchronized bursts); consider 30s interval over 15s.
+      - If the registry supports HTTP/2 over TLS, H2 multiplexing drastically reduces sockets.
+
 ## 2020-12-06 - Release v0.8.0
 
 ### Upgrade notes (no really, you MUST read this)
 
@@ -12,6 +12,9 @@ ARCH?=$(shell go env GOARCH)
 OUTDIR?=dist
 BINNAME?=argocd-image-updater
 
+# Container runtime (override with DOCKER=podman)
+DOCKER?=docker
+
 CURRENT_DIR=$(shell pwd)
 VERSION=$(shell cat ${CURRENT_DIR}/VERSION)
 GIT_COMMIT=$(shell git rev-parse HEAD)
@@ -87,14 +90,14 @@ controller:
 
 .PHONY: image
 image: clean-image
-	docker build \
+	${DOCKER} build \
 		-t ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \
 		--pull \
 		.
 
 .PHONY: multiarch-image
 multiarch-image:
-	docker buildx build \
+	${DOCKER} buildx build \
 		-t ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \
 		--progress plain \
 		--pull \
@@ -103,7 +106,7 @@ multiarch-image:
 
 .PHONY: multiarch-image-push
 multiarch-image-push:
-	docker buildx build \
+	${DOCKER} buildx build \
 		-t ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \
 		--progress plain \
 		--pull \
@@ -113,7 +116,7 @@ multiarch-image-push:
 
 .PHONY: image-push
 image-push: image
-	docker push ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG}
+	${DOCKER} push ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG}
 
 .PHONY: release-binaries
 release-binaries:
@@ -130,10 +133,10 @@ release-binaries:
 
 .PHONY: extract-binary
 extract-binary:
-	docker rm argocd-image-updater-${IMAGE_TAG} || true
-	docker create --name argocd-image-updater-${IMAGE_TAG} ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG}
-	docker cp argocd-image-updater-${IMAGE_TAG}:/usr/local/bin/argocd-image-updater /tmp/argocd-image-updater_${IMAGE_TAG}_linux-amd64
-	docker rm argocd-image-updater-${IMAGE_TAG}
+	${DOCKER} rm argocd-image-updater-${IMAGE_TAG} || true
+	${DOCKER} create --name argocd-image-updater-${IMAGE_TAG} ${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG}
+	${DOCKER} cp argocd-image-updater-${IMAGE_TAG}:/usr/local/bin/argocd-image-updater /tmp/argocd-image-updater_${IMAGE_TAG}_linux-amd64
+	${DOCKER} rm argocd-image-updater-${IMAGE_TAG}
 
 .PHONY: lint
 lint:
@@ -148,7 +151,7 @@ codegen: manifests
 
 .PHONY: run-test
 run-test:
-	docker run -v $(HOME)/.kube:/kube --rm -it \
+	${DOCKER} run -v $(HOME)/.kube:/kube --rm -it \
 		-e ARGOCD_TOKEN \
 		${IMAGE_PREFIX}${IMAGE_NAME}:${IMAGE_TAG} \
 		--kubeconfig /kube/config \
@@ -157,5 +160,5 @@ run-test:
 
 .PHONY: serve-docs
 serve-docs:
-	docker run ${MKDOCS_RUN_ARGS} --rm -it -p 8000:8000 -v ${CURRENT_DIR}:/docs ${MKDOCS_DOCKER_IMAGE} serve -a 0.0.0.0:8000
+	${DOCKER} run ${MKDOCS_RUN_ARGS} --rm -it -p 8000:8000 -v ${CURRENT_DIR}:/docs ${MKDOCS_DOCKER_IMAGE} serve -a 0.0.0.0:8000