
fix: pod with a restart policy of Never or OnFailure stuck at 'Progressing' (#15317) #709


Open · wants to merge 8 commits into master

21 changes: 16 additions & 5 deletions pkg/health/health_pod.go
@@ -9,6 +9,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubectl/pkg/util/podutils"

"github.com/argoproj/gitops-engine/pkg/sync/common"
"github.com/argoproj/gitops-engine/pkg/utils/kube"
)

@@ -93,9 +94,9 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
}

return &HealthStatus{Status: HealthStatusDegraded, Message: ""}, nil

case corev1.PodRunning:
switch pod.Spec.RestartPolicy {
case corev1.RestartPolicyAlways:
getHealthStatus := func(pod *corev1.Pod) (*HealthStatus, error) {
// if pod is ready, it is automatically healthy
if podutils.IsPodReady(pod) {
return &HealthStatus{
@@ -117,10 +118,20 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}, nil
case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
// pods set with a restart policy of OnFailure or Never, have a finite life.
}
policy := pod.Spec.RestartPolicy
// If the pod has the AnnotationIgnoreRestartPolicy annotation or its restart policy is Always,
// then treat it as a long-running pod and check its health status.
if _, ok := pod.Annotations[common.AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways {
Member commented:

I don't think this feature is necessary given the existing `argocd.argoproj.io/ignore-healthcheck: 'true'` annotation. The pod will show as Running, because that is the behavior configured in the Pod, and we should keep the behaviour consistent with Kubernetes.

If a controller is creating resources for which we cannot reliably assess application health, then we should exclude those.

return getHealthStatus(pod)
}

if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever {
// Most pods set with a restart policy of OnFailure or Never have a finite life.
// These pods are typically resource hooks. Thus, we consider these as Progressing
// instead of healthy.
// instead of healthy. If this is unwanted, e.g., when the pod is managed by an
// operator and therefore has a restart policy of OnFailure or Never, then use the
// AnnotationIgnoreRestartPolicy annotation.
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
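To make the new control flow easier to follow, here is the `PodRunning` branch of `getCorev1PodHealth` consolidated from the hunks above into a single sketch. The middle of the `getHealthStatus` closure (container-status handling) and the exact fields of the Healthy return are not visible in the diff and are filled in as assumptions:

```go
case corev1.PodRunning:
	// Health of a long-running pod: Ready means Healthy, otherwise Progressing.
	getHealthStatus := func(pod *corev1.Pod) (*HealthStatus, error) {
		// if pod is ready, it is automatically healthy
		if podutils.IsPodReady(pod) {
			return &HealthStatus{Status: HealthStatusHealthy}, nil // return fields assumed
		}
		// ... container-status checks elided in the diff above ...
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: pod.Status.Message,
		}, nil
	}

	policy := pod.Spec.RestartPolicy
	// Annotated pods, and pods with restartPolicy Always, are treated as long-running
	// and health-checked directly.
	if _, ok := pod.Annotations[common.AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways {
		return getHealthStatus(pod)
	}

	// Pods with restartPolicy OnFailure or Never are assumed to have a finite life
	// (typically resource hooks), so they remain Progressing until they complete.
	if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever {
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: pod.Status.Message,
		}, nil
	}
```

The net effect is that hook-style pods keep the previous Progressing behavior, while annotated, operator-managed pods get an explicit opt-out.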
1 change: 1 addition & 0 deletions pkg/health/health_test.go
@@ -103,6 +103,7 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-error.yaml", HealthStatusDegraded)
assertAppHealth(t, "./testdata/pod-running-restart-always.yaml", HealthStatusHealthy)
assertAppHealth(t, "./testdata/pod-running-restart-never.yaml", HealthStatusProgressing)
assertAppHealth(t, "./testdata/pod-running-restart-never-with-ignore-annotation.yaml", HealthStatusHealthy)
assertAppHealth(t, "./testdata/pod-running-restart-onfailure.yaml", HealthStatusProgressing)
assertAppHealth(t, "./testdata/pod-failed.yaml", HealthStatusDegraded)
assertAppHealth(t, "./testdata/pod-succeeded.yaml", HealthStatusHealthy)
87 changes: 87 additions & 0 deletions pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml
@@ -0,0 +1,87 @@
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: 2018-12-02T09:15:16Z
  name: my-pod
  namespace: argocd
  resourceVersion: "151053"
  selfLink: /api/v1/namespaces/argocd/pods/my-pod
  uid: c86e909c-f612-11e8-a057-fe5f49266390
  annotations:
    argocd.argoproj.io/ignore-restart-policy: "true"
spec:
  containers:
  - command:
    - sh
    - -c
    - sleep 10
    image: alpine:3.21
    imagePullPolicy: Always
    name: main
    resources:
      requests:
        ephemeral-storage: "100Mi"
        memory: "128Mi"
        cpu: "250m"
      limits:
        memory: "256Mi"
        cpu: "500m"
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: default-token-f9jvj
      readOnly: true
  dnsPolicy: ClusterFirst
  nodeName: minikube
  restartPolicy: Never
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  automountServiceAccountToken: false
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: default-token-f9jvj
    secret:
      defaultMode: 420
      secretName: default-token-f9jvj
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: 2018-12-02T09:15:16Z
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: 2018-12-02T09:15:19Z
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: 2018-12-02T09:15:16Z
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://adc73c2c0ae3f1fd9bf294abd834e740042ee375de680c0cfcdd90d863a73b8b
    image: alpine:3.21
    imageID: docker.io/library/alpine@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c
    lastState: {}
    name: main
    ready: true
    restartCount: 0
    state:
      running:
        startedAt: 2018-12-02T09:15:19Z
  hostIP: 192.168.64.41
  phase: Running
  podIP: 172.17.0.9
  qosClass: BestEffort
  startTime: 2018-12-02T09:15:16Z
3 changes: 3 additions & 0 deletions pkg/sync/common/types.go
@@ -17,6 +17,9 @@ const (
// AnnotationKeyHookDeletePolicy is the policy of deleting a hook
AnnotationKeyHookDeletePolicy = "argocd.argoproj.io/hook-delete-policy"
AnnotationDeletionApproved = "argocd.argoproj.io/deletion-approved"
// AnnotationIgnoreRestartPolicy makes the health check ignore the pod's restart policy,
// which is useful for operator-managed pods that should be considered healthy while running
AnnotationIgnoreRestartPolicy = "argocd.argoproj.io/ignore-restart-policy"

// Sync option that disables dry run in resource is missing in the cluster
SyncOptionSkipDryRunOnMissingResource = "SkipDryRunOnMissingResource=true"
8 changes: 8 additions & 0 deletions pkg/sync/doc.go
@@ -65,6 +65,14 @@ The following policies define when the hook will be deleted.
- HookFailed - the hook resource is deleted after the hook failed.
- BeforeHookCreation - any existing hook resource is deleted before the new one is created

**Pods with restartPolicy Never or OnFailure**

During synchronization, the application may show the resource health as *Progressing* when it deploys a Pod that has the `restartPolicy` set to `Never` or `OnFailure` (see Kubernetes docs for restart policy). Generally, these resources behave like a Job and are expected to complete. This is intended behavior, since Jobs are commonly used for sync hooks and must finish before an application is considered *Healthy*.

A workaround is to use the annotation `argocd.argoproj.io/ignore-restart-policy: "true"`.

When this annotation is set on the Pod resource, the controller will ignore the `restartPolicy` and treat the *Running* phase as a healthy state.
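For illustration, a minimal Pod manifest using this workaround might look like the following; the name, image, and command are placeholders, and only the annotation and `restartPolicy` are relevant to the health check:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: long-running-worker            # placeholder name
  annotations:
    # Tell the health check to ignore the restart policy and assess the pod directly.
    argocd.argoproj.io/ignore-restart-policy: "true"
spec:
  restartPolicy: Never                 # would otherwise keep the application at Progressing
  containers:
  - name: main
    image: alpine:3.21                 # placeholder image
    command: ["sh", "-c", "sleep 3600"]
```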
Comment on lines +72 to +74
Member commented:

This behavior, combined with hooks, would be problematic, since hooks would be reported Healthy before their completion.

It is good to explain the current behavior, but it should be part of the health check documentation, not the hooks documentation.


# Sync Waves

The waves allow to group sync execution of syncing process into batches when each batch is executed sequentially one after