fix: pod with a restart policy of Never or OnFailure stuck at 'Progressing' (#15317) #709
base: master
Changes from all commits:
- 9ba91f3
- b0a3219
- 07cb6a0
- 9a9eb19
- 75c2fe7
- 2c955b6
- 8ef020a
- b216058
@@ -9,6 +9,7 @@ import (
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/kubectl/pkg/util/podutils"

+    "github.com/argoproj/gitops-engine/pkg/sync/common"
     "github.com/argoproj/gitops-engine/pkg/utils/kube"
 )
@@ -93,9 +94,9 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
         }

         return &HealthStatus{Status: HealthStatusDegraded, Message: ""}, nil
     case corev1.PodRunning:
-        switch pod.Spec.RestartPolicy {
-        case corev1.RestartPolicyAlways:
+        getHealthStatus := func(pod *corev1.Pod) (*HealthStatus, error) {
             // if pod is ready, it is automatically healthy
             if podutils.IsPodReady(pod) {
                 return &HealthStatus{
@@ -117,10 +118,20 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
                 Status:  HealthStatusProgressing,
                 Message: pod.Status.Message,
             }, nil
-        case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
-            // pods set with a restart policy of OnFailure or Never, have a finite life.
+        }
+        policy := pod.Spec.RestartPolicy
+        // If the pod has the AnnotationIgnoreRestartPolicy annotation or its restart policy is Always,
+        // then treat it as a long-running pod and check its health status.
+        if _, ok := pod.Annotations[common.AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways {
+            return getHealthStatus(pod)
+        }
+
+        if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever {
+            // Most pods set with a restart policy of OnFailure or Never have a finite life.
             // These pods are typically resource hooks. Thus, we consider these as Progressing
-            // instead of healthy.
+            // instead of healthy. If this is unwanted, e.g., when the pod is managed by an
+            // operator and therefore has a restart policy of OnFailure or Never, then use
+            // the AnnotationIgnoreRestartPolicy annotation.
             return &HealthStatus{
                 Status:  HealthStatusProgressing,
                 Message: pod.Status.Message,

Review comment (on the annotation check above): I don't think this feature is necessary with the usage of … If a controller is creating resources on which we cannot reliably assume application healthiness, then we should exclude those.
@@ -0,0 +1,87 @@
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: 2018-12-02T09:15:16Z
  name: my-pod
  namespace: argocd
  resourceVersion: "151053"
  selfLink: /api/v1/namespaces/argocd/pods/my-pod
  uid: c86e909c-f612-11e8-a057-fe5f49266390
  annotations:
    argocd.argoproj.io/ignore-restart-policy: "true"
spec:
  containers:
  - command:
    - sh
    - -c
    - sleep 10
    image: alpine:3.21
    imagePullPolicy: Always
    name: main
    resources:
      requests:
        ephemeral-storage: "100Mi"
        memory: "128Mi"
        cpu: "250m"
      limits:
        memory: "256Mi"
        cpu: "500m"
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: default-token-f9jvj
      readOnly: true
  dnsPolicy: ClusterFirst
  nodeName: minikube
  restartPolicy: Never
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  automountServiceAccountToken: false
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: default-token-f9jvj
    secret:
      defaultMode: 420
      secretName: default-token-f9jvj
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: 2018-12-02T09:15:16Z
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: 2018-12-02T09:15:19Z
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: 2018-12-02T09:15:16Z
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://adc73c2c0ae3f1fd9bf294abd834e740042ee375de680c0cfcdd90d863a73b8b
    image: alpine:3.21
    imageID: docker.io/library/alpine@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c
    lastState: {}
    name: main
    ready: true
    restartCount: 0
    state:
      running:
        startedAt: 2018-12-02T09:15:19Z
  hostIP: 192.168.64.41
  phase: Running
  podIP: 172.17.0.9
  qosClass: BestEffort
  startTime: 2018-12-02T09:15:16Z
@@ -65,6 +65,14 @@ The following policies define when the hook will be deleted.
 - HookFailed - the hook resource is deleted after the hook failed.
 - BeforeHookCreation - any existing hook resource is deleted before the new one is created

+**Pods with restartPolicy Never or OnFailure**
+
+During synchronization, the application may show a resource's health as *Progressing* when it deploys a Pod whose `restartPolicy` is set to `Never` or `OnFailure` (see the Kubernetes docs on restart policy). Such Pods generally behave like a Job and are expected to run to completion. This is intended behavior, since Jobs are commonly used as sync hooks and must finish before an application is considered *Healthy*.
+
+A workaround is to use the annotation `argocd.argoproj.io/ignore-restart-policy: "true"`.
+
+When this annotation is set on the Pod resource, the controller ignores the `restartPolicy` and treats a *Running* Pod as a valid healthy state.
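As an illustration of the documented workaround (not part of the PR's docs changes), a minimal hook-style Pod using the annotation might look like the sketch below; the Pod name is hypothetical, and only `metadata.annotations` and `spec.restartPolicy` matter for the health assessment:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: long-running-worker        # hypothetical example name
  annotations:
    # Tells the health check to ignore restartPolicy and judge the Pod on readiness.
    argocd.argoproj.io/ignore-restart-policy: "true"
spec:
  restartPolicy: Never             # would otherwise keep the Pod at Progressing while Running
  containers:
  - name: main
    image: alpine:3.21
    command: ["sh", "-c", "sleep 3600"]
```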
Review comment (on lines +72 to +74): This behavior, combined with hooks, would be problematic, since hooks would be Healthy before their completion. It is good to explain the current behavior, but it should be part of the health check documentation, not the hooks documentation.
 # Sync Waves

 The waves allow to group sync execution of syncing process into batches when each batch is executed sequentially one after another