Skip to content

Commit 78cbd01

Browse files
committed
Additional updates for KEP-3329
1 parent bd723c0 commit 78cbd01

File tree

1 file changed

+38
-37
lines changed
  • keps/sig-apps/3329-retriable-and-non-retriable-failures

1 file changed

+38
-37
lines changed

keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md

Lines changed: 38 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -401,8 +401,7 @@ spec:
401401
rules:
402402
- action: Ignore
403403
onPodConditions:
404-
operator: In
405-
values: [ DisruptionTarget ]
404+
- type: DisruptionTarget
406405
```
407406

408407
Note that, in this case the user supplies a list of Pod condition type values.
@@ -752,15 +751,16 @@ type PodFailurePolicyAction string
752751
const (
753752
// This is an action which might be taken on a pod failure - mark the
754753
// pod's job as Failed and terminate all running pods.
755-
PodFailurePolicyActionTerminate PodFailurePolicyAction = "Terminate"
754+
PodFailurePolicyActionFailJob PodFailurePolicyAction = "FailJob"
756755
757756
// This is an action which might be taken on a pod failure - the counter towards
758-
// .backoffLimit is not incremented and a replacement pod is created.
757+
// .backoffLimit, represented by the job's .status.failed field, is not
758+
// incremented and a replacement pod is created.
759759
PodFailurePolicyActionIgnore PodFailurePolicyAction = "Ignore"
760760
761761
// This is an action which might be taken on a pod failure - the pod failure
762-
// is handled in the default way - the counter towards .backoffLimit is
763-
// incremented.
762+
// is handled in the default way - the counter towards .backoffLimit,
763+
// represented by the job's .status.failed field, is incremented.
764764
PodFailurePolicyActionCount PodFailurePolicyAction = "Count"
765765
)
766766
@@ -772,15 +772,20 @@ const (
772772
)
773773
774774
// PodFailurePolicyOnExitCodesRequirement describes the requirement for handling
775-
// a failed pod based on its container exit codes.
775+
// a failed pod based on its container exit codes. In particular, it looks up the
776+
// .state.terminated.exitCode for each app container and init container status,
777+
// represented by the .status.containerStatuses and .status.initContainerStatuses
778+
// fields in the Pod status, respectively. Containers completed with success
779+
// (exit code 0) are excluded from the requirement check.
776780
type PodFailurePolicyOnExitCodesRequirement struct {
777781
// Restricts the check for exit codes to the container with the
778782
// specified name. When null, the rule applies to all containers.
779783
// +optional
780784
ContainerName *string
781785
782786
// Represents the relationship between the container exit code(s) and the
783-
// specified values. Possible values are:
787+
// specified values. Containers completed with success (exit code 0) are
788+
// excluded from the requirement check. Possible values are:
784789
// - In: the requirement is satisfied if at least one container exit code
785790
// (might be multiple if there are multiple containers not restricted
786791
// by the 'containerName' field) is in the set of specified values.
@@ -791,39 +796,26 @@ type PodFailurePolicyOnExitCodesRequirement struct {
791796
792797
// Specifies the set of values. Each returned container exit code (might be
793798
// multiple in case of multiple containers) is checked against this set of
794-
// values with respect to the operator.
799+
// values with respect to the operator. Value '0' cannot be used for the In
800+
// operator.
795801
// +listType=set
796802
Values []int32
797803
}
798804
799-
type PodFailurePolicyOnPodConditionsOperator string
800-
801-
const (
802-
PodFailurePolicyOnPodConditionsOpIn PodFailurePolicyOnPodConditionsOperator = "In"
803-
)
804-
805-
// PodFailurePolicyOnPodConditionsRequirement describes the requirement for handling
806-
// a failed pod based on its conditions.
807-
type PodFailurePolicyOnPodConditionsRequirement struct {
808-
// Represents the relationship between the set of actual Pod condition types
809-
// and the set of specified Pod condition types. Possible values are:
810-
// - In: the requirement is satisfied, if at least one actual Pod condition
811-
// type (for a condition with status=True) is present in the set of
812-
// specified Pod condition types.
813-
Operator PodFailurePolicyOnPodConditionsOperator
814-
815-
// Specifies the set of values. Each actual pod condition type,
816-
// with status=True, is checked against this set with respect to the operator.
817-
// +listType=set
818-
Values []api.PodConditionType
805+
// PodFailurePolicyOnPodConditionsPattern describes a pattern for matching
806+
// an actual pod condition type.
807+
type PodFailurePolicyOnPodConditionsPattern struct {
808+
// Specifies the required Pod condition type. The pattern matches a pod condition
809+
// if the specified type equals the pod condition type.
810+
Type api.PodConditionType
819811
}
820812
821813
// PodFailurePolicyRule describes how a pod failure is handled when the requirements are met.
822814
// Only one of OnExitCodes and onPodConditions can be used in each rule.
823815
type PodFailurePolicyRule struct {
824816
// Specifies the action taken on a pod failure when the requirements are satisfied.
825817
// Possible values are:
826-
// - Terminate: indicates that the pod's job is marked as Failed and all
818+
// - FailJob: indicates that the pod's job is marked as Failed and all
827819
// running pods are terminated.
828820
// - Ignore: indicates that the counter towards the .backoffLimit is not
829821
// incremented and a replacement pod is created.
@@ -835,9 +827,11 @@ type PodFailurePolicyRule struct {
835827
// +optional
836828
OnExitCodes *PodFailurePolicyOnExitCodesRequirement
837829
838-
// Represents the requirement on the pod conditions.
839-
// +optional
840-
OnPodConditions *PodFailurePolicyOnPodConditionsRequirement
830+
// Represents the requirement on the pod conditions. The requirement is represented
831+
// as a list of pod condition patterns. The requirement is satisfied if at
832+
// least one pattern matches an actual pod condition.
833+
// +listType=atomic
834+
OnPodConditions []PodFailurePolicyOnPodConditionsPattern
841835
}
842836
843837
// PodFailurePolicy describes how failed pods influence the backoffLimit.
@@ -857,8 +851,13 @@ type JobSpec struct {
857851
// Specifies the policy of handling failed pods. In particular, it allows you to
858852
// specify the set of actions and conditions which need to be
859853
// satisfied to take the associated action.
860-
// If empty, the default behaviour applies - the counter of pod failed is
861-
// incremented and it is checked against the backoffLimit.
854+
// If empty, the default behaviour applies - the counter of failed pods,
855+
// represented by the job's .status.failed field, is incremented and it is
856+
// checked against the backoffLimit. This field cannot be used in combination
857+
// with restartPolicy=OnFailure.
858+
//
859+
// This field is alpha-level. To use this field, you must enable the
860+
// `JobPodFailurePolicy` feature gate (disabled by default).
862861
// +optional
863862
PodFailurePolicy *PodFailurePolicy
864863
...
@@ -899,8 +898,7 @@ spec:
899898
values: [1,2,3]
900899
- action: Ignore
901900
onPodConditions:
902-
operator: In
903-
values: [ DisruptionTarget ]
901+
- type: DisruptionTarget
904902
```
905903
906904
### Evaluation
@@ -1086,6 +1084,9 @@ Below are some examples to consider, in addition to the aforementioned [maturity
10861084
indicating that a pod should be retried (see: [Evolving condition types](#evolving-condition-types))
10871085
- Simplify the code in job controller responsible for detection of failed pods
10881086
based on the fix for pods stuck in the running phase (see: [Marking pods as Failed](#marking-pods-as-failed))
1087+
- Commonize the code for appending pod conditions between components
1088+
- Do not update the pod disruption condition (with type=`DisruptionTarget`) if
1089+
it is already present with `status=True`
10891090
- Review and implement if feasible adding of pod conditions with the use of
10901091
[SSA](https://kubernetes.io/docs/reference/using-api/server-side-apply/) client.
10911092
- The feature flag enabled by default

0 commit comments

Comments
 (0)