Skip to content

Commit e08524d

Browse files
fix: Avoid controller crashes when running large number of workflows (argoproj#9691)
Signed-off-by: Yuan Tang <[email protected]>
1 parent 4158cf1 commit e08524d

File tree

4 files changed

+7
-2
lines changed

4 files changed

+7
-2
lines changed

.spelling

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ k8s-jobs
148148
kube
149149
kubelet
150150
kubernetes
151+
liveness
151152
localhost
152153
memoization
153154
memoized

docs/environment-variables.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,14 @@ most users. Environment variables may be removed at any time.
3333
| `GZIP_IMPLEMENTATION` | `string` | `PGZip` | The implementation of compression/decompression. Currently only "`PGZip`" and "`GZip`" are supported. |
3434
| `INFORMER_WRITE_BACK` | `bool` | `true` | Whether to write back to informer instead of catching up. |
3535
| `HEALTHZ_AGE` | `time.Duration` | `5m` | How old an un-reconciled workflow must be for the controller to report unhealthy. |
36+
| `HEALTHZ_LIST_LIMIT` | `int` | `200` | The maximum number of responses to return for a list call on workflows for the liveness check. |
3637
| `INDEX_WORKFLOW_SEMAPHORE_KEYS` | `bool` | `true` | Whether or not to index semaphores. |
3738
| `LEADER_ELECTION_IDENTITY` | `string` | Controller's `metadata.name` | The ID used for workflow controllers to elect a leader. |
3839
| `LEADER_ELECTION_DISABLE` | `bool` | `false` | Whether leader election should be disabled. |
3940
| `LEADER_ELECTION_LEASE_DURATION` | `time.Duration` | `15s` | The duration that non-leader candidates will wait to force acquire leadership. |
4041
| `LEADER_ELECTION_RENEW_DEADLINE` | `time.Duration` | `10s` | The duration that the acting master will retry refreshing leadership before giving up. |
4142
| `LEADER_ELECTION_RETRY_PERIOD` | `time.Duration` | `5s` | The duration that the leader election clients should wait between tries of actions. |
43+
| `LIST_LIMIT` | `int` | `200` | The maximum number of responses to return for a list call on workflows for the workflow informer. |
4244
| `MAX_OPERATION_TIME` | `time.Duration` | `30s` | The maximum time a workflow operation is allowed to run for before re-queuing the workflow onto the work queue. |
4345
| `OFFLOAD_NODE_STATUS_TTL` | `time.Duration` | `5m` | The TTL to delete the offloaded node status. Currently only used for testing. |
4446
| `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. |

workflow/controller/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,7 @@ func (wfc *WorkflowController) tweakListOptions(options *metav1.ListOptions) {
803803
labelSelector := labels.NewSelector().
804804
Add(util.InstanceIDRequirement(wfc.Config.InstanceID))
805805
options.LabelSelector = labelSelector.String()
806+
options.Limit = int64(env.LookupEnvIntOr("LIST_LIMIT", 200))
806807
}
807808

808809
func getWfPriority(obj interface{}) (int32, time.Time) {

workflow/controller/healthz.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ import (
1313
)
1414

1515
var (
16-
age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute)
16+
age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute)
17+
limit = int64(env.LookupEnvIntOr("HEALTHZ_LIST_LIMIT", 200))
1718
)
1819

1920
// https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-http-request
@@ -30,7 +31,7 @@ func (wfc *WorkflowController) Healthz(w http.ResponseWriter, r *http.Request) {
3031
labelSelector := "!" + common.LabelKeyPhase + "," + instanceIDSelector
3132
err := func() error {
3233
// avoid problems with informers by directly querying the API
33-
list, err := wfc.wfclientset.ArgoprojV1alpha1().Workflows(wfc.managedNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
34+
list, err := wfc.wfclientset.ArgoprojV1alpha1().Workflows(wfc.managedNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector, Limit: limit})
3435
if err != nil {
3536
return err
3637
}

0 commit comments

Comments
 (0)