Don't try to clean up Pending PipelineRuns

chmouel · chmouel · commit 8e4dc4c4b2c3 · 2024-03-19T16:25:08.000+01:00
When using the max-keep-runs annotation with concurrency, the cleanup process on the watcher was trying to clean up Pending PipelineRuns. This would cause the watcher to get stuck and be unable to process its queue. We now skip them the same way we do for Running PipelineRuns. Improved TestGithubSecondPullRequestConcurrencyMultiplePR to include max-keep-runs and /retest over concurrency. Increased the timeout for running the e2e tests since that test is getting a bit slower. Jira: https://issues.redhat.com/browse/SRVKP-4266 Signed-off-by: Chmouel Boudjnah <chmouel@redhat.com>
diff --git a/Makefile b/Makefile
@@ -7,7 +7,7 @@ LDFLAGS=
 OUTPUT_DIR=bin
 GO           = go
 TIMEOUT_UNIT = 20m
-TIMEOUT_E2E  = 30m
+TIMEOUT_E2E  = 45m
 GO_TEST_FLAGS +=
 SHELL := bash
 
diff --git a/docs/content/docs/guide/cleanups.md b/docs/content/docs/guide/cleanups.md
@@ -4,17 +4,23 @@ weight: 8
 ---
 # PipelineRuns Cleanups
 
-There can be many PipelineRuns into a user namespace and Pipelines-as-Code
-has the ability to only keep several PipelineRuns that matches an event.
+There can be many PipelineRuns in a user namespace, and Pipelines-as-Code has
+the ability to keep only a certain number of PipelineRuns and clean up the
+older ones.
 
-For example if the PipelineRun has this annotation :
+When your PipelineRun has this annotation:
 
 ```yaml
 pipelinesascode.tekton.dev/max-keep-runs: "maxNumber"
 ```
 
-Pipelines-as-Code sees this and will start cleaning up right after it finishes a
-successful execution keeping only the maxNumber of PipelineRuns.
+Pipelines-as-Code sees this and will start cleaning up right after one of the
+PipelineRuns finishes with a successful execution, keeping only the last
+`maxNumber` of PipelineRuns.
 
-It will skip the `Running` PipelineRuns but will not skip the PipelineRuns with
-`Unknown` status.
+It will skip the `Running` or `Pending` PipelineRuns but will not skip the
+PipelineRuns with `Unknown` status.
+
+{{< hint info >}}
+This setting can also be configured globally for a cluster via the [Pipelines-as-Code ConfigMap]({{< relref "/docs/install/settings.md" >}}).
+{{< /hint >}}
diff --git a/pkg/kubeinteraction/cleanups.go b/pkg/kubeinteraction/cleanups.go
@@ -31,8 +31,9 @@ func (k Interaction) CleanupPipelines(ctx context.Context, logger *zap.SugaredLo
 	}
 
 	for c, prun := range psort.PipelineRunSortByCompletionTime(pruns.Items) {
-		if prun.GetStatusCondition().GetCondition(apis.ConditionSucceeded).GetReason() == "Running" {
-			logger.Infof("skipping %s since currently running", prun.GetName())
+		prReason := prun.GetStatusCondition().GetCondition(apis.ConditionSucceeded).GetReason() // reason of the Succeeded condition
+		if prReason == tektonv1.PipelineRunReasonRunning.String() || prReason == tektonv1.PipelineRunReasonPending.String() {
+			logger.Infof("skipping cleaning PipelineRun %s since the conditions.reason is %s", prun.GetName(), prReason)
 			continue
 		}
 
diff --git a/pkg/kubeinteraction/cleanups_test.go b/pkg/kubeinteraction/cleanups_test.go
@@ -83,6 +83,22 @@ func TestCleanupPipelines(t *testing.T) {
 				prunLatestInList: "pipeline-running",
 			},
 		},
+		{
+			name: "cleanup-skip-pending",
+			args: args{
+				namespace:      ns,
+				repositoryName: cleanupRepoName,
+				maxKeep:        1,
+				kept:           1, // see my comment in code why only 1 is kept.
+				prunCurrent:    &tektonv1.PipelineRun{ObjectMeta: metav1.ObjectMeta{Labels: cleanupLabels, Annotations: cleanupAnnotations}},
+				pruns: []*tektonv1.PipelineRun{
+					tektontest.MakePRCompletion(clock, "pipeline-pending", ns, tektonv1.PipelineRunReasonPending.String(), nil, cleanupLabels, 10),
+					tektontest.MakePRCompletion(clock, "pipeline-toclean", ns, tektonv1.PipelineRunReasonSuccessful.String(), nil, cleanupLabels, 30),
+					tektontest.MakePRCompletion(clock, "pipeline-tokeep", ns, tektonv1.PipelineRunReasonSuccessful.String(), nil, cleanupLabels, 20),
+				},
+				prunLatestInList: "pipeline-pending",
+			},
+		},
 		{
 			name: "cleanup with secrets",
 			args: args{
diff --git a/test/github_pullrequest_concurrency_multiplepr_test.go b/test/github_pullrequest_concurrency_multiplepr_test.go
@@ -0,0 +1,138 @@
+//go:build e2e
+// +build e2e
+
+package test
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/google/go-github/v56/github"
+	"github.com/openshift-pipelines/pipelines-as-code/pkg/random"
+	tgithub "github.com/openshift-pipelines/pipelines-as-code/test/pkg/github"
+	"github.com/openshift-pipelines/pipelines-as-code/test/pkg/options"
+	"github.com/openshift-pipelines/pipelines-as-code/test/pkg/payload"
+	"github.com/openshift-pipelines/pipelines-as-code/test/pkg/repository"
+	tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1"
+	"github.com/tektoncd/pipeline/pkg/names"
+	"gotest.tools/v3/assert"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// TestGithubSecondPullRequestConcurrencyMultiplePR exercises concurrency for the same Repository across multiple
+// pull requests, including a /retest and max-keep-runs; it may be a bit slow (180s at least) but it's worth it.
+func TestGithubSecondPullRequestConcurrencyMultiplePR(t *testing.T) {
+	ctx := context.Background()
+	label := "Github Multiple PullRequest Concurrency-1 MaxKeepRun-1 Multiple"
+	numberOfPullRequest := 3
+	numberOfPipelineRuns := 3
+	numberOfRetests := 1
+	maxNumberOfConcurrentPipelineRuns := 1
+	maxKeepRun := 1
+	allPipelinesRunsCnt := (numberOfPullRequest * numberOfPipelineRuns) + (numberOfPullRequest * numberOfRetests * numberOfPipelineRuns)
+	allPipelinesRunAfterCleanUp := allPipelinesRunsCnt / (maxKeepRun + 1)
+	loopMax := 35
+
+	targetNS := names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("pac-e2e-ns")
+	_, runcnx, opts, ghcnx, err := tgithub.Setup(ctx, true, false)
+	assert.NilError(t, err)
+
+	runcnx.Clients.Log.Infof("Starting %d pipelineruns, (numberOfPullRequest=%d*numberOfPipelineRuns=%d) + (numberOfPullRequest=%d*numberOfRetests=%d*numberOfPipelineRuns=%d) Should end after clean up (maxKeepRun=%d) with %d",
+		allPipelinesRunsCnt, numberOfPullRequest, numberOfPipelineRuns, numberOfPullRequest, numberOfRetests, numberOfPipelineRuns, maxKeepRun, allPipelinesRunAfterCleanUp)
+
+	repoinfo, resp, err := ghcnx.Client.Repositories.Get(ctx, opts.Organization, opts.Repo)
+	assert.NilError(t, err)
+	if resp != nil && resp.StatusCode == http.StatusNotFound {
+		t.Errorf("Repository %s not found in %s", opts.Repo, opts.Organization)
+	}
+	// set concurrency (opts is then passed to CreateCRD)
+	opts.Concurrency = maxNumberOfConcurrentPipelineRuns
+	err = tgithub.CreateCRD(ctx, t, repoinfo, runcnx, opts, targetNS)
+	assert.NilError(t, err)
+
+	allPullRequests := make([]int, 0, numberOfPullRequest)
+	for prc := 0; prc < numberOfPullRequest; prc++ {
+		branchName := names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("branch")
+		logmsg := fmt.Sprintf("Testing %s with Github APPS integration branch %s namespace %s", label, branchName, targetNS)
+		yamlFiles := map[string]string{}
+		randomAlphaString := strings.ToLower(random.AlphaString(4))
+		for i := 1; i <= numberOfPipelineRuns; i++ {
+			yamlFiles[fmt.Sprintf(".tekton/prlongrunnning-%s-%d.yaml", randomAlphaString, i)] = "testdata/pipelinerun_long_running_maxkeep_run.yaml"
+		}
+
+		entries, err := payload.GetEntries(yamlFiles, targetNS, options.MainBranch, "pull_request", map[string]string{
+			"MaxKeepRun": fmt.Sprint(maxKeepRun),
+		})
+		assert.NilError(t, err)
+
+		targetRefName := fmt.Sprintf("refs/heads/%s",
+			names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("pac-e2e-test"))
+
+		sha, vref, err := tgithub.PushFilesToRef(ctx, ghcnx.Client, logmsg, repoinfo.GetDefaultBranch(), targetRefName,
+			opts.Organization, opts.Repo, entries)
+		assert.NilError(t, err)
+		runcnx.Clients.Log.Infof("Commit %s has been created and pushed to %s", sha, vref.GetURL())
+
+		prNumber, err := tgithub.PRCreate(ctx, runcnx, ghcnx, opts.Organization, opts.Repo, targetRefName, repoinfo.GetDefaultBranch(), logmsg)
+		assert.NilError(t, err)
+
+		defer tgithub.TearDown(ctx, t, runcnx, ghcnx, prNumber, targetRefName, targetNS, opts) // runs when the test returns, not per iteration
+		allPullRequests = append(allPullRequests, prNumber)
+	}
+
+	// send some retest to spice things up on concurrency and test the maxKeepRun
+	for i := 0; i < numberOfRetests; i++ {
+		for _, prNumber := range allPullRequests {
+			_, _, err := ghcnx.Client.Issues.CreateComment(ctx,
+				opts.Organization,
+				opts.Repo, prNumber,
+				&github.IssueComment{Body: github.String("/retest")})
+			assert.NilError(t, err)
+		}
+	}
+
+	// poll until no PipelineRun is left without conditions, or with an Unknown status or Pending reason
+	finished := false
+	for i := 0; i < loopMax; i++ {
+		unsuccessful := 0
+		prs, err := runcnx.Clients.Tekton.TektonV1().PipelineRuns(targetNS).List(ctx, metav1.ListOptions{})
+		assert.NilError(t, err)
+		for _, pr := range prs.Items {
+			if pr.Status.GetConditions() == nil {
+				unsuccessful++
+				continue
+			}
+			for _, condition := range pr.Status.GetConditions() {
+				if condition.Status == "Unknown" || condition.GetReason() == tektonv1.PipelineRunSpecStatusPending {
+					unsuccessful++
+					break // count each PipelineRun at most once
+				}
+			}
+		}
+		if unsuccessful == 0 {
+			finished = true
+			break
+		}
+		runcnx.Clients.Log.Infof("number of unsuccessful PR %d out of %d, waiting 10s more, %d/%d", unsuccessful, allPipelinesRunsCnt, i, loopMax)
+		// it's high because it takes time to process on kind
+		time.Sleep(10 * time.Second)
+	}
+	if !finished {
+		t.Errorf("we didn't get %d pipelineruns as successful, some of them are still pending or it's abnormally slow to process the Q", allPipelinesRunsCnt)
+	}
+
+	prs, err := runcnx.Clients.Tekton.TektonV1().PipelineRuns(targetNS).List(ctx, metav1.ListOptions{})
+	assert.NilError(t, err)
+	assert.Equal(t, len(prs.Items), allPipelinesRunAfterCleanUp, "we should have had %d kept after cleanup, we got %d", allPipelinesRunAfterCleanUp, len(prs.Items))
+
+	runcnx.Clients.Log.Infof("success: number of cleaned PR is %d we expected to have %d after the cleanup", len(prs.Items), allPipelinesRunAfterCleanUp)
+
+	if os.Getenv("TEST_NOCLEANUP") != "true" {
+		repository.NSTearDown(ctx, t, runcnx, targetNS)
+	}
+}
diff --git a/test/testdata/pipelinerun_long_running_maxkeep_run.yaml b/test/testdata/pipelinerun_long_running_maxkeep_run.yaml
@@ -0,0 +1,22 @@
+---  # e2e fixture: a PipelineRun whose single task sleeps 10s and that carries a max-keep-runs annotation
+apiVersion: tekton.dev/v1beta1
+kind: PipelineRun
+metadata:
+  name: "\\ .PipelineName //"  # "\\ ... //" placeholders are presumably filled in by the test payload helper; confirm
+  annotations:
+    pipelinesascode.tekton.dev/target-namespace: "\\ .TargetNamespace //"
+    pipelinesascode.tekton.dev/on-target-branch: "[\\ .TargetBranch //]"
+    pipelinesascode.tekton.dev/on-event: "[\\ .TargetEvent //]"
+    pipelinesascode.tekton.dev/max-keep-runs: "\\ .MaxKeepRun //"  # how many PipelineRuns to keep after cleanup
+spec:
+  pipelineSpec:
+    tasks:
+      - name: task
+        taskSpec:
+          steps:
+            - name: task
+              image: registry.access.redhat.com/ubi9/ubi-micro
+              script: |
+                echo "hello pipeline"
+                sleep 10
+                exit 0

Original file line number	Diff line number	Diff line change
`@@ -31,8 +31,9 @@ func (k Interaction) CleanupPipelines(ctx context.Context, logger *zap.SugaredLo`
`31`	`31`	`}`
`32`	`32`
`33`	`33`	`for c, prun := range psort.PipelineRunSortByCompletionTime(pruns.Items) {`
`34`		`- if prun.GetStatusCondition().GetCondition(apis.ConditionSucceeded).GetReason() == "Running" {`
`35`		`- logger.Infof("skipping %s since currently running", prun.GetName())`
	`34`	`+ prReason := prun.GetStatusCondition().GetCondition(apis.ConditionSucceeded).GetReason()`
	`35`	`+ if prReason == tektonv1.PipelineRunReasonRunning.String() \|\| prReason == tektonv1.PipelineRunReasonPending.String() {`
	`36`	`+ logger.Infof("skipping cleaning PipelineRun %s since the conditions.reason is %s", prun.GetName(), prReason)`
`36`	`37`	`continue`
`37`	`38`	`}`
`38`	`39`