Check build completion solely on workflow complete webhook events

Kobzol · Kobzol · commit 3dab86d6232c · 2025-07-18T14:43:37.000+02:00
diff --git a/docs/design.md b/docs/design.md
@@ -49,10 +49,13 @@ repository:
 (based on the `timeout` configured for the repository), it will cancel them.
 - Reload user permissions from the Team API.
 - Reload `rust-bors.toml` config for the repository from its main branch.
+- Reload the mergeability status of open PRs from GitHub.
+- Sync the status of PRs between the DB and GitHub.
+- Run the merge queue.
 
 ## Concurrency
 The bot is currently listening for GitHub webhooks concurrently, however it handles all commands serially, to avoid
-race conditions. This limitation is expected to be lifted in the future.
+race conditions. This limitation might be lifted in the future.
 
 ## Try builds
 A try build means that you execute a specific CI job on a PR (without merging the PR), to test if the job passes C
@@ -202,6 +205,13 @@ With [homu](https://github.com/rust-lang/homu) (the old bors implementation), Gi
 to use a "fake" job that marked the whole CI workflow as succeeded or failed, to signal to bors if it should consider
 the workflow to be OK or not.
 
-The new bors uses a different approach. It asks GitHub which [check suites](https://docs.github.com/en/rest/checks/suites)
-are attached to a given commit, and then it waits until all of these check suites complete (or until a timeout is reached).
-Thanks to this approach, there is no need to introduce fake CI jobs.
+The new bors uses a different approach. In an earlier implementation, it used to ask GitHub which [check suites](https://docs.github.com/en/rest/checks/suites) were attached to a given commit, and then it waited until all of these check suites were completed (or until a timeout was reached). However, this was too "powerful" for rust-lang/rust, where we currently have only a single CI workflow per commit, and it was producing certain race conditions, where GitHub would send us a webhook that a check suite was completed, but when we then asked the GitHub API about the status of the check suite, it was still marked as pending.
+
+To make the implementation more robust, it now behaves as follows:
+- We listen for the workflow completed webhook.
+- When it is received, we ask GitHub for all the workflows attached to the workflow's check suite.
+- If all of them are completed, we mark the build as finished.
+
+Note that we explicitly do not read the "Check suite was completed" webhook, because it can actually be received *before* a webhook that tells us that the last workflow of that check suite was completed. If that happens, we could mark a build as completed without knowing the final conclusion of its workflows. That is not a big problem, but it would mean that we sometimes cannot post the real status of a workflow in the "build completed" bors comment on GitHub. So instead we just listen for the completed workflows.
+
+In any case, with the new bors there is no need to introduce fake conclusion CI jobs.
diff --git a/src/bors/event.rs b/src/bors/event.rs
@@ -31,9 +31,9 @@ pub enum BorsRepositoryEvent {
     /// when a branch is deleted or when a tag is deleted.
     PushToBranch(PushToBranch),
     /// A workflow run on Github Actions or a check run from external CI system has been started.
-    WorkflowStarted(WorkflowStarted),
+    WorkflowStarted(WorkflowRunStarted),
     /// A workflow run on Github Actions or a check run from external CI system has been completed.
-    WorkflowCompleted(WorkflowCompleted),
+    WorkflowCompleted(WorkflowRunCompleted),
 }
 
 impl BorsRepositoryEvent {
@@ -162,7 +162,7 @@ pub struct PushToBranch {
 }
 
 #[derive(Debug)]
-pub struct WorkflowStarted {
+pub struct WorkflowRunStarted {
     pub repository: GithubRepoName,
     pub name: String,
     pub branch: String,
@@ -173,7 +173,7 @@ pub struct WorkflowStarted {
 }
 
 #[derive(Debug)]
-pub struct WorkflowCompleted {
+pub struct WorkflowRunCompleted {
     pub repository: GithubRepoName,
     pub branch: String,
     pub commit_sha: CommitSha,
diff --git a/src/bors/handlers/workflow.rs b/src/bors/handlers/workflow.rs
@@ -5,20 +5,19 @@ use octocrab::params::checks::CheckRunConclusion;
 use octocrab::params::checks::CheckRunStatus;
 
 use crate::PgDbClient;
-use crate::bors::CheckSuiteStatus;
-use crate::bors::RepositoryState;
 use crate::bors::comment::{try_build_succeeded_comment, workflow_failed_comment};
-use crate::bors::event::{WorkflowCompleted, WorkflowStarted};
+use crate::bors::event::{WorkflowRunCompleted, WorkflowRunStarted};
 use crate::bors::handlers::is_bors_observed_branch;
 use crate::bors::handlers::labels::handle_label_trigger;
 use crate::bors::merge_queue::AUTO_BRANCH_NAME;
 use crate::bors::merge_queue::MergeQueueSender;
+use crate::bors::{RepositoryState, WorkflowRun};
 use crate::database::{BuildStatus, WorkflowStatus};
 use crate::github::LabelTrigger;
 
 pub(super) async fn handle_workflow_started(
     db: Arc<PgDbClient>,
-    payload: WorkflowStarted,
+    payload: WorkflowRunStarted,
 ) -> anyhow::Result<()> {
     if !is_bors_observed_branch(&payload.branch) {
         return Ok(());
@@ -67,7 +66,7 @@ pub(super) async fn handle_workflow_started(
 pub(super) async fn handle_workflow_completed(
     repo: Arc<RepositoryState>,
     db: Arc<PgDbClient>,
-    mut payload: WorkflowCompleted,
+    mut payload: WorkflowRunCompleted,
     merge_queue_tx: &MergeQueueSender,
 ) -> anyhow::Result<()> {
     if !is_bors_observed_branch(&payload.branch) {
@@ -95,14 +94,17 @@ pub(super) async fn handle_workflow_completed(
     db.update_workflow_status(*payload.run_id, payload.status)
         .await?;
 
-    try_complete_build(repo.as_ref(), db.as_ref(), payload, merge_queue_tx).await
+    maybe_complete_build(repo.as_ref(), db.as_ref(), payload, merge_queue_tx).await
 }
 
-/// Try to complete a pending build.
-async fn try_complete_build(
+/// Attempt to complete a pending build after a workflow run has been completed.
+/// We assume that the status of the completed workflow run has already been updated in the
+/// database.
+/// We also assume that there is only a single check suite attached to a single build of a commit.
+async fn maybe_complete_build(
     repo: &RepositoryState,
     db: &PgDbClient,
-    payload: WorkflowCompleted,
+    payload: WorkflowRunCompleted,
     merge_queue_tx: &MergeQueueSender,
 ) -> anyhow::Result<()> {
     if !is_bors_observed_branch(&payload.branch) {
@@ -134,41 +136,50 @@ async fn try_complete_build(
         return Ok(());
     };
 
-    // Ask GitHub what are all the check suites attached to the given commit.
-    // This tells us for how many workflows we should wait.
-    let checks = repo
-        .client
-        .get_check_suites_for_commit(&payload.branch, &payload.commit_sha)
-        .await?;
+    // Load the workflow runs that we know about from the DB. We know about workflow runs for
+    // which we have received a started or a completed event.
+    let mut db_workflow_runs = db.get_workflows_for_build(&build).await?;
+    tracing::debug!("Workflow runs from DB: {db_workflow_runs:?}");
 
-    // Some checks are still running, let's wait for the next event
-    if checks
-        .iter()
-        .any(|check| matches!(check.status, CheckSuiteStatus::Pending))
     {
-        tracing::debug!("Some check suites are still pending: {checks:?}");
-        return Ok(());
+        // Ask GitHub about all workflow runs attached to the check suite of the completed workflow run.
+        // This tells us for how many workflow runs we should wait.
+        // We assume that this number is final, and after a single workflow run has been completed, no
+        // other workflow runs attached to the check suite can appear out of nowhere.
+        // Note: we actually only need the number of workflow runs in this function, but we download
+        // some data about them to have better logging.
+        let gh_workflow_runs: Vec<WorkflowRun> = repo
+            .client
+            .get_workflow_runs_for_check_suite(payload.check_suite_id)
+            .await?;
+        tracing::debug!("Workflow runs from GitHub: {gh_workflow_runs:?}");
+
+        // This could happen if a workflow run webhook is lost, or if one workflow run manages to finish
+        // before another workflow run even manages to start. It should be rare.
+        // We will wait for the next workflow run completed webhook.
+        if db_workflow_runs.len() < gh_workflow_runs.len() {
+            tracing::warn!("Workflow count mismatch, waiting for the next webhook");
+            return Ok(());
+        }
     }
 
-    let has_failure = checks
+    // We have all expected workflow runs in the DB, but some of them are still pending.
+    // Wait for the next workflow run to be finished.
+    if db_workflow_runs
         .iter()
-        .any(|check| matches!(check.status, CheckSuiteStatus::Failure));
-
-    let mut workflows = db.get_workflows_for_build(&build).await?;
-    workflows.sort_by(|a, b| a.name.cmp(&b.name));
-
-    // If this happens, there is a race condition in GH webhooks and we haven't received a workflow
-    // finished/failed event for some workflow yet. In this case, wait for that event before
-    // posting the PR comment.
-    if workflows.len() < checks.len()
-        || workflows
-            .iter()
-            .any(|w| w.status == WorkflowStatus::Pending)
+        .any(|w| w.status == WorkflowStatus::Pending)
     {
-        tracing::warn!("All checks are finished, but some workflows are still pending");
+        tracing::info!("Some workflows are not finished yet, waiting for the next webhook.");
         return Ok(());
     }
 
+    // Below this point, we assume that the build has completed, because all workflow runs attached
+    // to the corresponding check suite are completed.
+
+    let has_failure = db_workflow_runs
+        .iter()
+        .any(|check| matches!(check.status, WorkflowStatus::Failure));
+
     let (status, trigger) = if has_failure {
         (BuildStatus::Failure, LabelTrigger::TryBuildFailed)
     } else {
@@ -194,12 +205,13 @@ async fn try_complete_build(
         }
     }
 
+    db_workflow_runs.sort_by(|a, b| a.name.cmp(&b.name));
     let message = if !has_failure {
         tracing::info!("Workflow succeeded");
-        try_build_succeeded_comment(&workflows, payload.commit_sha, &build)
+        try_build_succeeded_comment(&db_workflow_runs, payload.commit_sha, &build)
     } else {
         tracing::info!("Workflow failed");
-        workflow_failed_comment(&workflows)
+        workflow_failed_comment(&db_workflow_runs)
     };
     repo.client.post_comment(pr.number, message).await?;
 
diff --git a/src/bors/mod.rs b/src/bors/mod.rs
@@ -7,7 +7,7 @@ pub use command::RollupMode;
 pub use comment::Comment;
 pub use context::BorsContext;
 pub use handlers::{handle_bors_global_event, handle_bors_repository_event};
-use octocrab::models::CheckSuiteId;
+use octocrab::models::RunId;
 use serde::Serialize;
 
 use crate::config::RepositoryConfig;
@@ -25,6 +25,7 @@ mod handlers;
 pub mod merge_queue;
 pub mod mergeable_queue;
 
+use crate::database::WorkflowStatus;
 pub use command::CommandPrefix;
 
 #[cfg(test)]
@@ -39,19 +40,11 @@ pub static WAIT_FOR_PR_STATUS_REFRESH: TestSyncMarker = TestSyncMarker::new();
 #[cfg(test)]
 pub static WAIT_FOR_WORKFLOW_STARTED: TestSyncMarker = TestSyncMarker::new();
 
-#[derive(Clone, Debug, PartialEq, Eq)]
-pub enum CheckSuiteStatus {
-    Pending,
-    Failure,
-    Success,
-}
-
-/// A GitHub check suite.
-/// Corresponds to a single GitHub actions workflow run, or to a single external CI check run.
+/// Corresponds to a single execution of a workflow.
 #[derive(Clone, Debug)]
-pub struct CheckSuite {
-    pub id: CheckSuiteId,
-    pub status: CheckSuiteStatus,
+pub struct WorkflowRun {
+    pub id: RunId,
+    pub status: WorkflowStatus,
 }
 
 /// An access point to a single repository.
diff --git a/src/database/mod.rs b/src/database/mod.rs
@@ -387,7 +387,7 @@ pub enum WorkflowType {
 }
 
 /// Status of a workflow.
-#[derive(Copy, Clone, Debug, PartialEq, sqlx::Type)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, sqlx::Type)]
 #[sqlx(type_name = "TEXT")]
 #[sqlx(rename_all = "lowercase")]
 pub enum WorkflowStatus {
@@ -400,6 +400,7 @@ pub enum WorkflowStatus {
 }
 
 /// Represents a workflow run, coming either from Github Actions or from some external CI.
+#[derive(Debug)]
 pub struct WorkflowModel {
     pub id: PrimaryKey,
     /// The build this workflow is associated with.
diff --git a/src/github/api/client.rs b/src/github/api/client.rs
@@ -7,16 +7,17 @@ use std::fmt::Debug;
 use tracing::log;
 
 use crate::bors::event::PullRequestComment;
-use crate::bors::{CheckSuite, CheckSuiteStatus, Comment};
+use crate::bors::{Comment, WorkflowRun};
 use crate::config::{CONFIG_FILE_PATH, RepositoryConfig};
-use crate::database::RunId;
+use crate::database::{RunId, WorkflowStatus};
 use crate::github::api::base_github_html_url;
 use crate::github::api::operations::{
     ForcePush, MergeError, create_check_run, merge_branches, set_branch_to_commit, update_check_run,
 };
 use crate::github::{CommitSha, GithubRepoName, PullRequest, PullRequestNumber};
 use crate::utils::timing::{measure_network_request, perform_network_request_with_retry};
 use futures::TryStreamExt;
+use octocrab::models::workflows::Run;
 use serde::de::DeserializeOwned;
 
 /// Provides access to a single app installation (repository) using the GitHub API.
@@ -200,62 +201,57 @@ impl GithubRepositoryClient {
         .await
     }
 
-    /// Find all check suites attached to the given commit and branch.
-    pub async fn get_check_suites_for_commit(
+    /// Find all workflows attached to a specific check suite.
+    pub async fn get_workflow_runs_for_check_suite(
         &self,
-        branch: &str,
-        sha: &CommitSha,
-    ) -> anyhow::Result<Vec<CheckSuite>> {
+        check_suite_id: CheckSuiteId,
+    ) -> anyhow::Result<Vec<WorkflowRun>> {
         #[derive(serde::Deserialize, Debug)]
-        struct CheckSuitePayload {
-            id: CheckSuiteId,
+        struct WorkflowRunResponse {
+            id: octocrab::models::RunId,
+            status: String,
             conclusion: Option<String>,
-            head_branch: String,
         }
 
         #[derive(serde::Deserialize, Debug)]
-        struct CheckSuiteResponse {
-            check_suites: Vec<CheckSuitePayload>,
+        struct WorkflowRunsResponse {
+            workflow_runs: Vec<WorkflowRunResponse>,
         }
 
-        perform_network_request_with_retry("get_check_suites_for_commit", || async {
-            let response: CheckSuiteResponse = self
-                .get_request(&format!("commits/{}/check-suites", sha.0))
+        perform_network_request_with_retry("get_workflows_for_check_suite", || async {
+            // We use a manual query, because octocrab currently doesn't allow filtering by
+            // check_suite_id when listing workflow runs.
+            // Note: we don't handle paging here, as we don't expect to get more than 30 workflows
+            // per check suite.
+            let response: WorkflowRunsResponse = self
+                .get_request(&format!("actions/runs?check_suite_id={check_suite_id}"))
                 .await
-                .context("Cannot fetch CheckSuiteResponse")?;
+                .context("Cannot fetch workflow runs for a check suite")?;
 
-            let suites = response
-                .check_suites
-                .into_iter()
-                .filter(|suite| suite.head_branch == branch)
-                .map(|suite| CheckSuite {
-                    id: suite.id,
-                    status: match suite.conclusion {
-                        Some(status) => match status.as_str() {
-                            "success" => CheckSuiteStatus::Success,
-                            "failure" | "neutral" | "cancelled" | "skipped" | "timed_out"
-                            | "action_required" | "startup_failure" | "stale" => {
-                                CheckSuiteStatus::Failure
-                            }
-                            _ => {
-                                tracing::warn!(
-                                    "Received unknown check suite conclusion for {}@{sha}: {status}",
-                                    self.repo_name
-                                );
-                                CheckSuiteStatus::Pending
-                            }
-                        },
+            fn get_status(run: &WorkflowRunResponse) -> WorkflowStatus {
+                match run.status.as_str() {
+                    "completed" => match run.conclusion.as_deref() {
+                        Some("success") => WorkflowStatus::Success,
+                        Some(_) => WorkflowStatus::Failure,
                         None => {
-                            tracing::warn!(
-                                "Received empty check suite conclusion for {}@{sha}",
-                                self.repo_name,
-                            );
-                            CheckSuiteStatus::Pending
+                            tracing::warn!("Received completed status with empty conclusion for workflow run {}", run.id);
+                            WorkflowStatus::Failure
                         }
                     },
+                    "failure" | "startup_failure" => WorkflowStatus::Failure,
+                    _ => WorkflowStatus::Pending
+                }
+            }
+
+            let runs = response
+                .workflow_runs
+                .into_iter()
+                .map(|run| WorkflowRun {
+                    id: run.id,
+                    status: get_status(&run),
                 })
                 .collect();
-            Ok(suites)
+            Ok(runs)
         })
         .await?
     }
diff --git a/src/github/webhook.rs b/src/github/webhook.rs

Original file line number	Diff line number	Diff line change
`@@ -387,7 +387,7 @@ pub enum WorkflowType {`
`387`	`387`	`}`
`388`	`388`
`389`	`389`	`/// Status of a workflow.`
`390`		`-#[derive(Copy, Clone, Debug, PartialEq, sqlx::Type)]`
	`390`	`+#[derive(Copy, Clone, Debug, PartialEq, Eq, sqlx::Type)]`
`391`	`391`	`#[sqlx(type_name = "TEXT")]`
`392`	`392`	`#[sqlx(rename_all = "lowercase")]`
`393`	`393`	`pub enum WorkflowStatus {`
`@@ -400,6 +400,7 @@ pub enum WorkflowStatus {`
`400`	`400`	`}`
`401`	`401`
`402`	`402`	`/// Represents a workflow run, coming either from Github Actions or from some external CI.`
	`403`	`+#[derive(Debug)]`
`403`	`404`	`pub struct WorkflowModel {`
`404`	`405`	`pub id: PrimaryKey,`
`405`	`406`	`/// The build this workflow is associated with.`