Skip to content

Commit 8571954

Browse files
committed
fix: update job closures for all completed builds to prevent scheduler deadlock
The scheduler was only updating job_closures for failed builds, causing it to potentially wait indefinitely for dependencies that had already completed successfully. This resulted in builds hanging when successful dependencies weren't properly removed from the closure tracking. - Rename _handle_failed_job to _handle_completed_job for clarity - Move job_closures update to run after ALL completed jobs (success/failure) - Ensure closure update happens after get_failed_dependents to maintain integrity
1 parent 29eccb5 commit 8571954

File tree

1 file changed

+38
-35
lines changed

1 file changed

+38
-35
lines changed

buildbot_nix/buildbot_nix/build_trigger.py

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -435,51 +435,54 @@ async def _get_failed_build_url(self, brids: dict[str, Any]) -> str:
435435
break
436436
return url
437437

438-
async def _handle_failed_job(
438+
async def _handle_completed_job(
439439
self,
440440
job: NixEvalJob,
441441
ctx: SchedulingContext,
442442
brids: dict[str, Any],
443443
result: int,
444444
) -> None:
445-
"""Handle a failed job by removing dependents and updating cache."""
446-
if not isinstance(job, NixEvalJobSuccess) or result == SUCCESS:
445+
"""Handle a completed job by updating closures and managing failures."""
446+
if not isinstance(job, NixEvalJobSuccess):
447447
return
448448

449-
# Update failed builds cache if needed
450-
if result == util.FAILURE and self.jobs_config.failed_builds_db is not None:
451-
should_add_to_cache = (
452-
self.build and self.build.reason == "rebuild"
453-
) or not self.jobs_config.failed_builds_db.check_build(job.drvPath)
454-
if should_add_to_cache:
455-
url = await self._get_failed_build_url(brids)
456-
self.jobs_config.failed_builds_db.add_build(
457-
job.drvPath, datetime.now(tz=UTC), url
458-
)
449+
# Handle failure-specific logic
450+
if result != SUCCESS:
451+
# Update failed builds cache if needed
452+
if result == util.FAILURE and self.jobs_config.failed_builds_db is not None:
453+
should_add_to_cache = (
454+
self.build and self.build.reason == "rebuild"
455+
) or not self.jobs_config.failed_builds_db.check_build(job.drvPath)
456+
if should_add_to_cache:
457+
url = await self._get_failed_build_url(brids)
458+
self.jobs_config.failed_builds_db.add_build(
459+
job.drvPath, datetime.now(tz=UTC), url
460+
)
459461

460-
# Schedule dependent failures
461-
removed = self.get_failed_dependents(
462-
job, ctx.build_schedule_order, ctx.job_closures
463-
)
464-
for removed_job in removed:
465-
scheduler, props = self.schedule_dependency_failed(removed_job, job)
466-
dep_brids, results_deferred = await self.schedule(
467-
ctx.ss_for_trigger, scheduler, props
462+
# Schedule dependent failures (MUST happen before updating job_closures!)
463+
removed = self.get_failed_dependents(
464+
job, ctx.build_schedule_order, ctx.job_closures
468465
)
469-
ctx.build_schedule_order.remove(removed_job)
470-
ctx.scheduled.append(
471-
BuildTrigger.ScheduledJob(removed_job, dep_brids, results_deferred)
472-
)
473-
self.brids.extend(dep_brids.values())
466+
for removed_job in removed:
467+
scheduler, props = self.schedule_dependency_failed(removed_job, job)
468+
dep_brids, results_deferred = await self.schedule(
469+
ctx.ss_for_trigger, scheduler, props
470+
)
471+
ctx.build_schedule_order.remove(removed_job)
472+
ctx.scheduled.append(
473+
BuildTrigger.ScheduledJob(removed_job, dep_brids, results_deferred)
474+
)
475+
self.brids.extend(dep_brids.values())
474476

475-
if removed:
476-
ctx.scheduler_log.addStdout(
477-
"\t- removed jobs: "
478-
+ ", ".join([job.drvPath for job in removed])
479-
+ "\n"
480-
)
477+
if removed:
478+
ctx.scheduler_log.addStdout(
479+
"\t- removed jobs: "
480+
+ ", ".join([job.drvPath for job in removed])
481+
+ "\n"
482+
)
481483

482-
# Update job closures
484+
# Update job closures for ALL completed jobs (both success and failure)
485+
# This MUST happen after get_failed_dependents since it needs the closures intact
483486
for job_closure in ctx.job_closures.values():
484487
if job.drvPath in job_closure:
485488
job_closure.remove(job.drvPath)
@@ -589,8 +592,8 @@ async def run(self) -> int:
589592
result,
590593
)
591594

592-
# Handle failed jobs and their dependents
593-
await self._handle_failed_job(job, ctx, brids, result)
595+
# Handle completed job and update closures
596+
await self._handle_completed_job(job, ctx, brids, result)
594597

595598
overall_result = worst_status(result, overall_result)
596599
ctx.scheduler_log.addStdout(

0 commit comments

Comments
 (0)