Skip to content

Commit 29eccb5

Browse files
authored
Merge pull request #490 from nix-community/refactoring
disable cached failed builds by default
2 parents 6053962 + 7637216 commit 29eccb5

File tree

4 files changed

+69
-58
lines changed

4 files changed

+69
-58
lines changed

buildbot_nix/buildbot_nix/__init__.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ class NixEvalConfig:
8080
max_memory_size: int
8181

8282
eval_lock: MasterLock
83-
failed_builds_db: FailedBuildDB
83+
failed_builds_db: FailedBuildDB | None
8484
gcroots_user: str = "buildbot-worker"
8585

8686
show_trace: bool = False
@@ -1533,7 +1533,7 @@ def _configure_projects(
15331533
or multiprocessing.cpu_count(),
15341534
max_memory_size=self.config.eval_max_memory_size,
15351535
eval_lock=eval_lock,
1536-
failed_builds_db=DB, # type: ignore[arg-type] # DB is guaranteed to be initialized above
1536+
failed_builds_db=DB,
15371537
gcroots_user=self.config.gcroots_user,
15381538
show_trace=self.config.show_trace_on_failure,
15391539
),
@@ -1646,14 +1646,10 @@ def configure(self, config: dict[str, Any]) -> None:
16461646
eval_lock = util.MasterLock("nix-eval")
16471647

16481648
global DB # noqa: PLW0603
1649-
if DB is None:
1649+
if DB is None and self.config.cache_failed_builds:
16501650
DB = FailedBuildDB(Path("failed_builds.dbm"))
16511651
atexit.register(lambda: DB.close() if DB is not None else None)
16521652

1653-
if DB is None:
1654-
msg = "Database initialization failed"
1655-
raise RuntimeError(msg)
1656-
16571653
# Configure projects
16581654
succeeded_projects = self._configure_projects(
16591655
config, projects, worker_names, eval_lock

buildbot_nix/buildbot_nix/build_trigger.py

Lines changed: 47 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class JobsConfig:
5050
successful_jobs: list[NixEvalJobSuccess]
5151
failed_jobs: list[NixEvalJobError]
5252
combine_builds: bool
53-
failed_builds_db: FailedBuildDB
53+
failed_builds_db: FailedBuildDB | None
5454

5555

5656
class BuildTrigger(buildstep.ShellMixin, steps.BuildStep):
@@ -72,7 +72,7 @@ class BuildTrigger(buildstep.ShellMixin, steps.BuildStep):
7272
wait_for_finish_deferred: defer.Deferred[tuple[list[int], int]] | None
7373
brids: list[int]
7474
consumers: dict[int, Any]
75-
failed_builds_db: FailedBuildDB
75+
failed_builds_db: FailedBuildDB | None
7676

7777
@dataclass
7878
class ScheduledJob:
@@ -372,51 +372,54 @@ async def _schedule_failed_evaluations(
372372
scheduled.append(
373373
BuildTrigger.ScheduledJob(failed_job, brids, results_deferred)
374374
)
375-
self.brids.extend(brids)
375+
self.brids.extend(brids.values())
376376

377377
return overall_result
378378

379379
async def _process_build_for_scheduling(
380380
self,
381381
build: NixEvalJobSuccess,
382-
context: SchedulingContext,
382+
ctx: SchedulingContext,
383383
) -> None:
384384
"""Process a single build to determine if it should be scheduled."""
385-
failed_build = self.jobs_config.failed_builds_db.check_build(build.drvPath)
385+
failed_build = None
386+
if self.jobs_config.failed_builds_db is not None:
387+
failed_build = self.jobs_config.failed_builds_db.check_build(build.drvPath)
386388

387-
if context.job_closures.get(build.drvPath):
389+
if ctx.job_closures.get(build.drvPath):
388390
# Has dependencies, skip for now
389391
return
390392

391393
if failed_build is not None and self.build:
392394
if self.build.reason != "rebuild":
393395
# Skip due to cached failure
394-
context.scheduler_log.addStdout(
396+
ctx.scheduler_log.addStdout(
395397
f"\t- skipping {build.attr} due to cached failure, first failed at {failed_build.time}\n"
396398
f"\t see build at {failed_build.url}\n"
397399
)
398-
context.build_schedule_order.remove(build)
400+
ctx.build_schedule_order.remove(build)
399401

400402
brids, results_deferred = await self.schedule(
401-
context.ss_for_trigger,
403+
ctx.ss_for_trigger,
402404
*self.schedule_cached_failure(build, failed_build),
403405
)
404-
context.scheduled.append(
406+
ctx.scheduled.append(
405407
BuildTrigger.ScheduledJob(build, brids, results_deferred)
406408
)
407409
self.brids.extend(brids.values())
408410
else:
409411
# Rebuild requested, remove from cache and schedule
410-
self.jobs_config.failed_builds_db.remove_build(build.drvPath)
411-
context.scheduler_log.addStdout(
412+
if self.jobs_config.failed_builds_db is not None:
413+
self.jobs_config.failed_builds_db.remove_build(build.drvPath)
414+
ctx.scheduler_log.addStdout(
412415
f"\t- not skipping {build.attr} with cached failure due to rebuild, first failed at {failed_build.time}\n"
413416
)
414-
context.build_schedule_order.remove(build)
415-
context.schedule_now.append(build)
417+
ctx.build_schedule_order.remove(build)
418+
ctx.schedule_now.append(build)
416419
else:
417420
# No cached failure, schedule normally
418-
context.build_schedule_order.remove(build)
419-
context.schedule_now.append(build)
421+
ctx.build_schedule_order.remove(build)
422+
ctx.schedule_now.append(build)
420423

421424
async def _get_failed_build_url(self, brids: dict[str, Any]) -> str:
422425
"""Get the URL of the actual failed build."""
@@ -435,7 +438,7 @@ async def _get_failed_build_url(self, brids: dict[str, Any]) -> str:
435438
async def _handle_failed_job(
436439
self,
437440
job: NixEvalJob,
438-
context: SchedulingContext,
441+
ctx: SchedulingContext,
439442
brids: dict[str, Any],
440443
result: int,
441444
) -> None:
@@ -444,7 +447,7 @@ async def _handle_failed_job(
444447
return
445448

446449
# Update failed builds cache if needed
447-
if result == util.FAILURE:
450+
if result == util.FAILURE and self.jobs_config.failed_builds_db is not None:
448451
should_add_to_cache = (
449452
self.build and self.build.reason == "rebuild"
450453
) or not self.jobs_config.failed_builds_db.check_build(job.drvPath)
@@ -456,28 +459,28 @@ async def _handle_failed_job(
456459

457460
# Schedule dependent failures
458461
removed = self.get_failed_dependents(
459-
job, context.build_schedule_order, context.job_closures
462+
job, ctx.build_schedule_order, ctx.job_closures
460463
)
461464
for removed_job in removed:
462465
scheduler, props = self.schedule_dependency_failed(removed_job, job)
463466
dep_brids, results_deferred = await self.schedule(
464-
context.ss_for_trigger, scheduler, props
467+
ctx.ss_for_trigger, scheduler, props
465468
)
466-
context.build_schedule_order.remove(removed_job)
467-
context.scheduled.append(
469+
ctx.build_schedule_order.remove(removed_job)
470+
ctx.scheduled.append(
468471
BuildTrigger.ScheduledJob(removed_job, dep_brids, results_deferred)
469472
)
470473
self.brids.extend(dep_brids.values())
471474

472475
if removed:
473-
context.scheduler_log.addStdout(
476+
ctx.scheduler_log.addStdout(
474477
"\t- removed jobs: "
475478
+ ", ".join([job.drvPath for job in removed])
476479
+ "\n"
477480
)
478481

479482
# Update job closures
480-
for job_closure in context.job_closures.values():
483+
for job_closure in ctx.job_closures.values():
481484
if job.drvPath in job_closure:
482485
job_closure.remove(job.drvPath)
483486

@@ -523,7 +526,7 @@ async def run(self) -> int:
523526
)
524527

525528
# Create scheduling context that will be reused throughout the loop
526-
context = BuildTrigger.SchedulingContext(
529+
ctx = BuildTrigger.SchedulingContext(
527530
build_schedule_order=build_schedule_order,
528531
job_closures=job_closures,
529532
ss_for_trigger=ss_for_trigger,
@@ -533,34 +536,33 @@ async def run(self) -> int:
533536
)
534537

535538
# Main scheduling loop
536-
while build_schedule_order or scheduled:
537-
scheduler_log.addStdout("Scheduling...\n")
539+
while ctx.build_schedule_order or ctx.scheduled:
540+
ctx.scheduler_log.addStdout("Scheduling...\n")
538541

539542
# Determine which jobs to schedule now
540-
context.schedule_now = []
541-
for build in list(build_schedule_order):
542-
await self._process_build_for_scheduling(build, context)
543+
ctx.schedule_now = []
544+
for build in list(ctx.build_schedule_order):
545+
await self._process_build_for_scheduling(build, ctx)
543546

544-
if not context.schedule_now:
545-
scheduler_log.addStdout("\tNo builds to schedule found.\n")
547+
if not ctx.schedule_now:
548+
ctx.scheduler_log.addStdout("\tNo builds to schedule found.\n")
546549

547550
# Schedule ready jobs
548-
for job in context.schedule_now:
549-
scheduler_log.addStdout(f"\t- {job.attr}\n")
551+
for job in ctx.schedule_now:
552+
ctx.scheduler_log.addStdout(f"\t- {job.attr}\n")
550553
brids, results_deferred = await self.schedule(
551-
ss_for_trigger,
554+
ctx.ss_for_trigger,
552555
*self.schedule_success(build_props, job),
553556
)
554-
scheduled.append(
557+
ctx.scheduled.append(
555558
BuildTrigger.ScheduledJob(job, brids, results_deferred)
556559
)
557560
self.brids.extend(brids.values())
558561

559-
scheduler_log.addStdout("Waiting...\n")
562+
ctx.scheduler_log.addStdout("Waiting...\n")
560563

561-
# Wait for a job to complete
562564
self.wait_for_finish_deferred = defer.DeferredList(
563-
[job.results for job in scheduled],
565+
[job.results for job in ctx.scheduled],
564566
fireOnOneCallback=True,
565567
fireOnOneErrback=True,
566568
)
@@ -570,12 +572,12 @@ async def run(self) -> int:
570572
results, index = await self.wait_for_finish_deferred # type: ignore[assignment]
571573

572574
# Process completed job
573-
job, brids, _ = scheduled[index]
575+
job, brids, _ = ctx.scheduled[index]
574576
done.append(BuildTrigger.DoneJob(job, brids, results))
575-
del scheduled[index]
577+
del ctx.scheduled[index]
576578
result = results[0]
577579

578-
scheduler_log.addStdout(
580+
ctx.scheduler_log.addStdout(
579581
f"Found finished build {job.attr}, result {util.Results[result].upper()}\n"
580582
)
581583

@@ -588,10 +590,10 @@ async def run(self) -> int:
588590
)
589591

590592
# Handle failed jobs and their dependents
591-
await self._handle_failed_job(job, context, brids, result)
593+
await self._handle_failed_job(job, ctx, brids, result)
592594

593595
overall_result = worst_status(result, overall_result)
594-
scheduler_log.addStdout(
596+
ctx.scheduler_log.addStdout(
595597
f"\t- new result: {util.Results[overall_result].upper()} \n"
596598
)
597599

@@ -605,7 +607,7 @@ async def run(self) -> int:
605607
self.build,
606608
overall_result,
607609
)
608-
scheduler_log.addStdout("Done!\n")
610+
ctx.scheduler_log.addStdout("Done!\n")
609611
return overall_result
610612

611613
def getCurrentSummary(self) -> dict[str, str]: # noqa: N802

buildbot_nix/buildbot_nix/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ class BuildbotNixConfig(BaseModel):
300300
nix_workers_secret_file: Path | None = None
301301
effects_per_repo_secrets: dict[str, str] = {}
302302
show_trace_on_failure: bool = False
303+
cache_failed_builds: bool = False
303304

304305
def nix_worker_secrets(self) -> WorkerConfig:
305306
if self.nix_workers_secret_file is None:

nixosModules/master.nix

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,13 @@ in
569569
description = "Show stack traces on failed evaluations";
570570
};
571571

572+
cacheFailedBuilds = lib.mkEnableOption ''
573+
cache failed builds in local database to avoid retrying them.
574+
When enabled, failed builds will be remembered and skipped in subsequent evaluations
575+
unless explicitly rebuilt. When disabled (the default), all builds will be attempted
577+
regardless of previous failures.
577+
'';
578+
572579
outputsPath = lib.mkOption {
573580
type = lib.types.nullOr lib.types.path;
574581
description = "Path where we store the latest build store paths names for nix attributes as text files. This path will be exposed via nginx at \${domain}/nix-outputs";
@@ -742,15 +749,17 @@ in
742749
null
743750
else
744751
{
745-
user_allowlist = cfg.gitea.userAllowlist;
746-
repo_allowlist = cfg.gitea.repoAllowlist;
752+
filters = {
753+
user_allowlist = cfg.gitea.userAllowlist;
754+
repo_allowlist = cfg.gitea.repoAllowlist;
755+
topic = cfg.gitea.topic;
756+
};
747757
token_file = "gitea-token";
748758
webhook_secret_file = "gitea-webhook-secret";
749759
project_cache_file = "gitea-project-cache.json";
750760
oauth_secret_file = "gitea-oauth-secret";
751761
instance_url = cfg.gitea.instanceUrl;
752762
oauth_id = cfg.gitea.oauthId;
753-
topic = cfg.gitea.topic;
754763
ssh_private_key_file = cfg.gitea.sshPrivateKeyFile;
755764
ssh_known_hosts_file = cfg.gitea.sshKnownHostsFile;
756765
};
@@ -759,8 +768,11 @@ in
759768
null
760769
else
761770
{
762-
user_allowlist = cfg.github.userAllowlist;
763-
repo_allowlist = cfg.github.repoAllowlist;
771+
filters = {
772+
user_allowlist = cfg.github.userAllowlist;
773+
repo_allowlist = cfg.github.repoAllowlist;
774+
topic = cfg.github.topic;
775+
};
764776
auth_type =
765777
if (cfg.github.authType ? "legacy") then
766778
{ token_file = "github-token"; }
@@ -778,7 +790,6 @@ in
778790
webhook_secret_file = "github-webhook-secret";
779791
oauth_secret_file = "github-oauth-secret";
780792
oauth_id = cfg.github.oauthId;
781-
topic = cfg.github.topic;
782793
};
783794
pull_based =
784795
if cfg.pullBased.repositories == [ ] then
@@ -815,6 +826,7 @@ in
815826
branches = cfg.branches;
816827
nix_workers_secret_file = "buildbot-nix-workers";
817828
show_trace_on_failure = cfg.showTrace;
829+
cache_failed_builds = cfg.cacheFailedBuilds;
818830
}
819831
}").read_text()))
820832
)

0 commit comments

Comments (0)