Skip to content

Commit 45db386

Browse files
authored
[drci] Mark unstable pending jobs as unstable (#6283)
Jobs only get categorized after they fail, but if a job is marked as unstable, it will always end up in the unstable group. This PR makes it so that unstable pending jobs get put in the unstable group. I think this should help with trymerge waiting for unstable jobs to finish, since we sometimes mark jobs as unstable if the queue time is very long Pending unstable jobs still count towards the pending count This was tested on a random PR on pytorch by curling my local development Pending jobs will show up in Dr. CI, which might be confusing as people usually expect only failing jobs to show up, so it also adds a hour glass icon to the pending unstable jobs, which looks like <img width="844" alt="image" src="https://github.com/user-attachments/assets/52a6df6b-5aef-4550-ac04-5d6d9b5322aa" />
1 parent cfc5f99 commit 45db386

File tree

2 files changed

+55
-20
lines changed

2 files changed

+55
-20
lines changed

torchci/pages/api/drci/drci.ts

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,8 @@ function constructResultsJobsSections(
544544
const hudPrUrl = `${hudBaseUrl}/pr/${owner}/${repo}/${prNumber}`;
545545
const jobsSorted = jobs.sort((a, b) => a.name.localeCompare(b.name));
546546
for (const job of jobsSorted) {
547-
output += `* [${job.name}](${hudPrUrl}#${job.id}) ([gh](${job.html_url}))`;
547+
const isPendingIcon = isPending(job) ? ":hourglass_flowing_sand: " : "";
548+
output += `* ${isPendingIcon}[${job.name}](${hudPrUrl}#${job.id}) ([gh](${job.html_url}))`;
548549

549550
const relatedJob = relatedJobs.get(job.id);
550551
// Show the related trunk failure for broken trunk or the similar failure for flaky
@@ -622,8 +623,12 @@ export function constructResultsComment(
622623
prNumber: number
623624
): string {
624625
let output = `\n`;
625-
const unrelatedFailureCount =
626-
flakyJobs.length + brokenTrunkJobs.length + unstableJobs.length;
626+
// Filter out unstable pending jobs
627+
const unrelatedFailureCount = _(flakyJobs)
628+
.concat(brokenTrunkJobs)
629+
.concat(unstableJobs)
630+
.filter((job) => !isPending(job))
631+
.value().length;
627632
const newFailedJobs: RecentWorkflowsData[] = failedJobs.filter(
628633
(job) =>
629634
job.conclusion !== "cancelled" &&
@@ -634,11 +639,7 @@ export function constructResultsComment(
634639
job.conclusion === "cancelled" ||
635640
job.failure_captures.includes(CANCELLED_STEP_ERROR)
636641
);
637-
const failing =
638-
failedJobs.length +
639-
flakyJobs.length +
640-
brokenTrunkJobs.length +
641-
unstableJobs.length;
642+
const failing = failedJobs.length + unrelatedFailureCount;
642643
const headerPrefix = `## `;
643644
const pendingIcon = `:hourglass_flowing_sand:`;
644645
const successIcon = `:white_check_mark:`;
@@ -662,8 +663,7 @@ export function constructResultsComment(
662663
const hasSignificantFailures = newFailedJobs.length > 0;
663664
const hasCancelledFailures = cancelledJobs.length > 0;
664665
const hasPending = pending > 0;
665-
const hasUnrelatedFailures =
666-
flakyJobs.length + brokenTrunkJobs.length + unstableJobs.length;
666+
const hasUnrelatedFailures = unrelatedFailureCount > 0;
667667

668668
let icon = "";
669669
if (hasSignificantFailures || hasCancelledFailures) {
@@ -795,14 +795,11 @@ export function constructResultsComment(
795795
repo,
796796
prNumber,
797797
"UNSTABLE",
798-
`The following ${pluralize(
799-
"job",
800-
unstableJobs.length
801-
)} failed but ${pluralize(
802-
"was",
798+
`The following ${pluralize("job", unstableJobs.length)} ${pluralize(
799+
"is",
803800
unstableJobs.length,
804-
"were"
805-
)} likely due to flakiness present on trunk and has been marked as unstable`,
801+
"are"
802+
)} marked as unstable, possibly due to flakiness on trunk`,
806803
unstableJobs,
807804
"",
808805
true,
@@ -855,6 +852,10 @@ function getTrunkFailure(
855852
.find((baseJob) => isSameFailure(baseJob, job));
856853
}
857854

855+
function isPending(job: RecentWorkflowsData): boolean {
856+
return job.conclusion === "" && isTime0(job.completed_at);
857+
}
858+
858859
export async function getWorkflowJobsStatuses(
859860
prInfo: PRandJobs,
860861
flakyRules: FlakyRule[],
@@ -889,8 +890,15 @@ export async function getWorkflowJobsStatuses(
889890
const relatedInfo: Map<number, string> = new Map();
890891

891892
for (const job of prInfo.jobs) {
892-
if (job.conclusion === "" && isTime0(job.completed_at)) {
893+
if (isPending(job)) {
893894
pending++;
895+
if (isUnstableJob(job as any, unstableIssues)) {
896+
unstableJobs.push(job);
897+
relatedIssues.set(
898+
job.id,
899+
getOpenUnstableIssues(job.name, unstableIssues)
900+
);
901+
}
894902
} else if (job.conclusion === "failure" || job.conclusion === "cancelled") {
895903
const suppressedLabels = await getSuppressedLabels(job, labels);
896904
if (prInfo.repo === "pytorch" && suppressedLabels.length !== 0) {

torchci/test/drci.test.ts

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -636,14 +636,41 @@ describe("Update Dr. CI Bot Unit Tests", () => {
636636
"The following job has failed",
637637
"The following job failed but was likely due to flakiness present on trunk",
638638
"The following job failed but was present on the merge base",
639-
"The following job failed but was likely due to flakiness present on trunk and has been marked as unstable",
639+
"The following job is marked as unstable",
640640
failedA.name,
641641
failedB.name,
642642
failedC.name,
643643
unstableA.name,
644644
];
645645
expect(
646-
expectToContain.every((s) => failureInfoComment.includes(s!))
646+
expectToContain.every((s) => failureInfoComment.includes(s))
647+
).toBeTruthy();
648+
});
649+
650+
test("test pending unstable job", async () => {
651+
// Test that a pending unstable job gets included in the comment as
652+
// unstable, but doesn't count as failed in the overall count
653+
const failureInfoComment = constructResultsCommentHelper({
654+
pending: 1,
655+
failedJobs: [failedA],
656+
flakyJobs: [failedB],
657+
brokenTrunkJobs: [failedC],
658+
unstableJobs: [{ ...unstableA, conclusion: "", completed_at: TIME_0 }],
659+
merge_base: "random base sha",
660+
});
661+
const expectToContain = [
662+
"1 New Failure, 1 Pending, 2 Unrelated Failures",
663+
"The following job has failed",
664+
"The following job failed but was likely due to flakiness present on trunk",
665+
"The following job failed but was present on the merge base",
666+
"The following job is marked as unstable",
667+
failedA.name,
668+
failedB.name,
669+
failedC.name,
670+
unstableA.name,
671+
];
672+
expect(
673+
expectToContain.every((s) => failureInfoComment.includes(s))
647674
).toBeTruthy();
648675
});
649676

0 commit comments

Comments
 (0)