Skip to content

Commit 0b2b894

Browse files
authored
ci: auto retry on runner failure (#18206)
1 parent 77d5c3b commit 0b2b894

File tree

3 files changed

+278
-0
lines changed

3 files changed

+278
-0
lines changed
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
module.exports = async ({ github, context, core }) => {
2+
const runID = process.env.WORKFLOW_RUN_ID;
3+
const runURL = process.env.WORKFLOW_RUN_URL;
4+
5+
const { data: workflowRun } = await github.rest.actions.getWorkflowRun({
6+
owner: context.repo.owner,
7+
repo: context.repo.repo,
8+
run_id: runID
9+
});
10+
11+
const { data: pulls } = await github.rest.pulls.list({
12+
owner: context.repo.owner,
13+
repo: context.repo.repo,
14+
head: `${context.repo.owner}:${workflowRun.head_branch}`,
15+
state: 'open'
16+
});
17+
18+
if (pulls.length === 0) {
19+
core.info('No open PR found for this workflow run');
20+
return;
21+
}
22+
23+
const pr = pulls[0];
24+
25+
const { data: jobs } = await github.rest.actions.listJobsForWorkflowRun({
26+
owner: context.repo.owner,
27+
repo: context.repo.repo,
28+
run_id: runID
29+
});
30+
31+
const failedJobs = jobs.jobs.filter(job => job.conclusion === 'failure');
32+
33+
function isRetryableError(errorMessage) {
34+
if (!errorMessage) return false;
35+
36+
// Only check for the specific self-hosted runner communication error
37+
return errorMessage.includes('The self-hosted runner lost communication with the server.');
38+
}
39+
40+
// Count retryable jobs
41+
let retryableJobsCount = 0;
42+
let analyzedJobs = [];
43+
44+
for (const job of failedJobs) {
45+
try {
46+
// Get job details to access check_run_url
47+
const { data: jobDetails } = await github.rest.actions.getJobForWorkflowRun({
48+
owner: context.repo.owner,
49+
repo: context.repo.repo,
50+
job_id: job.id
51+
});
52+
53+
// Get job annotations
54+
const { data: annotations } = await github.rest.checks.listAnnotations({
55+
owner: context.repo.owner,
56+
repo: context.repo.repo,
57+
check_run_id: jobDetails.check_run_url.split('/').pop()
58+
});
59+
60+
const allAnnotationMessages = annotations.map(annotation => annotation.message).join('\n');
61+
const isRetryable = isRetryableError(allAnnotationMessages);
62+
63+
analyzedJobs.push({
64+
name: job.name,
65+
retryable: isRetryable,
66+
annotationCount: annotations.length
67+
});
68+
69+
if (isRetryable) {
70+
retryableJobsCount++;
71+
}
72+
} catch (error) {
73+
core.info(`Could not analyze job ${job.name}: ${error.message}`);
74+
analyzedJobs.push({
75+
name: job.name,
76+
retryable: false,
77+
annotationCount: 0,
78+
error: error.message
79+
});
80+
}
81+
}
82+
83+
const comment = `🤖 **Smart Auto-retry Analysis (Annotations-based)**
84+
85+
The workflow run [${runID}](${runURL}) failed and has been analyzed for retryable errors using job annotations.
86+
87+
**Analysis Results:**
88+
- Total failed jobs: ${failedJobs.length}
89+
- Jobs with retryable errors: ${retryableJobsCount}
90+
- Jobs with code/test issues: ${failedJobs.length - retryableJobsCount}
91+
92+
${retryableJobsCount > 0 ?
93+
`✅ **${retryableJobsCount} job(s) have been automatically retried** due to infrastructure issues detected in annotations (runner communication, network timeouts, resource exhaustion, etc.)
94+
95+
You can monitor the retry progress in the [Actions tab](${runURL}).` :
96+
`❌ **No jobs were retried** because all failures appear to be code or test related issues that require manual fixes.`
97+
}
98+
99+
**Job Analysis (based on annotations):**
100+
${analyzedJobs.map(job => {
101+
if (job.error) {
102+
return `- ${job.name}: ❓ Analysis failed (${job.error})`;
103+
}
104+
return `- ${job.name}: ${job.retryable ? '🔄 Retryable (infrastructure)' : '❌ Not retryable (code/test)'} (${job.annotationCount} annotations)`;
105+
}).join('\n')}
106+
107+
---
108+
*This is an automated analysis and retry triggered by the smart retry workflow using job annotations.*`;
109+
110+
await github.rest.issues.createComment({
111+
owner: context.repo.owner,
112+
repo: context.repo.repo,
113+
issue_number: pr.number,
114+
body: comment
115+
});
116+
117+
core.info(`Added smart retry analysis comment to PR #${pr.number}`);
118+
};
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
module.exports = async ({ github, context, core }) => {
2+
const runID = process.env.WORKFLOW_RUN_ID;
3+
4+
core.info(`Checking failed workflow run: ${runID}`);
5+
6+
const { data: workflowRun } = await github.rest.actions.getWorkflowRun({
7+
owner: context.repo.owner,
8+
repo: context.repo.repo,
9+
run_id: runID
10+
});
11+
12+
core.info(`Workflow run status: ${workflowRun.status}`);
13+
core.info(`Workflow run conclusion: ${workflowRun.conclusion}`);
14+
15+
const { data: jobs } = await github.rest.actions.listJobsForWorkflowRun({
16+
owner: context.repo.owner,
17+
repo: context.repo.repo,
18+
run_id: runID
19+
});
20+
21+
core.info(`Found ${jobs.jobs.length} jobs in the workflow run`);
22+
23+
const failedJobs = jobs.jobs.filter(job => job.conclusion === 'failure');
24+
25+
if (failedJobs.length === 0) {
26+
core.info('No failed jobs found to retry');
27+
return;
28+
}
29+
30+
core.info(`Found ${failedJobs.length} failed jobs to analyze:`);
31+
32+
function isRetryableError(errorMessage) {
33+
if (!errorMessage) return false;
34+
return errorMessage.includes('The self-hosted runner lost communication with the server.');
35+
}
36+
37+
const jobsToRetry = [];
38+
39+
for (const job of failedJobs) {
40+
core.info(`Analyzing job: ${job.name} (ID: ${job.id})`);
41+
42+
try {
43+
const { data: jobDetails } = await github.rest.actions.getJobForWorkflowRun({
44+
owner: context.repo.owner,
45+
repo: context.repo.repo,
46+
job_id: job.id
47+
});
48+
49+
core.info(`Job status: ${jobDetails.status}, conclusion: ${jobDetails.conclusion}`);
50+
51+
const { data: annotations } = await github.rest.checks.listAnnotations({
52+
owner: context.repo.owner,
53+
repo: context.repo.repo,
54+
check_run_id: jobDetails.check_run_url.split('/').pop()
55+
});
56+
57+
core.info(`Found ${annotations.length} annotations for job: ${job.name}`);
58+
59+
const errorAnnotations = annotations.filter(annotation =>
60+
annotation.annotation_level === 'failure' ||
61+
annotation.annotation_level === 'warning' ||
62+
annotation.message.toLowerCase().includes('error') ||
63+
annotation.message.toLowerCase().includes('failed') ||
64+
annotation.message.toLowerCase().includes('timeout') ||
65+
annotation.message.toLowerCase().includes('connection') ||
66+
annotation.message.toLowerCase().includes('runner')
67+
);
68+
69+
core.info(`Found ${errorAnnotations.length} error-related annotations:`);
70+
errorAnnotations.forEach(annotation => {
71+
core.info(` [${annotation.annotation_level}] ${annotation.message}`);
72+
});
73+
74+
const allAnnotationMessages = annotations.map(annotation => annotation.message).join('');
75+
76+
const isRetryable = isRetryableError(allAnnotationMessages);
77+
78+
if (isRetryable) {
79+
core.info(`✅ Job "${job.name}" is retryable - infrastructure issue detected in annotations`);
80+
jobsToRetry.push(job);
81+
} else {
82+
core.info(`❌ Job "${job.name}" is NOT retryable - likely a code/test issue based on annotations`);
83+
}
84+
85+
} catch (error) {
86+
core.error(`Failed to analyze job ${job.name}:`, error.message);
87+
}
88+
}
89+
90+
if (jobsToRetry.length === 0) {
91+
core.info('No jobs found with retryable errors. Skipping retry.');
92+
return;
93+
}
94+
95+
core.info(`Retrying ${jobsToRetry.length} jobs with retryable errors:`);
96+
jobsToRetry.forEach(job => {
97+
core.info(`- ${job.name} (ID: ${job.id})`);
98+
});
99+
100+
for (const job of jobsToRetry) {
101+
try {
102+
core.info(`Retrying job: ${job.name} (ID: ${job.id})`);
103+
104+
await github.rest.actions.reRunJob({
105+
owner: context.repo.owner,
106+
repo: context.repo.repo,
107+
job_id: job.id
108+
});
109+
110+
core.info(`✅ Successfully initiated retry for job: ${job.name}`);
111+
} catch (error) {
112+
core.error(`❌ Failed to retry job ${job.name}:`, error.message);
113+
}
114+
}
115+
116+
core.info('Retry process completed');
117+
};

.github/workflows/retry.yml

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
# Runs after the "dev.yml" workflow completes with a failure: first re-runs the
# jobs that failed for infrastructure reasons, then comments the analysis on
# the associated pull request.
name: Retry Failed Workflow

on:
  workflow_run:
    # NOTE(review): confirm this entry matches how the monitored workflow is
    # identified (its `name:` versus its file name); a mismatch means this
    # workflow never triggers.
    workflows:
      - "dev.yml"
    types:
      - completed

permissions:
  pull-requests: write
  issues: write
  actions: write
  contents: read

jobs:
  retry:
    runs-on: ubuntu-latest
    # NOTE(review): there is no upper bound on retries here; a job that keeps
    # failing with a retryable error is re-run on every completed attempt.
    # Consider also gating on `github.event.workflow_run.run_attempt`.
    if: ${{ github.event.workflow_run.conclusion == 'failure' }}
    steps:
      # Needed so the scripts under .github/scripts can be require()'d below.
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Retry failed jobs
        uses: actions/github-script@v7
        env:
          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
        with:
          github-token: ${{ github.token }}
          script: |
            const script = require('./.github/scripts/retry_failed_jobs.js');
            await script({ github, context, core });

      - name: Comment on PR
        uses: actions/github-script@v7
        env:
          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
          # html_url is the browser page of the run; `url` is the REST API
          # endpoint and is not a useful link inside a PR comment.
          WORKFLOW_RUN_URL: ${{ github.event.workflow_run.html_url }}
        with:
          github-token: ${{ github.token }}
          script: |
            const script = require('./.github/scripts/comment_retry_on_pr.js');
            await script({ github, context, core });

0 commit comments

Comments
 (0)