runners: More scale-down perf improvements (#6854)

seemethere · web-flow · commit 4556a1333b30 · 2025-06-27T20:16:00.000+02:00
Does the following: * Removes SSM parameter cleanup from terminateInstances (a follow-up PR will add a termination policy to parameters) * Removes the double check for ghRunner calls (it was causing performance bottlenecks) * NOTE: We will need to monitor removeGHRunnerOrg calls to see whether they introduce another performance bottleneck + job cancellations (if they rise, then we revert; dashboard: https://hud.pytorch.org/job_cancellation_dashboard) Signed-off-by: Eli Uriegas <eliuriegas@meta.com> --------- Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts
@@ -355,20 +355,12 @@ describe('terminateRunners', () => {
       },
     ];
 
-    mockSSMdescribeParametersRet.mockResolvedValueOnce({
-      Parameters: runners
-        .map((runner) => getParameterNameForRunner(runner.environment as string, runner.instanceId))
-        .map((s) => ({ Name: s })),
-    });
-
     await terminateRunners(runners, metrics);
 
     expect(mockEC2.terminateInstances).toBeCalledTimes(1);
     expect(mockEC2.terminateInstances).toBeCalledWith({
       InstanceIds: ['i-1234', 'i-5678'],
     });
-    expect(mockSSM.describeParameters).toBeCalledTimes(1);
-    expect(mockSSM.deleteParameter).toBeCalledTimes(2);
   });
 
   it('terminates runners across multiple regions', async () => {
@@ -385,10 +377,6 @@ describe('terminateRunners', () => {
       },
     ];
 
-    mockSSMdescribeParametersRet.mockResolvedValue({
-      Parameters: [{ Name: 'gi-ci-i-1234' }, { Name: 'gi-ci-i-5678' }],
-    });
-
     await terminateRunners(runners, metrics);
 
     expect(mockEC2.terminateInstances).toBeCalledTimes(2);
@@ -398,8 +386,6 @@ describe('terminateRunners', () => {
     expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(2, {
       InstanceIds: ['i-5678'],
     });
-    expect(mockSSM.describeParameters).toBeCalledTimes(2);
-    expect(mockSSM.deleteParameter).toBeCalledTimes(2);
   });
 
   it('handles partial failure - terminates some runners but fails on others', async () => {
@@ -421,16 +407,6 @@ describe('terminateRunners', () => {
       },
     ];
 
-    // First region succeeds
-    mockSSMdescribeParametersRet.mockResolvedValueOnce({
-      Parameters: [{ Name: 'gi-ci-i-1234' }, { Name: 'gi-ci-i-5678' }],
-    });
-
-    // Second region also gets SSM parameters but has no successful terminations to clean up
-    mockSSMdescribeParametersRet.mockResolvedValueOnce({
-      Parameters: [],
-    });
-
     // First region succeeds, second region fails
     mockEC2.terminateInstances
       .mockReturnValueOnce({
@@ -445,8 +421,6 @@ describe('terminateRunners', () => {
     );
 
     expect(mockEC2.terminateInstances).toBeCalledTimes(2);
-    expect(mockSSM.describeParameters).toBeCalledTimes(2); // Called for both regions
-    expect(mockSSM.deleteParameter).toBeCalledTimes(2); // Only for successful region
   });
 
   it('handles large batches by splitting into chunks', async () => {
@@ -457,12 +431,6 @@ describe('terminateRunners', () => {
       environment: 'gi-ci',
     }));
 
-    mockSSMdescribeParametersRet.mockResolvedValueOnce({
-      Parameters: runners.map((runner) => ({
-        Name: getParameterNameForRunner(runner.environment as string, runner.instanceId),
-      })),
-    });
-
     await terminateRunners(runners, metrics);
 
     // Should make 2 terminate calls (batches of 100 and 50)
@@ -473,10 +441,6 @@ describe('terminateRunners', () => {
     expect(mockEC2.terminateInstances).toHaveBeenNthCalledWith(2, {
       InstanceIds: runners.slice(100, 150).map((r) => r.instanceId),
     });
-
-    // SSM cleanup should handle all 150 parameters
-    expect(mockSSM.describeParameters).toBeCalledTimes(1);
-    expect(mockSSM.deleteParameter).toBeCalledTimes(150);
   });
 
   it('cleans up SSM parameters for successful batches even when later batch fails', async () => {
@@ -487,12 +451,6 @@ describe('terminateRunners', () => {
       environment: 'gi-ci',
     }));
 
-    mockSSMdescribeParametersRet.mockResolvedValueOnce({
-      Parameters: runners.slice(0, 100).map((runner) => ({
-        Name: getParameterNameForRunner(runner.environment as string, runner.instanceId),
-      })),
-    });
-
     // First batch succeeds, second batch fails
     mockEC2.terminateInstances
       .mockReturnValueOnce({
@@ -505,9 +463,6 @@ describe('terminateRunners', () => {
     await expect(terminateRunners(runners, metrics)).rejects.toThrow('Failed to terminate some runners');
 
     expect(mockEC2.terminateInstances).toBeCalledTimes(2);
-    // SSM cleanup should still happen for the first 100 runners that were successfully terminated
-    expect(mockSSM.describeParameters).toBeCalledTimes(1);
-    expect(mockSSM.deleteParameter).toBeCalledTimes(100);
   });
 
   it('handles SSM parameter cleanup failure gracefully', async () => {
@@ -519,18 +474,9 @@ describe('terminateRunners', () => {
       },
     ];
 
-    // SSM describe fails, so it should attempt direct deletion
-    mockSSMdescribeParametersRet.mockRejectedValueOnce(new Error('SSM describe failed'));
-
     await terminateRunners(runners, metrics);
 
     expect(mockEC2.terminateInstances).toBeCalledTimes(1);
-    expect(mockSSM.describeParameters).toBeCalledTimes(1);
-    // Should still attempt direct deletion even when describe fails
-    expect(mockSSM.deleteParameter).toBeCalledTimes(1);
-    expect(mockSSM.deleteParameter).toBeCalledWith({
-      Name: getParameterNameForRunner(runners[0].environment as string, runners[0].instanceId),
-    });
   });
 });
 
@@ -1779,9 +1725,6 @@ describe('terminateRunner', () => {
     };
 
     // Mock terminateRunners by mocking the underlying calls
-    mockSSMdescribeParametersRet.mockResolvedValueOnce({
-      Parameters: [{ Name: 'gi-ci-i-1234' }],
-    });
     mockEC2.terminateInstances.mockReturnValueOnce({
       promise: jest.fn().mockResolvedValueOnce({}),
     });
@@ -1792,8 +1735,5 @@ describe('terminateRunner', () => {
     expect(mockEC2.terminateInstances).toBeCalledWith({
       InstanceIds: ['i-1234'],
     });
-    expect(mockSSM.deleteParameter).toBeCalledWith({
-      Name: 'gi-ci-i-1234',
-    });
   });
 });
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts
@@ -371,53 +371,42 @@ async function terminateRunnersInRegion(runners: RunnerInfo[], metrics: Metrics,
   // We'll attempt to terminate all batches, but if any batch throws we still want to clean up the SSM
   // parameters for the instances that were already terminated.  To achieve this we wrap the whole
   // operation in a try / finally block so that the cleanup always executes.
-  try {
-    for (const [batchIndex, instanceBatch] of instanceBatches.entries()) {
-      console.info(
-        `[${region}] Processing batch ${batchIndex + 1}/${instanceBatches.length} with ${
-          instanceBatch.length
-        } instances: ${instanceBatch.map((r) => r.instanceId).join(', ')}`,
-      );
-
-      try {
-        await expBackOff(() => {
-          return metrics.trackRequestRegion(
-            region,
-            metrics.ec2TerminateInstancesAWSCallSuccess,
-            metrics.ec2TerminateInstancesAWSCallFailure,
-            () => {
-              return ec2.terminateInstances({ InstanceIds: instanceBatch.map((r) => r.instanceId) }).promise();
-            },
-          );
-        });
+  for (const [batchIndex, instanceBatch] of instanceBatches.entries()) {
+    console.info(
+      `[${region}] Processing batch ${batchIndex + 1}/${instanceBatches.length} with ${
+        instanceBatch.length
+      } instances: ${instanceBatch.map((r) => r.instanceId).join(', ')}`,
+    );
 
-        console.info(
-          `[${region}] Successfully terminated batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
-            .map((r) => r.instanceId)
-            .join(', ')}`,
+    try {
+      await expBackOff(() => {
+        return metrics.trackRequestRegion(
+          region,
+          metrics.ec2TerminateInstancesAWSCallSuccess,
+          metrics.ec2TerminateInstancesAWSCallFailure,
+          () => {
+            return ec2.terminateInstances({ InstanceIds: instanceBatch.map((r) => r.instanceId) }).promise();
+          },
         );
+      });
 
-        // Record successfully terminated runners so that we can clean up their SSM parameters later.
-        successfullyTerminated.push(...instanceBatch);
-      } catch (e) {
-        console.error(
-          `[${region}] Failed to terminate batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
-            .map((r) => r.instanceId)
-            .join(', ')} - ${e}`,
-        );
-        // Re-throw so that callers are aware of the failure; the finally block will still execute and
-        // attempt SSM cleanup for the instances that were already terminated.
-        throw e;
-      }
-    }
-  } finally {
-    try {
-      await cleanupSSMParametersForRunners(successfullyTerminated, metrics, region, ssm);
-    } catch (cleanupErr) {
-      // We do not want cleanup issues to mask the original termination error, just log them.
+      console.info(
+        `[${region}] Successfully terminated batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
+          .map((r) => r.instanceId)
+          .join(', ')}`,
+      );
+
+      // Record the runners in this batch as successfully terminated (SSM cleanup no longer happens here).
+      successfullyTerminated.push(...instanceBatch);
+    } catch (e) {
       console.error(
-        `[${region}] Error during SSM parameter cleanup for ${successfullyTerminated.length} runners: ${cleanupErr}`,
+        `[${region}] Failed to terminate batch ${batchIndex + 1}/${instanceBatches.length}: ${instanceBatch
+          .map((r) => r.instanceId)
+          .join(', ')} - ${e}`,
       );
+      // Re-throw so that callers are aware of the failure; batches after this one are not attempted.
+      // SSM parameter cleanup for already-terminated instances is no longer performed in this function.
+      throw e;
     }
   }
 }
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts
@@ -1448,30 +1448,24 @@ describe('scale-down', () => {
 
     it('dont finds on listGithubRunnersRep, finds with getRunnerRepo, busy === false', async () => {
       const mockedListGithubRunnersRepo = mocked(listGithubRunnersRepo);
-      const mockedGetRunnerRepo = mocked(getRunnerRepo);
       const ec2runner: RunnerInfo = {
         awsRegion: baseConfig.awsRegion,
         repo: repoKey,
         instanceId: 'instance-id-03',
         runnerType: 'runnerType-01',
         ghRunnerId: 'ghRunnerId-01',
       };
-      const theGhRunner = { name: 'instance-id-03', busy: false } as GhRunner;
 
       mockedListGithubRunnersRepo.mockResolvedValueOnce(ghRunners);
-      mockedGetRunnerRepo.mockResolvedValueOnce(theGhRunner);
 
-      expect(await getGHRunnerRepo(ec2runner, metrics)).toEqual(theGhRunner);
+      expect(await getGHRunnerRepo(ec2runner, metrics)).toBeUndefined();
 
       expect(mockedListGithubRunnersRepo).toBeCalledTimes(1);
       expect(mockedListGithubRunnersRepo).toBeCalledWith(repo, metrics);
-      expect(mockedGetRunnerRepo).toBeCalledTimes(1);
-      expect(mockedGetRunnerRepo).toBeCalledWith(repo, ec2runner.ghRunnerId, metrics);
     });
 
     it('listGithubRunnersRep and getRunnerRepo throws exception', async () => {
       const mockedListGithubRunnersRepo = mocked(listGithubRunnersRepo);
-      const mockedGetRunnerRepo = mocked(getRunnerRepo);
       const ec2runner: RunnerInfo = {
         awsRegion: baseConfig.awsRegion,
         repo: repoKey,
@@ -1481,14 +1475,11 @@ describe('scale-down', () => {
       };
 
       mockedListGithubRunnersRepo.mockRejectedValueOnce('Error');
-      mockedGetRunnerRepo.mockRejectedValueOnce('Error');
 
       expect(await getGHRunnerRepo(ec2runner, metrics)).toBeUndefined();
 
       expect(mockedListGithubRunnersRepo).toBeCalledTimes(1);
       expect(mockedListGithubRunnersRepo).toBeCalledWith(repo, metrics);
-      expect(mockedGetRunnerRepo).toBeCalledTimes(1);
-      expect(mockedGetRunnerRepo).toBeCalledWith(repo, ec2runner.ghRunnerId, metrics);
     });
   });
 
@@ -1635,30 +1626,24 @@ describe('scale-down', () => {
 
     it('dont finds on listGithubRunnersOrg, finds with getRunnerOrg, busy === false', async () => {
       const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg);
-      const mockedGetRunnerOrg = mocked(getRunnerOrg);
       const ec2runner: RunnerInfo = {
         awsRegion: baseConfig.awsRegion,
         org: org,
         instanceId: 'instance-id-03',
         runnerType: 'runnerType-01',
         ghRunnerId: 'ghRunnerId-01',
       };
-      const theGhRunner = { name: 'instance-id-03', busy: false } as GhRunner;
 
       mockedListGithubRunnersOrg.mockResolvedValueOnce(ghRunners);
-      mockedGetRunnerOrg.mockResolvedValueOnce(theGhRunner);
 
-      expect(await getGHRunnerOrg(ec2runner, metrics)).toEqual(theGhRunner);
+      expect(await getGHRunnerOrg(ec2runner, metrics)).toBeUndefined();
 
       expect(mockedListGithubRunnersOrg).toBeCalledTimes(1);
       expect(mockedListGithubRunnersOrg).toBeCalledWith(org, metrics);
-      expect(mockedGetRunnerOrg).toBeCalledTimes(1);
-      expect(mockedGetRunnerOrg).toBeCalledWith(org, ec2runner.ghRunnerId, metrics);
     });
 
     it('listGithubRunnersRep and getRunnerRepo throws exception', async () => {
       const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg);
-      const mockedGetRunnerOrg = mocked(getRunnerOrg);
       const ec2runner: RunnerInfo = {
         awsRegion: baseConfig.awsRegion,
         org: org,
@@ -1668,14 +1653,11 @@ describe('scale-down', () => {
       };
 
       mockedListGithubRunnersOrg.mockRejectedValueOnce('Error');
-      mockedGetRunnerOrg.mockRejectedValueOnce('Error');
 
       expect(await getGHRunnerOrg(ec2runner, metrics)).toBeUndefined();
 
       expect(mockedListGithubRunnersOrg).toBeCalledTimes(1);
       expect(mockedListGithubRunnersOrg).toBeCalledWith(org, metrics);
-      expect(mockedGetRunnerOrg).toBeCalledTimes(1);
-      expect(mockedGetRunnerOrg).toBeCalledWith(org, ec2runner.ghRunnerId, metrics);
     });
 
     it('getRunner throws when api rate limit is hit', async () => {
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts
@@ -298,24 +298,6 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe
     }
   }
 
-  if (ghRunner === undefined && ec2runner.ghRunnerId !== undefined) {
-    console.warn(
-      `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) not found in ` +
-        `listGithubRunnersOrg call, attempting to grab directly`,
-    );
-    try {
-      ghRunner = await getRunnerOrg(ec2runner.org as string, ec2runner.ghRunnerId, metrics);
-    } catch (e) {
-      console.warn(
-        `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) error when ` +
-          `listGithubRunnersOrg call: ${e}`,
-      );
-      /* istanbul ignore next */
-      if (isGHRateLimitError(e)) {
-        throw e;
-      }
-    }
-  }
   if (ghRunner) {
     if (ghRunner.busy) {
       metrics.runnerGhFoundBusyOrg(org, ec2runner);
@@ -343,31 +325,6 @@ export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownM
     }
   }
 
-  if (ghRunner === undefined) {
-    if (ec2runner.ghRunnerId === undefined) {
-      console.warn(
-        `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) was neither found in ` +
-          `the list of runners returned by the listGithubRunnersRepo api call, nor did it have the ` +
-          `GithubRunnerId EC2 tag set.  This can happen if there's no runner running on the instance.`,
-      );
-    } else {
-      console.warn(
-        `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) not found in ` +
-          `listGithubRunnersRepo call, attempting to grab directly`,
-      );
-      try {
-        ghRunner = await getRunnerRepo(repo, ec2runner.ghRunnerId, metrics);
-      } catch (e) {
-        console.warn(
-          `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) error when getRunnerRepo call: ${e}`,
-        );
-        /* istanbul ignore next */
-        if (isGHRateLimitError(e)) {
-          throw e;
-        }
-      }
-    }
-  }
   if (ghRunner !== undefined) {
     if (ghRunner.busy) {
       metrics.runnerGhFoundBusyRepo(repo, ec2runner);
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts
@@ -78,6 +78,8 @@ export async function expBackOff<T>(
         if (expBackOffMs > maxMs) {
           throw e;
         }
+        const functionName = callback.name || 'anonymous';
+        console.warn(`[expBackOff]"${functionName}" needing to back off for function for ${expBackOffMs}ms: ${e}`);
         await new Promise((resolve) => setTimeout(resolve, expBackOffMs));
         expBackOffMs = expBackOffMs * backOffFactor;
       } else {

Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,8 @@ export async function expBackOff<T>(`
`78`	`78`	`if (expBackOffMs > maxMs) {`
`79`	`79`	`throw e;`
`80`	`80`	`}`
	`81`	`+ const functionName = callback.name \|\| 'anonymous';`
	`82`	+ console.warn(`[expBackOff]"${functionName}" needing to back off for function for ${expBackOffMs}ms: ${e}`);
`81`	`83`	`await new Promise((resolve) => setTimeout(resolve, expBackOffMs));`
`82`	`84`	`expBackOffMs = expBackOffMs * backOffFactor;`
`83`	`85`	`} else {`