Skip to content

Commit 931d10b

Browse files
Camylljeanschmidtatalman
authored
Fix scaleupchron metrics (#6572)
Fix metrics - don't log errors - use countEntry instead of addEntry - clearer naming schema - use correct prefixes --------- Co-authored-by: Jean Schmidt <[email protected]> Co-authored-by: Andrey Talman <[email protected]>
1 parent bdb5d4c commit 931d10b

File tree

4 files changed

+294
-82
lines changed

4 files changed

+294
-82
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,12 @@ export async function scaleUpChron(event: ScheduledEvent, context: Context, call
177177
);
178178

179179
try {
180+
metrics.scaleUpChronInitiated();
180181
await scaleUpChronR(metrics);
182+
metrics.scaleUpChronSuccess();
181183
return callback(null);
182184
} catch (e) {
185+
metrics.scaleUpChronFailure();
183186
console.error(e);
184187
return callback('Failed');
185188
} finally {

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts

Lines changed: 135 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1635,56 +1635,162 @@ export class ScaleUpChronMetrics extends ScaleUpMetrics {
16351635
super('scaleUpChron');
16361636
}
16371637

1638-
queuedRunnerStats(org: string, runnerType: string, numQueuedJobs: number) {
1639-
const dimensions = new Map([
1638+
queuedRunnerStats(
1639+
org: string,
1640+
runnerType: string,
1641+
repo: string,
1642+
numQueuedJobs: number,
1643+
minQueueTimeMinutes: number,
1644+
maxQueueTimeMinutes: number,
1645+
) {
1646+
this.countEntry('run.scaleupchron.queuedRunners', 1);
1647+
this.countEntry('run.scaleupchron.queuedRunners.numJobs', numQueuedJobs);
1648+
this.addEntry('run.scaleupchron.queuedRunners.minMinutes', minQueueTimeMinutes);
1649+
this.addEntry('run.scaleupchron.queuedRunners.maxMinutes', maxQueueTimeMinutes);
1650+
1651+
const dimensions = new Map([['Org', org]]);
1652+
this.countEntry('run.scaleupchron.perOrg.queuedRunners', 1, dimensions);
1653+
this.countEntry('run.scaleupchron.perOrg.queuedRunners.numJobs', numQueuedJobs, dimensions);
1654+
this.addEntry('run.scaleupchron.perOrg.queuedRunners.minMinutes', minQueueTimeMinutes, dimensions);
1655+
this.addEntry('run.scaleupchron.perOrg.queuedRunners.maxMinutes', maxQueueTimeMinutes, dimensions);
1656+
1657+
const dimensionsRunnerOnly = new Map([['RunnerType', runnerType]]);
1658+
this.countEntry('run.scaleupchron.perRunnerType.queuedRunners', 1, dimensionsRunnerOnly);
1659+
this.countEntry('run.scaleupchron.perRunnerType.queuedRunners.numJobs', numQueuedJobs, dimensionsRunnerOnly);
1660+
this.addEntry('run.scaleupchron.perRunnerType.queuedRunners.minMinutes', minQueueTimeMinutes, dimensionsRunnerOnly);
1661+
this.addEntry('run.scaleupchron.perRunnerType.queuedRunners.maxMinutes', maxQueueTimeMinutes, dimensionsRunnerOnly);
1662+
1663+
dimensions.set('RunnerType', runnerType);
1664+
this.countEntry('run.scaleupchron.perOrg.perRunnerType.queuedRunners', 1, dimensions);
1665+
this.countEntry('run.scaleupchron.perOrg.perRunnerType.queuedRunners.numJobs', numQueuedJobs, dimensions);
1666+
this.addEntry('run.scaleupchron.perOrg.perRunnerType.queuedRunners.minMinutes', minQueueTimeMinutes, dimensions);
1667+
this.addEntry('run.scaleupchron.perOrg.perRunnerType.queuedRunners.maxMinutes', maxQueueTimeMinutes, dimensions);
1668+
1669+
const dimensionsRepo = new Map([
1670+
['Repo', repo],
16401671
['Org', org],
1641-
['RunnerType', runnerType],
1642-
['numQueuedJobs', numQueuedJobs.toString()],
16431672
]);
1644-
this.addEntry('gh.scaleupchron.queuedRunners', 3, dimensions);
1645-
}
1673+
this.countEntry('run.scaleupchron.perOrg.peRepo.queuedRunners', 1, dimensionsRepo);
1674+
}
1675+
1676+
queuedRunnerWillScaleStats(
1677+
org: string,
1678+
runnerType: string,
1679+
repo: string,
1680+
numQueuedJobs: number,
1681+
minQueueTimeMinutes: number,
1682+
maxQueueTimeMinutes: number,
1683+
) {
1684+
this.countEntry('run.scaleupchron.queuedRunners.willScale', 1);
1685+
this.countEntry('run.scaleupchron.queuedRunners.willScale.numJobs', numQueuedJobs);
1686+
this.addEntry('run.scaleupchron.queuedRunners.willScale.minMinutes', minQueueTimeMinutes);
1687+
this.addEntry('run.scaleupchron.queuedRunners.willScale.maxMinutes', maxQueueTimeMinutes);
1688+
1689+
const dimensions = new Map([['Org', org]]);
1690+
this.countEntry('run.scaleupchron.perOrg.queuedRunners.willScale', 1, dimensions);
1691+
this.countEntry('run.scaleupchron.perOrg.queuedRunners.willScale.numJobs', numQueuedJobs, dimensions);
1692+
this.addEntry('run.scaleupchron.perOrg.queuedRunners.willScale.minMinutes', minQueueTimeMinutes, dimensions);
1693+
this.addEntry('run.scaleupchron.perOrg.queuedRunners.willScale.maxMinutes', maxQueueTimeMinutes, dimensions);
1694+
1695+
const dimensionsRunnerOnly = new Map([['RunnerType', runnerType]]);
1696+
this.countEntry('run.scaleupchron.perRunnerType.queuedRunners.willScale', 1, dimensionsRunnerOnly);
1697+
this.countEntry(
1698+
'run.scaleupchron.perRunnerType.queuedRunners.willScale.numJobs',
1699+
numQueuedJobs,
1700+
dimensionsRunnerOnly,
1701+
);
1702+
this.addEntry(
1703+
'run.scaleupchron.perRunnerType.queuedRunners.willScale.minMinutes',
1704+
minQueueTimeMinutes,
1705+
dimensionsRunnerOnly,
1706+
);
1707+
this.addEntry(
1708+
'run.scaleupchron.perRunnerType.queuedRunners.willScale.maxMinutes',
1709+
maxQueueTimeMinutes,
1710+
dimensionsRunnerOnly,
1711+
);
1712+
1713+
dimensions.set('RunnerType', runnerType);
1714+
this.countEntry('run.scaleupchron.perOrg.perRunnerType.queuedRunners.willScale', 1, dimensions);
1715+
this.countEntry('run.scaleupchron.perOrg.perRunnerType.queuedRunners.willScale.numJobs', numQueuedJobs, dimensions);
1716+
this.addEntry(
1717+
'run.scaleupchron.perOrg.perRunnerType.queuedRunners.willScale.minMinutes',
1718+
minQueueTimeMinutes,
1719+
dimensions,
1720+
);
1721+
this.addEntry(
1722+
'run.scaleupchron.perOrg.perRunnerType.queuedRunners.willScale.maxMinutes',
1723+
maxQueueTimeMinutes,
1724+
dimensions,
1725+
);
16461726

1647-
queuedRunnerFailure(error: string) {
1648-
const dimensions = new Map([['error', error]]);
1649-
this.countEntry('gh.scaleupchron.queuedRunners.failure', 1, dimensions);
1727+
const dimensionsRepo = new Map([
1728+
['Repo', repo],
1729+
['Org', org],
1730+
]);
1731+
this.countEntry('run.scaleupchron.perOrg.peRepo.queuedRunners.willScale', 1, dimensionsRepo);
16501732
}
16511733

16521734
/* istanbul ignore next */
16531735
getQueuedJobsEndpointSuccess(ms: number) {
1654-
this.countEntry(`gh.calls.total`, 1);
1655-
this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1);
1656-
this.countEntry(`gh.calls.getQueuedJobsEndpoint.success`, 1);
1657-
this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms);
1736+
this.countEntry(`hud.calls.total`, 1);
1737+
this.countEntry(`hud.calls.getQueuedJobsEndpoint.count`, 1);
1738+
this.countEntry(`hud.calls.getQueuedJobsEndpoint.success`, 1);
1739+
this.addEntry(`hud.calls.getQueuedJobsEndpoint.wallclock`, ms);
16581740
}
16591741

16601742
/* istanbul ignore next */
16611743
getQueuedJobsEndpointFailure(ms: number) {
1662-
this.countEntry(`gh.calls.total`, 1);
1663-
this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1);
1664-
this.countEntry(`gh.calls.getQueuedJobsEndpoint.failure`, 1);
1665-
this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms);
1744+
this.countEntry(`hud.calls.total`, 1);
1745+
this.countEntry(`hud.calls.getQueuedJobsEndpoint.count`, 1);
1746+
this.countEntry(`hud.calls.getQueuedJobsEndpoint.failure`, 1);
1747+
this.addEntry(`hud.calls.getQueuedJobsEndpoint.wallclock`, ms);
1748+
}
1749+
1750+
hudQueuedRunnerFailureInvalidData() {
1751+
this.countEntry('hud.scaleupchron.queuedRunnerFailure.invalidData', 1);
1752+
}
1753+
1754+
hudQueuedRunnerFailureNoData() {
1755+
this.countEntry('hud.scaleupchron.queuedRunnerFailure.noData', 1);
1756+
}
1757+
1758+
hudQueuedRunnerFailure() {
1759+
this.countEntry('hud.scaleupchron.queuedRunnerFailure', 1);
1760+
}
1761+
1762+
scaleUpChronInitiated() {
1763+
this.countEntry('run.scaleupchron.initiated', 1);
16661764
}
16671765

1668-
scaleUpInstanceSuccess() {
1766+
scaleUpChronSuccess() {
16691767
this.scaleUpSuccess();
1670-
this.countEntry('run.scaleupchron.success');
1768+
this.countEntry('run.scaleupchron.success', 1);
16711769
}
16721770

1673-
scaleUpInstanceFailureNonRetryable(error: string) {
1674-
const dimensions = new Map([['error', error]]);
1675-
// should we add more information about this or do we not care since it'll be requeued?
1676-
this.countEntry('run.scaleupchron.failure.nonRetryable', 1, dimensions);
1771+
scaleUpChronFailure() {
1772+
this.countEntry('run.scaleupchron.failure', 1);
16771773
}
16781774

1679-
scaleUpInstanceFailureRetryable(error: string) {
1680-
const dimensions = new Map([['error', error]]);
1775+
scaleUpChronInstanceSuccess() {
1776+
this.scaleUpSuccess();
1777+
this.countEntry('run.scaleupchron.instance.success', 1);
1778+
}
1779+
1780+
scaleUpChronInstanceFailure() {
1781+
this.countEntry('run.scaleupchron.instance.failure', 1);
1782+
}
1783+
1784+
scaleUpChronInstanceFailureNonRetryable() {
1785+
this.countEntry('run.scaleupchron.failure.nonRetryable', 1);
1786+
}
16811787

1682-
// should we add more information about this or do we not care since it'll be requeued?
1683-
this.countEntry('run.scaleupchron.failure.retryable', 1, dimensions);
1788+
scaleUpChronInstanceFailureRetryable() {
1789+
this.countEntry('run.scaleupchron.failure.retryable', 1);
16841790
}
16851791

1686-
scaleUpInstanceNoOp() {
1687-
this.countEntry('run.scaleupchron.noop');
1792+
scaleUpChronInstanceNoOp() {
1793+
this.countEntry('run.scaleupchron.noop', 1);
16881794
}
16891795
}
16901796

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts

Lines changed: 78 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@ import { getRunnerTypes } from './gh-runners';
55

66
// import * as ScaleUpChronModule from './scale-up-chron';
77
import { scaleUpChron, getQueuedJobs } from './scale-up-chron';
8-
import { scaleUp } from './scale-up';
8+
import { RetryableScalingError, scaleUp } from './scale-up';
99

1010
import * as MetricsModule from './metrics';
1111
import { RunnerType } from './runners';
1212
import nock from 'nock';
13+
import axios from 'axios';
1314

1415
jest.mock('./runners');
1516
jest.mock('./gh-runners');
@@ -113,7 +114,7 @@ describe('scaleUpChron', () => {
113114
});
114115

115116
it('queued jobs do not match available runners', async () => {
116-
const scaleUpInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpInstanceNoOp');
117+
const scaleUpChronInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpChronInstanceNoOp');
117118

118119
jest.clearAllMocks();
119120
jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg);
@@ -123,11 +124,11 @@ describe('scaleUpChron', () => {
123124
mocked(expBackOff).mockResolvedValue({ data: hudQueryInvalidRunnerLabelResponse });
124125

125126
await scaleUpChron(metrics);
126-
expect(scaleUpInstanceNoOpSpy).toBeCalledTimes(1);
127+
expect(scaleUpChronInstanceNoOpSpy).toBeCalledTimes(1);
127128
});
128129

129130
it('queued jobs do not match scale config org', async () => {
130-
const scaleUpInstanceNoOp = jest.spyOn(metrics, 'scaleUpInstanceNoOp');
131+
const scaleUpChronInstanceNoOp = jest.spyOn(metrics, 'scaleUpChronInstanceNoOp');
131132

132133
jest.clearAllMocks();
133134
jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg);
@@ -137,12 +138,12 @@ describe('scaleUpChron', () => {
137138
mocked(getRunnerTypes).mockResolvedValue(new Map([[runnerTypeInvalid, { is_ephemeral: false } as RunnerType]]));
138139

139140
await scaleUpChron(metrics);
140-
expect(scaleUpInstanceNoOp).toBeCalledTimes(1);
141+
expect(scaleUpChronInstanceNoOp).toBeCalledTimes(1);
141142
});
142143

143144
it('queued jobs match available runners and scale config org and scaled up completes', async () => {
144145
const mockedScaleUp = mocked(scaleUp).mockResolvedValue(undefined);
145-
const scaleUpInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpInstanceNoOp');
146+
const scaleUpChronInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpChronInstanceNoOp');
146147

147148
jest.clearAllMocks();
148149
jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg);
@@ -155,13 +156,13 @@ describe('scaleUpChron', () => {
155156
mocked(expBackOff).mockResolvedValue({ data: hudQueryValidResponse });
156157

157158
await scaleUpChron(metrics);
158-
expect(scaleUpInstanceNoOpSpy).toBeCalledTimes(0);
159+
expect(scaleUpChronInstanceNoOpSpy).toBeCalledTimes(0);
159160
expect(mockedScaleUp).toBeCalledTimes(1);
160161
});
161162

162163
it('scaled up throws error', async () => {
163164
const mockedScaleUp = mocked(scaleUp).mockRejectedValue(Error('error'));
164-
const scaleUpInstanceFailureRetryableSpy = jest.spyOn(metrics, 'scaleUpInstanceFailureRetryable');
165+
const scaleUpChronInstanceFailureNonRetryableSpy = jest.spyOn(metrics, 'scaleUpChronInstanceFailureNonRetryable');
165166

166167
jest.clearAllMocks();
167168
jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg);
@@ -173,8 +174,31 @@ describe('scaleUpChron', () => {
173174
);
174175
mocked(expBackOff).mockResolvedValue({ data: hudQueryValidResponse });
175176

177+
await expect(scaleUpChron(metrics)).rejects.toThrow('Non-retryable errors occurred while scaling up: 1');
178+
expect(scaleUpChronInstanceFailureNonRetryableSpy).toBeCalledTimes(1);
179+
expect(mockedScaleUp).toBeCalledTimes(1);
180+
});
181+
182+
it('handles RetryableScalingError', async () => {
183+
// Test for the RetryableScalingError case (lines 82-83)
184+
const retryableError = new RetryableScalingError('Retryable error');
185+
const mockedScaleUp = mocked(scaleUp).mockRejectedValue(retryableError);
186+
const scaleUpChronInstanceFailureRetryableSpy = jest.spyOn(metrics, 'scaleUpChronInstanceFailureRetryable');
187+
188+
jest.clearAllMocks();
189+
jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg);
190+
191+
mocked(shuffleArrayInPlace).mockReturnValue([hudQueryValidResponse]);
192+
mocked(getRepo).mockReturnValue({ owner: 'test_org1', repo: 'test_repo1' });
193+
mocked(getRunnerTypes).mockResolvedValue(
194+
new Map([[runnerTypeValid, { runnerTypeName: 'test_runner_type1' } as RunnerType]]),
195+
);
196+
mocked(expBackOff).mockResolvedValue({ data: hudQueryValidResponse });
197+
198+
// This should not throw since RetryableScalingError is handled differently
176199
await scaleUpChron(metrics);
177-
expect(scaleUpInstanceFailureRetryableSpy).toBeCalledTimes(1);
200+
201+
expect(scaleUpChronInstanceFailureRetryableSpy).toBeCalledTimes(1);
178202
expect(mockedScaleUp).toBeCalledTimes(1);
179203
});
180204
});
@@ -213,11 +237,54 @@ describe('getQueuedJobs', () => {
213237

214238
it('get queue data from url request with empty response', async () => {
215239
const errorResponse = '';
216-
const queuedRunnerFailureSpy = jest.spyOn(metrics, 'queuedRunnerFailure');
240+
const hudNoDataspy = jest.spyOn(metrics, 'hudQueuedRunnerFailureNoData');
241+
const hudFailureSpy = jest.spyOn(metrics, 'hudQueuedRunnerFailure');
217242

218243
mocked(expBackOff).mockResolvedValue({ data: errorResponse });
219244

220245
expect(await getQueuedJobs(metrics, 'url')).toEqual([]);
221-
expect(queuedRunnerFailureSpy).toBeCalledTimes(1);
246+
expect(hudNoDataspy).toBeCalledTimes(1);
247+
expect(hudFailureSpy).toBeCalledTimes(1);
248+
});
249+
250+
it('handles successful response with metrics trackRequest', async () => {
251+
// Test for lines 121-123
252+
const trackRequestSpy = jest.spyOn(metrics, 'trackRequest');
253+
jest.spyOn(metrics, 'getQueuedJobsEndpointSuccess');
254+
255+
// Create a spy on axios.get
256+
const axiosGetSpy = jest.spyOn(axios, 'get');
257+
258+
// Setup the trackRequest to call its callback function
259+
trackRequestSpy.mockImplementation((success, failure, callback) => {
260+
return callback();
261+
});
262+
263+
// Setup the expBackOff to call its callback function which uses trackRequest
264+
mocked(expBackOff).mockImplementation((callback) => {
265+
return callback();
266+
});
267+
268+
axiosGetSpy.mockResolvedValue({ data: hudQueryValidResponse });
269+
270+
await getQueuedJobs(metrics, 'url');
271+
272+
expect(trackRequestSpy).toHaveBeenCalled();
273+
expect(axiosGetSpy).toHaveBeenCalledWith('url');
274+
});
275+
276+
it('handles invalid data (not an array) from response', async () => {
277+
// Test for line 142
278+
const hudQueuedRunnerFailureInvalidDataSpy = jest.spyOn(metrics, 'hudQueuedRunnerFailureInvalidData');
279+
280+
mocked(expBackOff).mockResolvedValue({ data: { notAnArray: true } });
281+
282+
try {
283+
await getQueuedJobs(metrics, 'url');
284+
} catch (error) {
285+
// The function will throw an error, but we want to verify the metric was called
286+
}
287+
288+
expect(hudQueuedRunnerFailureInvalidDataSpy).toHaveBeenCalled();
222289
});
223290
});

0 commit comments

Comments
 (0)