Skip to content

Commit 4418605

Browse files
authored
fix(kiloclaw): evict region from KV on machine-creation capacity errors (#1694)
fix(kiloclaw): evict region from KV on machine-creation 403 quota errors
1 parent 2c91db8 commit 4418605

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

kiloclaw/src/durable-objects/kiloclaw-instance.test.ts

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3188,6 +3188,136 @@ describe('start: 412 insufficient resources recovery', () => {
31883188
});
31893189
});
31903190

3191+
// ============================================================================
3192+
// start: region eviction on machine-creation capacity errors
3193+
// ============================================================================
3194+
3195+
describe('start: evicts region from KV on machine-creation capacity error', () => {
  // 403 response body used by these tests to simulate an org-level quota
  // rejection. Shared so every 403 scenario sends the same payload.
  const QUOTA_EXCEEDED_BODY =
    '{"error":"organization \\"Kilo\\" is using 3194880 MB of memory in lhr which is over the allowed quota. please consider other regions"}';

  /** Builds the 403 quota-exceeded FlyApiError for the given Fly API operation name. */
  const quotaExceededError = (operation: string) =>
    new FlyApiError(`Fly API ${operation} failed (403)`, 403, QUOTA_EXCEEDED_BODY);

  beforeEach(() => {
    (flyClient.listMachines as Mock).mockResolvedValue([]);
  });

  it('evicts flyRegion from KV when createMachine returns 403 quota exceeded', async () => {
    const env = createFakeEnv();
    const { instance, storage } = createInstance(undefined, env);
    await seedProvisioned(storage, { flyMachineId: null, lastStartedAt: null, flyRegion: 'lhr' });
    const evictSpy = vi.spyOn(regions, 'evictCapacityRegionFromKV').mockResolvedValue(undefined);

    // Restore in `finally` so a failed assertion cannot leak the spy into later tests.
    try {
      (flyClient.createMachine as Mock)
        .mockRejectedValueOnce(quotaExceededError('createMachine'))
        .mockResolvedValueOnce({ id: 'machine-retry', region: 'cdg' });
      (flyClient.waitForState as Mock).mockResolvedValue(undefined);
      (flyClient.getVolume as Mock).mockResolvedValue({ id: 'vol-1', region: 'lhr' });
      (flyClient.deleteVolume as Mock).mockResolvedValue(undefined);
      (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({
        id: 'vol-new',
        region: 'cdg',
      });

      await instance.start('user-1');

      expect(evictSpy).toHaveBeenCalledWith(env.KV_CLAW_CACHE, env, 'lhr');
      expect(storage._store.get('flyRegion')).toBe('cdg');
      expect(storage._store.get('status')).toBe('running');
    } finally {
      evictSpy.mockRestore();
    }
  });

  it('does NOT evict flyRegion from KV on 409 insufficient memory (transient)', async () => {
    const env = createFakeEnv();
    const { instance, storage } = createInstance(undefined, env);
    await seedProvisioned(storage, { flyMachineId: null, lastStartedAt: null, flyRegion: 'dfw' });
    const evictSpy = vi.spyOn(regions, 'evictCapacityRegionFromKV').mockResolvedValue(undefined);

    try {
      (flyClient.createMachine as Mock)
        .mockRejectedValueOnce(
          new FlyApiError(
            'insufficient memory',
            409,
            '{"error":"aborted: insufficient resources available to fulfill request: could not reserve resource for machine: insufficient memory available to fulfill request"}'
          )
        )
        .mockResolvedValueOnce({ id: 'machine-retry', region: 'sjc' });
      (flyClient.waitForState as Mock).mockResolvedValue(undefined);
      (flyClient.getVolume as Mock).mockResolvedValue({ id: 'vol-1', region: 'dfw' });
      (flyClient.deleteVolume as Mock).mockResolvedValue(undefined);
      (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({
        id: 'vol-new',
        region: 'sjc',
      });

      await instance.start('user-1');

      expect(evictSpy).not.toHaveBeenCalled();
    } finally {
      evictSpy.mockRestore();
    }
  });

  it('evicts flyRegion from KV when updateMachine returns 403 during startExistingMachine', async () => {
    const env = createFakeEnv();
    const { instance, storage } = createInstance(undefined, env);
    await seedRunning(storage, {
      status: 'stopped',
      lastStartedAt: Date.now() - 60_000,
      flyRegion: 'lhr',
    });
    const evictSpy = vi.spyOn(regions, 'evictCapacityRegionFromKV').mockResolvedValue(undefined);

    try {
      (flyClient.getMachine as Mock).mockResolvedValue({ state: 'stopped' });
      (flyClient.updateMachine as Mock).mockRejectedValue(quotaExceededError('updateMachine'));
      (flyClient.getVolume as Mock).mockResolvedValue({ id: 'vol-1', region: 'lhr' });
      (flyClient.destroyMachine as Mock).mockResolvedValue(undefined);
      (flyClient.deleteVolume as Mock).mockResolvedValue(undefined);
      (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({
        id: 'vol-new',
        region: 'cdg',
      });
      (flyClient.createMachine as Mock).mockResolvedValue({ id: 'machine-new', region: 'cdg' });
      (flyClient.waitForState as Mock).mockResolvedValue(undefined);

      await instance.start('user-1');

      expect(evictSpy).toHaveBeenCalledWith(env.KV_CLAW_CACHE, env, 'lhr');
      expect(storage._store.get('flyRegion')).toBe('cdg');
    } finally {
      evictSpy.mockRestore();
    }
  });

  it('does not evict when flyRegion is null', async () => {
    const env = createFakeEnv();
    const { instance, storage } = createInstance(undefined, env);
    await seedProvisioned(storage, { flyMachineId: null, lastStartedAt: null, flyRegion: null });
    const evictSpy = vi.spyOn(regions, 'evictCapacityRegionFromKV').mockResolvedValue(undefined);

    try {
      // Must be a 403: eviction is only attempted for 403s, so any other status
      // would satisfy the assertion below without ever reaching the
      // null-region guard this test is meant to cover.
      (flyClient.createMachine as Mock)
        .mockRejectedValueOnce(quotaExceededError('createMachine'))
        .mockResolvedValueOnce({ id: 'machine-retry', region: 'cdg' });
      (flyClient.waitForState as Mock).mockResolvedValue(undefined);
      (flyClient.getVolume as Mock).mockResolvedValue({ id: 'vol-1' });
      (flyClient.deleteVolume as Mock).mockResolvedValue(undefined);
      (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({
        id: 'vol-new',
        region: 'cdg',
      });

      await instance.start('user-1');

      expect(evictSpy).not.toHaveBeenCalled();
    } finally {
      evictSpy.mockRestore();
    }
  });
});
3320+
31913321
// ============================================================================
31923322
// stop() error handling
31933323
// ============================================================================

kiloclaw/src/durable-objects/kiloclaw-instance/index.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,6 +1046,15 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
10461046
statusCode: code,
10471047
region: this.s.flyRegion ?? 'unknown',
10481048
});
1049+
1050+
// Evict the current region from KV so future provisions avoid it.
1051+
// Only on 403 (org quota exceeded) — 409 (host memory) is transient.
1052+
// createVolumeWithFallback already evicts on volume-creation failures,
1053+
// but machine-creation 403s bypass that path.
1054+
if (code === 403 && this.s.flyRegion) {
1055+
await evictCapacityRegionFromKV(this.env.KV_CLAW_CACHE, this.env, this.s.flyRegion);
1056+
}
1057+
10491058
await flyMachines.replaceStrandedVolume(
10501059
flyConfig,
10511060
this.ctx,

0 commit comments

Comments
 (0)