Skip to content

Commit 306a61e

Browse files
committed
fix(gastown): address second round of PR review comments
Addresses five review comments:

- **Expired kilocode_token can't self-refresh** — Use jose's `jwtVerify` with a large `clockTolerance` instead of `verifyKiloToken`, so expired but validly-signed tokens are accepted. The alarm is the recovery path for expired tokens; rejecting on `exp` would leave the town permanently stuck.
- **Eviction snapshot `--no-verify` bypasses pre-push hooks** — Removed `--no-verify` from the `git push` in the eviction drain path. Repo-level pre-push hooks now run normally.
- **Force-save commit missing git author identity** — Pass `GIT_AUTHOR_NAME`/`EMAIL` and `GIT_COMMITTER_NAME`/`EMAIL` from the agent's `startupEnv` into the `Bun.spawn` env, falling back to `GASTOWN_GIT_AUTHOR_*` from `process.env`, then to `'Gastown'`.
- **Stale triage restart interrupts unrelated work** — Guard the `RESTART`/`RESTART_WITH_BACKOFF` path with an `agentStillOnBead` check before stopping the agent or resetting its status to idle. Matches the existing `CLOSE_BEAD`/`REASSIGN_BEAD` guard pattern.
- **Failed review messages disappear from needs-attention** — `extractFailureMessage` now falls back to `metadata.message` (from `review_completed`/`pr_creation_failed` events) when the structured `failure_reason.message` path yields nothing.
1 parent 3eaefd1 commit 306a61e

File tree

3 files changed: +76 −28 lines changed

cloudflare-gastown/container/src/process-manager.ts

Lines changed: 15 additions & 1 deletion
Reconstructed diff (line content verbatim from the commit; indentation approximate — confirm against the repo):

```diff
--- a/cloudflare-gastown/container/src/process-manager.ts
+++ b/cloudflare-gastown/container/src/process-manager.ts
@@ -1137,7 +1137,7 @@ export async function drainAll(): Promise<void> {
     const hasOrigin = (await remoteCheck.exited) === 0;

     const gitCmd = hasOrigin
-      ? "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --set-upstream origin HEAD --no-verify"
+      ? "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --set-upstream origin HEAD"
      : "git add -A && git commit --allow-empty -m 'WIP: container eviction save'";

     if (!hasOrigin) {
@@ -1146,10 +1146,24 @@ export async function drainAll(): Promise<void> {
       );
     }

+    // Use the agent's startup env for git author/committer identity.
+    // The control-server's process.env may not have GIT_AUTHOR_NAME set,
+    // but the agent's startupEnv (captured at spawn time) does.
+    const gitEnv: Record<string, string | undefined> = { ...process.env };
+    const authorName =
+      agent.startupEnv?.GIT_AUTHOR_NAME ?? process.env.GASTOWN_GIT_AUTHOR_NAME ?? 'Gastown';
+    const authorEmail =
+      agent.startupEnv?.GIT_AUTHOR_EMAIL ?? process.env.GASTOWN_GIT_AUTHOR_EMAIL ?? 'gastown@kilo.ai';
+    gitEnv.GIT_AUTHOR_NAME = authorName;
+    gitEnv.GIT_COMMITTER_NAME = authorName;
+    gitEnv.GIT_AUTHOR_EMAIL = authorEmail;
+    gitEnv.GIT_COMMITTER_EMAIL = authorEmail;
+
     const proc = Bun.spawn(['bash', '-c', gitCmd], {
       cwd: agent.workdir,
       stdout: 'pipe',
       stderr: 'pipe',
+      env: gitEnv,
     });
     const exitCode = await proc.exited;
     const stdout = await new Response(proc.stdout).text();
```

cloudflare-gastown/src/dos/Town.do.ts

Lines changed: 51 additions & 26 deletions
Reconstructed diff (line content verbatim from the commit; indentation approximate — confirm against the repo):

```diff
--- a/cloudflare-gastown/src/dos/Town.do.ts
+++ b/cloudflare-gastown/src/dos/Town.do.ts
@@ -57,7 +57,8 @@ import { query } from '../util/query.util';
 import { getAgentDOStub } from './Agent.do';
 import { getTownContainerStub } from './TownContainer.do';

-import { verifyKiloToken } from '@kilocode/worker-utils';
+import { verifyKiloToken, kiloTokenPayload } from '@kilocode/worker-utils';
+import { jwtVerify } from 'jose';
 import { generateKiloApiToken } from '../util/kilo-token.util';
 import { resolveSecret } from '../util/secret.util';
 import { writeEvent, type GastownEventData } from '../util/analytics.util';
@@ -1679,18 +1680,24 @@
     switch (action) {
       case 'RESTART':
       case 'RESTART_WITH_BACKOFF': {
-        // Stop the agent in the container, reset to idle so the
-        // scheduler picks it up again on the next alarm cycle.
-        if (targetAgent?.status === 'working' || targetAgent?.status === 'stalled') {
-          dispatch.stopAgentInContainer(this.env, this.townId, targetAgentId).catch(() => {});
-        }
         if (targetAgent) {
           // Use the bead captured in the triage snapshot (not the agent's
           // current hook, which may have changed since the triage request
           // was created). Fall back to current hook for backward compat.
           const restartBeadId =
             snapshotHookedBeadId ?? targetAgent.current_hook_bead_id;

+          // Only stop the agent if it's still working on the snapshot bead.
+          // If it has moved on, stopping it would abort unrelated work.
+          const agentStillOnBead =
+            restartBeadId && targetAgent.current_hook_bead_id === restartBeadId;
+          if (
+            agentStillOnBead &&
+            (targetAgent.status === 'working' || targetAgent.status === 'stalled')
+          ) {
+            dispatch.stopAgentInContainer(this.env, this.townId, targetAgentId).catch(() => {});
+          }
+
           // Check if the hooked bead has exhausted its dispatch cap.
           // If so, fail it immediately instead of letting the reconciler
           // re-dispatch indefinitely (#1653).
@@ -1706,25 +1713,28 @@
             break;
           }
         }
-          // RESTART clears last_activity_at so the scheduler picks it
-          // up immediately. RESTART_WITH_BACKOFF sets it to now() so
-          // the dispatch cooldown (DISPATCH_COOLDOWN_MS) delays the
-          // next attempt, preventing immediate restart of crash loops.
-          const activityAt = action === 'RESTART_WITH_BACKOFF' ? now() : null;
-          query(
-            this.sql,
-            /* sql */ `
-              UPDATE ${agent_metadata}
-              SET ${agent_metadata.columns.status} = 'idle',
-                  ${agent_metadata.columns.last_activity_at} = ?
-              WHERE ${agent_metadata.bead_id} = ?
-            `,
-            [activityAt, targetAgentId]
-          );
-          // Also stamp the bead's last_dispatch_attempt_at so the
-          // reconciler's exponential backoff gate fires correctly.
-          // Without this, the backoff variant allows immediate
-          // redispatch once last_dispatch_attempt_at ages out.
+          // Only reset agent state if it's still on the snapshot bead.
+          // If it moved on, let it continue its current work.
+          if (agentStillOnBead) {
+            // RESTART clears last_activity_at so the scheduler picks it
+            // up immediately. RESTART_WITH_BACKOFF sets it to now() so
+            // the dispatch cooldown (DISPATCH_COOLDOWN_MS) delays the
+            // next attempt, preventing immediate restart of crash loops.
+            const activityAt = action === 'RESTART_WITH_BACKOFF' ? now() : null;
+            query(
+              this.sql,
+              /* sql */ `
+                UPDATE ${agent_metadata}
+                SET ${agent_metadata.columns.status} = 'idle',
+                    ${agent_metadata.columns.last_activity_at} = ?
+                WHERE ${agent_metadata.bead_id} = ?
+              `,
+              [activityAt, targetAgentId]
+            );
+          }
+          // Stamp the bead's last_dispatch_attempt_at regardless — even
+          // if the agent moved on, the backoff gate should still fire
+          // on the snapshot bead to prevent immediate redispatch.
           if (action === 'RESTART_WITH_BACKOFF' && restartBeadId) {
             query(
               this.sql,
@@ -3540,9 +3550,24 @@

     // Verify the existing token's signature before trusting its claims.
     // This prevents a forged token from being re-signed with real credentials.
+    // Use a very large clockTolerance so that already-expired (but validly
+    // signed) tokens are still accepted — this alarm is the recovery path
+    // for expired tokens, so rejecting them on exp would leave the town
+    // permanently stuck if it missed the 7-day refresh window.
     let payload: { kiloUserId: string; apiTokenPepper?: string | null; exp?: number };
     try {
-      payload = await verifyKiloToken(token, secret);
+      const TEN_YEARS_SECONDS = 10 * 365 * 24 * 60 * 60;
+      const { payload: raw } = await jwtVerify(
+        token,
+        new TextEncoder().encode(secret),
+        { algorithms: ['HS256'], clockTolerance: TEN_YEARS_SECONDS }
+      );
+      const parsed = kiloTokenPayload.safeParse(raw);
+      if (!parsed.success) {
+        logger.warn('refreshKilocodeTokenIfExpiring: token payload failed schema validation');
+        return;
+      }
+      payload = parsed.data;
     } catch {
       // Signature invalid or token malformed — don't remint from untrusted claims.
       logger.warn('refreshKilocodeTokenIfExpiring: existing token failed signature verification');
```

cloudflare-gastown/src/dos/town/review-queue.ts

Lines changed: 10 additions & 1 deletion
Reconstructed diff (line content verbatim from the commit; indentation approximate — confirm against the repo):

```diff
--- a/cloudflare-gastown/src/dos/town/review-queue.ts
+++ b/cloudflare-gastown/src/dos/town/review-queue.ts
@@ -37,17 +37,26 @@ function now(): string {
   return new Date().toISOString();
 }

-/** Extract the human-readable failure message from a bead event's metadata. */
+/**
+ * Extract the human-readable failure message from a bead event's metadata.
+ *
+ * Two sources:
+ * - status_changed events store it at `metadata.failure_reason.message`
+ * - review_completed / pr_creation_failed events store it at `metadata.message`
+ */
 function extractFailureMessage(
   status: string,
   metadata: Record<string, unknown> | null | undefined
 ): string | null {
   if (status !== 'failed' || !metadata) return null;
+  // Structured failure_reason (from status_changed events via updateBeadStatus)
   const fr = metadata.failure_reason;
   if (typeof fr === 'object' && fr !== null && 'message' in fr) {
     const msg = (fr as Record<string, unknown>).message;
     if (typeof msg === 'string') return msg;
   }
+  // Top-level message (from review_completed / pr_creation_failed events)
+  if (typeof metadata.message === 'string') return metadata.message;
   return null;
 }

```
5362

0 commit comments

Comments
 (0)