Skip to content

Commit 5267695

Browse files
committed
desktop: detect no-progress loops + bounded repair ladder
1 parent f299b57 commit 5267695

14 files changed

+2231
-209
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import * as fs from 'node:fs';
2+
import * as path from 'node:path';
3+
4+
import { MessageContentType } from '@bytebot/shared';
5+
import {
6+
DesktopLoopDetector,
7+
buildDesktopActionSignature,
8+
} from './agent.desktop-safety';
9+
10+
/**
 * One recorded agent step, as stored on a single line of the JSONL fixture
 * replayed by the loop-detector test in this file.
 */
type FixtureRow = {
  /** Tool name; copied verbatim into the synthetic tool-use block's `name`. */
  tool: string;
  /** Pointer position; forwarded as `input.coordinates` only when present. */
  coordinates?: { x: number; y: number };
  /**
   * Mouse button. Only read for `computer_click_mouse` rows, where the
   * replay falls back to `'left'` when it is omitted.
   */
  button?: string;
  /**
   * Click count. Only read for `computer_click_mouse` rows, where the
   * replay falls back to `1` when it is omitted.
   */
  clickCount?: number;
  /** Screenshot hash passed to the detector alongside the action signature. */
  screenshotHash: string;
};
17+
18+
/**
 * Replays a recorded sequence of desktop tool calls through
 * `DesktopLoopDetector` and checks that it interrupts within a small number
 * of steps with the `repeat_in_window_no_progress` rule.
 */
describe('desktop loop detector fixture replay', () => {
  it('replays the Google Flights modal stall fixture and trips quickly', () => {
    const fixturePath = path.join(
      __dirname,
      'fixtures',
      'desktop-loop-google-flights.jsonl',
    );

    // JSONL: one JSON document per line. Trimming also strips a trailing
    // '\r' from CRLF files; blank lines are dropped.
    const lines = fs
      .readFileSync(fixturePath, 'utf8')
      .split('\n')
      .map((l) => l.trim())
      .filter(Boolean);

    // Fail with a clear message if the fixture is empty, instead of the
    // opaque "expected undefined to be true" from the assertions below.
    expect(lines.length).toBeGreaterThan(0);

    const detector = new DesktopLoopDetector();

    let lastResult: ReturnType<DesktopLoopDetector['record']> | null = null;
    let steps = 0;

    for (const line of lines) {
      const row = JSON.parse(line) as FixtureRow;
      steps++;

      // Built loosely because `row.tool` is an arbitrary string read from
      // the fixture. NOTE(review): presumably the signature builder expects
      // a stricter tool-use block type; confirm before tightening this.
      const block: any = {
        type: MessageContentType.ToolUse,
        id: `fx-${steps}`,
        name: row.tool,
        input: {},
      };

      if (row.coordinates) {
        block.input.coordinates = row.coordinates;
      }
      if (row.tool === 'computer_click_mouse') {
        // `??` rather than `||`: only substitute the default when the
        // fixture omits the field, never for a recorded falsy value.
        block.input.button = row.button ?? 'left';
        block.input.clickCount = row.clickCount ?? 1;
      }

      const signature = buildDesktopActionSignature(block);
      lastResult = detector.record({
        // Wall-clock time: rows are recorded back-to-back in this loop.
        atMs: Date.now(),
        signature,
        screenshotHash: row.screenshotHash,
      });

      // Stop at the first interrupt so `steps` counts how long it took.
      if (lastResult.interrupt) break;
    }

    expect(lastResult?.interrupt).toBe(true);
    expect(lastResult?.rule).toBe('repeat_in_window_no_progress');
    expect(steps).toBeLessThanOrEqual(12);
  });
});
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import {
2+
decodePngDimensionsFromBase64,
3+
parseUiRepairCandidate,
4+
validateUiRepairCandidate,
5+
} from './agent.desktop-repair';
6+
7+
/**
 * Unit tests for the desktop repair helpers: parsing a model-proposed repair
 * candidate, reading PNG dimensions from base64, and validating a candidate
 * against the viewport and a minimum confidence.
 */
describe('agent.desktop-repair', () => {
  describe('parseUiRepairCandidate', () => {
    it('parses valid JSON with rationale_code', () => {
      const res = parseUiRepairCandidate(
        JSON.stringify({
          x: 10,
          y: 20,
          confidence: 0.9,
          rationale_code: 'CLOSE_X',
        }),
      );
      expect(res.ok).toBe(true);
      // Throwing guard instead of `if (res.ok) { ... }`: it narrows `res`
      // while keeping the expectations below unconditional.
      if (!res.ok) {
        throw new Error('expected parseUiRepairCandidate to succeed');
      }
      expect(res.candidate.rationaleCode).toBe('CLOSE_X');
      expect(res.candidate.x).toBe(10);
      expect(res.candidate.y).toBe(20);
    });

    it('rejects missing JSON', () => {
      const res = parseUiRepairCandidate('nope');
      expect(res.ok).toBe(false);
    });

    it('rejects invalid rationale_code', () => {
      const res = parseUiRepairCandidate(
        JSON.stringify({
          x: 10,
          y: 20,
          confidence: 0.9,
          rationale_code: 'DELETE_ALL',
        }),
      );
      expect(res.ok).toBe(false);
    });
  });

  describe('decodePngDimensionsFromBase64', () => {
    it('extracts width/height from a tiny PNG', () => {
      // 1x1 transparent PNG
      const base64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PF2K1QAAAABJRU5ErkJggg==';
      const dims = decodePngDimensionsFromBase64(base64);
      expect(dims).toEqual({ width: 1, height: 1 });
    });
  });

  describe('validateUiRepairCandidate', () => {
    it('accepts CLOSE_X in top-right region across multiple viewport sizes', () => {
      const viewports = [
        { width: 100, height: 100 },
        { width: 800, height: 600 },
        { width: 1366, height: 768 },
        { width: 1920, height: 1080 },
      ];

      for (const vp of viewports) {
        // Candidate at 90% of the width and 10% of the height.
        const res = validateUiRepairCandidate({
          candidate: {
            x: Math.floor(vp.width * 0.9),
            y: Math.floor(vp.height * 0.1),
            confidence: 0.9,
            rationaleCode: 'CLOSE_X',
          },
          dimensions: vp,
          minConfidence: 0.7,
        });
        // Include the viewport in the compared value so a failure report
        // shows which size was rejected.
        expect({ viewport: vp, ok: res.ok }).toEqual({
          viewport: vp,
          ok: true,
        });
      }
    });

    it('rejects CLOSE_X outside top-right region', () => {
      const res = validateUiRepairCandidate({
        candidate: { x: 10, y: 90, confidence: 0.9, rationaleCode: 'CLOSE_X' },
        dimensions: { width: 100, height: 100 },
        minConfidence: 0.7,
      });
      expect(res.ok).toBe(false);
    });

    it('accepts DISMISS_BUTTON only in conservative right-side region', () => {
      const viewports = [
        { width: 320, height: 240 },
        { width: 1024, height: 768 },
        { width: 1920, height: 1080 },
      ];

      for (const vp of viewports) {
        // Candidate at 85% of the width, vertically centred.
        const res = validateUiRepairCandidate({
          candidate: {
            x: Math.floor(vp.width * 0.85),
            y: Math.floor(vp.height * 0.5),
            confidence: 0.95,
            rationaleCode: 'DISMISS_BUTTON',
          },
          dimensions: vp,
          minConfidence: 0.7,
        });
        // Include the viewport in the compared value so a failure report
        // shows which size was rejected.
        expect({ viewport: vp, ok: res.ok }).toEqual({
          viewport: vp,
          ok: true,
        });
      }
    });

    it('rejects low confidence', () => {
      // Position matches the accepted CLOSE_X case for a 100x100 viewport;
      // only the confidence (0.2 < 0.7) is below the threshold.
      const res = validateUiRepairCandidate({
        candidate: { x: 90, y: 10, confidence: 0.2, rationaleCode: 'CLOSE_X' },
        dimensions: { width: 100, height: 100 },
        minConfidence: 0.7,
      });
      expect(res.ok).toBe(false);
    });
  });
});

0 commit comments

Comments
 (0)