Skip to content
This repository was archived by the owner on Nov 15, 2025. It is now read-only.

Commit 7717274

Browse files
first commit
0 parents  commit 7717274

File tree

7 files changed

+492
-0
lines changed

7 files changed

+492
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
node_modules
2+
.DS_Store

.nvmrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
22

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2025 Craig Morten
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Virtual Screen Reader LLM
2+
3+
A CLI tool that integrates [`@guidepup/virtual-screen-reader`](https://www.npmjs.com/package/@guidepup/virtual-screen-reader) with an LLM (Large Language Model) to navigate and interact with web pages programmatically.
4+
5+
## Installation
6+
7+
### Prerequisites
8+
9+
- **Ollama**: Install the Ollama server for LLM integration. Visit [Ollama](https://ollama.com) for installation instructions.
10+
11+
### Steps
12+
13+
1. Install dependencies:
14+
15+
```bash
16+
yarn install
17+
```
18+
19+
2. Start the Ollama local server:
20+
21+
```bash
22+
yarn serve
23+
```
24+
25+
3. Pull the required LLM model (`llama3.1`) so the local environment is ready:
26+
27+
```bash
28+
yarn pull
29+
```
30+
31+
## Usage
32+
33+
### Running the CLI
34+
35+
```bash
36+
yarn cli <url> <goal> [maxAttempts]
37+
```
38+
39+
#### Example
40+
41+
```bash
42+
yarn cli https://www.example.com "navigate to the About page" 10
43+
```
44+
45+
### Parameters
46+
47+
- `<url>`: The URL of the web page to navigate.
48+
- `<goal>`: The goal or task for the virtual screen reader to achieve.
49+
- `[maxAttempts]` (optional): The maximum number of attempts to achieve the goal (default: 10).

package.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"name": "@guidepup/virtual-screen-reader-llm",
3+
"version": "1.0.0",
4+
"main": "index.js",
5+
"license": "MIT",
6+
"bin": {
7+
"virtual-screen-reader-llm": "src/index.js"
8+
},
9+
"dependencies": {
10+
"@guidepup/virtual-screen-reader": "^0.30.1",
11+
"ollama": "^0.5.14",
12+
"playwright": "^1.51.1"
13+
},
14+
"scripts": {
15+
"serve": "ollama serve",
16+
"pull": "ollama pull llama3.1",
17+
"cli": "node src/index.js",
18+
"postinstall": "npx playwright install"
19+
}
20+
}

src/index.js

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
const { chromium } = require("playwright");
2+
const { Ollama } = require("ollama");
3+
4+
/**
 * Load the virtual screen reader browser bundle into the page and expose it
 * as `window.virtual`.
 *
 * Two module script tags are added in order: the first references the bundle
 * by URL, the second is an inline module that imports `virtual` from the same
 * URL and assigns it to `window.virtual`.
 *
 * @param {{ page: object }} options - `page` must provide `addScriptTag`.
 * @returns {Promise<void>}
 */
async function inject({ page }) {
  const bundleUrl =
    "https://unpkg.com/@guidepup/virtual-screen-reader/lib/esm/index.browser.js";
  const exposeSnippet = `import { virtual } from "${bundleUrl}"; window.virtual = virtual;`;

  const scriptTags = [
    { url: bundleUrl, type: "module" },
    { content: exposeSnippet, type: "module" },
  ];

  // Sequential on purpose: the inline import follows the URL-based tag.
  for (const tag of scriptTags) {
    await page.addScriptTag(tag);
  }
}
15+
16+
/**
 * Start the virtual screen reader inside the page, using `document.body` as
 * its container.
 *
 * Relies on `window.virtual` already being present in the page.
 *
 * @param {{ page: object }} options - `page` must provide `evaluate`.
 * @returns {Promise<void>}
 */
async function start({ page }) {
  // Runs in the page context, so `window` and `document` are the page's own.
  const startInPage = async () => {
    const startOptions = { container: document.body };

    await window.virtual.start(startOptions);
  };

  await page.evaluate(startInPage);
}
23+
24+
/**
 * Invoke `window.virtual.act()` inside the page.
 *
 * @param {{ page: object }} options - `page` must provide `evaluate`.
 * @returns {Promise<void>}
 */
async function act({ page }) {
  // Runs in the page context.
  const actInPage = async () => {
    await window.virtual.act();
  };

  await page.evaluate(actInPage);
}
29+
30+
/**
 * Invoke `window.virtual.next()` inside the page.
 *
 * @param {{ page: object }} options - `page` must provide `evaluate`.
 * @returns {Promise<void>}
 */
async function next({ page }) {
  // Runs in the page context.
  const nextInPage = async () => {
    await window.virtual.next();
  };

  await page.evaluate(nextInPage);
}
35+
36+
/**
 * Read the value of `window.virtual.lastSpokenPhrase()` from the page.
 *
 * @param {{ page: object }} options - `page` must provide `evaluate`.
 * @returns {Promise<*>} Whatever the in-page call resolves to, as returned by
 *   `page.evaluate`.
 */
async function lastSpokenPhrase({ page }) {
  // Runs in the page context.
  const readPhraseInPage = async () => {
    const phrase = await window.virtual.lastSpokenPhrase();

    return phrase;
  };

  const spoken = await page.evaluate(readPhraseInPage);

  return spoken;
}
41+
42+
/**
 * Invoke `window.virtual.stop()` inside the page.
 *
 * @param {{ page: object }} options - `page` must provide `evaluate`.
 * @returns {Promise<void>}
 */
async function stop({ page }) {
  // Runs in the page context.
  const stopInPage = async () => {
    await window.virtual.stop();
  };

  await page.evaluate(stopInPage);
}
47+
48+
// Shared Ollama client used for the chat requests; constructed with no options.
const ollama = new Ollama();
49+
50+
/** Commands the model may issue, keyed by symbolic name. */
const COMMANDS_MAP = { ACT: "act", NEXT: "next", STOP: "stop" };

/** Explanation of each command, keyed by command value; listed in the system prompt. */
const COMMANDS_EXPLANATIONS_MAP = Object.fromEntries([
  [COMMANDS_MAP.ACT, "Click on the element."],
  [COMMANDS_MAP.NEXT, "Move to the next element."],
  [
    COMMANDS_MAP.STOP,
    "Stop the screen reader, indicating the goal has been achieved.",
  ],
]);

/** Command values, in the order they are declared in COMMANDS_MAP. */
const COMMANDS = Object.keys(COMMANDS_MAP).map((name) => COMMANDS_MAP[name]);
64+
65+
// Positional CLI arguments: <url> <goal> [maxAttempts].
const args = process.argv.slice(2);

if (args.length < 2) {
  console.error("Usage: node src/index.js <url> <goal> [maxAttempts]");

  process.exit(1);
}

const [url, goal, maxAttemptsArg] = args;

// Falls back to 10 attempts when the optional third argument is omitted or empty.
const maxAttempts = maxAttemptsArg ? Number.parseInt(maxAttemptsArg, 10) : 10;

// A non-numeric value parses to NaN and a value below 1 allows no attempts;
// either way the main loop would never run and the tool would misreport that
// the attempt budget was exhausted. Fail fast with a clear message instead.
if (!Number.isInteger(maxAttempts) || maxAttempts < 1) {
  console.error(
    `Invalid maxAttempts "${maxAttemptsArg}": expected a positive integer.`
  );
  console.error("Usage: node src/index.js <url> <goal> [maxAttempts]");

  process.exit(1);
}
75+
76+
// One "- <command> - explanation: <text>" line per available command.
const COMMAND_HELP_LINES = COMMANDS.map(
  (command) =>
    `- ${command} - explanation: ${COMMANDS_EXPLANATIONS_MAP[command]}`
).join("\n");

// System prompt sent with every chat request: states the goal, lists the
// available commands, and fixes the reply format (command first).
const SYSTEM_MESSAGE = `You are a senior accessibility testing expert controlling a screen reader.

Your goal is: ${goal}.

Based on the screen reader output provided to you, choose the next command to achieve the above goal.

Available commands:
${COMMAND_HELP_LINES}

You MUST always respond in the following format: <command> <any other thoughts after a space>.
It is CRUCIAL that the command is the first thing you output.

Notes:

- If you encounter a modal you must dismiss the modal by navigating to a close button, reject button, or something similar and by using the "act" command. Dialogs and modals will result in a loop if you just navigate, so you MUST interact with a CTA to close it.
- Rely only on the screen reader output you have received and do not assume anything about the state of the page otherwise. E.g. just because you act on something doesn't mean it worked - you must use screen reader output to confirm the action worked.
- As an accessibility expert you should navigate using the full range of commands available to you and make use of accessibility features such as skip links.
- Always double check your logic for what the next command should be.
- The goal is always be achievable by exploring a page fully.
- Don't act on arbitrary links to discover the goal.
- Always double check you have actually achieved your goal before using the "stop" command.
- If you encounter a cookie modal then try to reject cookies.
- If a command doesn't work, try again with a different command. Repeating a command won't help.
- Even if you are confused by the screen reader output, you MUST only respond with one of the provided commands, and any additional content must come afterwards.`;
102+
103+
/**
 * Extract the command keyword from a model reply.
 *
 * Takes the first whitespace-delimited token, lower-cases it and strips any
 * non-letter characters, so replies such as "Next." or "ACT:" still match the
 * lower-case command values.
 *
 * @param {string} reply - Raw text returned by the model.
 * @returns {string} Normalised first token, or "" when the reply is empty.
 */
function parseCommandType(reply) {
  const [firstToken = ""] = reply.trim().split(/\s+/);

  return firstToken.toLowerCase().replace(/[^a-z]/g, "");
}

/**
 * Drop the oldest user/assistant exchange once the history exceeds six
 * entries, so the context sent to the model stays bounded.
 *
 * @param {Array<{role: string, content: string}>} messages - Chat history, mutated in place.
 * @returns {void}
 */
function trimHistory(messages) {
  if (messages.length > 6) {
    // Remove from the front: entries are pushed in chronological order.
    messages.splice(0, 2);
  }
}

// Entry point: open the page, start the screen reader, then repeatedly feed
// the last spoken phrase to the model and execute the command it replies with
// until it issues "stop" or the attempt budget is used up.
(async () => {
  const browser = await chromium.launch({ headless: false });

  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(url);

    await inject({ page });
    await start({ page });

    let attempts = 0;
    let goalAchieved = false;

    const messages = [];

    // Feedback for the model about its previous reply (unknown command).
    let error = null;

    while (attempts < maxAttempts && !goalAchieved) {
      attempts++;

      const spokenPhrase = await lastSpokenPhrase({ page });
      const userMessage = `Screen Reader Output: ${spokenPhrase}\n${
        error ? `Error: ${error}\n` : ""
      }`;
      error = null;
      console.info(userMessage);

      trimHistory(messages);

      messages.push({
        role: "user",
        content: userMessage,
      });

      const response = await ollama.chat({
        model: "llama3.1",
        messages: [
          {
            role: "system",
            content: SYSTEM_MESSAGE,
          },
          ...messages,
        ],
      });

      const command = response.message.content.trim();
      console.log(`Agent: ${command}\n`);

      messages.push({
        role: "assistant",
        content: command,
      });

      const commandType = parseCommandType(command);

      switch (commandType) {
        case COMMANDS_MAP.ACT: {
          await act({ page });

          break;
        }
        case COMMANDS_MAP.NEXT: {
          await next({ page });

          break;
        }
        case COMMANDS_MAP.STOP: {
          goalAchieved = true;

          break;
        }
        default: {
          // Reported back to the model in the next user message.
          error = "Unknown command received.";
          console.error(error);

          break;
        }
      }
    }

    if (goalAchieved) {
      console.log("Goal achieved!");
    } else {
      console.error("Max attempts reached without achieving the goal.");
    }

    await stop({ page });
  } finally {
    // Always release the browser, even when navigation, injection or the
    // model request throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});

0 commit comments

Comments
 (0)