first commit

jlp-craigmorten · jlp-craigmorten · commit 77172743717b · 2025-04-13T18:54:56.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+node_modules
+.DS_Store
diff --git a/.nvmrc b/.nvmrc
@@ -0,0 +1 @@
+22
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Craig Morten
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,49 @@
+# Virtual Screen Reader LLM
+
+A CLI tool that integrates [`@guidepup/virtual-screen-reader`](https://www.npmjs.com/package/@guidepup/virtual-screen-reader) with an LLM (Large Language Model) to navigate and interact with web pages programmatically.
+
+## Installation
+
+### Prerequisites
+
+- **Ollama**: Install the Ollama server for LLM integration. Visit [Ollama](https://ollama.com) for installation instructions.
+
+### Steps
+
+1. Install dependencies:
+
+   ```bash
+   yarn install
+   ```
+
+2. Start the Ollama local server:
+
+   ```bash
+   yarn serve
+   ```
+
+3. Pull the required model (`llama3.1`):
+
+   ```bash
+   yarn pull
+   ```
+
+## Usage
+
+### Running the CLI
+
+```bash
+yarn cli <url> <goal> [maxAttempts]
+```
+
+#### Example
+
+```bash
+yarn cli https://www.example.com "navigate to the About page" 10
+```
+
+### Parameters
+
+- `<url>`: The URL of the web page to navigate.
+- `<goal>`: The goal or task for the virtual screen reader to achieve.
+- `[maxAttempts]` (optional): The maximum number of attempts to achieve the goal (default: 10).
diff --git a/package.json b/package.json
@@ -0,0 +1,20 @@
+{
+  "name": "@guidepup/virtual-screen-reader-llm",
+  "version": "1.0.0",
+  "main": "index.js",
+  "license": "MIT",
+  "bin": {
+    "virtual-screen-reader-llm": "src/index.js"
+  },
+  "dependencies": {
+    "@guidepup/virtual-screen-reader": "^0.30.1",
+    "ollama": "^0.5.14",
+    "playwright": "^1.51.1"
+  },
+  "scripts": {
+    "serve": "ollama serve",
+    "pull": "ollama pull llama3.1",
+    "cli": "node src/index.js",
+    "postinstall": "npx playwright install"
+  }
+}
diff --git a/src/index.js b/src/index.js
@@ -0,0 +1,194 @@
+const { chromium } = require("playwright");
+const { Ollama } = require("ollama");
+
+async function inject({ page }) {
+  await page.addScriptTag({
+    url: "https://unpkg.com/@guidepup/virtual-screen-reader/lib/esm/index.browser.js",
+    type: "module",
+  });
+
+  await page.addScriptTag({
+    content: `import { virtual } from "https://unpkg.com/@guidepup/virtual-screen-reader/lib/esm/index.browser.js"; window.virtual = virtual;`,
+    type: "module",
+  });
+}
+
+async function start({ page }) {
+  await page.evaluate(async () => {
+    await window.virtual.start({
+      container: document.body,
+    });
+  });
+}
+
+async function act({ page }) {
+  await page.evaluate(async () => {
+    await window.virtual.act();
+  });
+}
+
+async function next({ page }) {
+  await page.evaluate(async () => {
+    await window.virtual.next();
+  });
+}
+
+async function lastSpokenPhrase({ page }) {
+  return await page.evaluate(async () => {
+    return await window.virtual.lastSpokenPhrase();
+  });
+}
+
+async function stop({ page }) {
+  await page.evaluate(async () => {
+    await window.virtual.stop();
+  });
+}
+
+const ollama = new Ollama();
+
+const COMMANDS_MAP = {
+  ACT: "act",
+  NEXT: "next",
+  STOP: "stop",
+};
+
+const COMMANDS_EXPLANATIONS_MAP = {
+  [COMMANDS_MAP.ACT]: "Click on the element.",
+  [COMMANDS_MAP.NEXT]: "Move to the next element.",
+  [COMMANDS_MAP.STOP]:
+    "Stop the screen reader, indicating the goal has been achieved.",
+};
+
+const COMMANDS = Object.values(COMMANDS_MAP);
+
+const args = process.argv.slice(2);
+
+if (args.length < 2) {
+  console.error("Usage: node src/index.js <url> <goal> [maxAttempts]");
+
+  process.exit(1);
+}
+
+const [url, goal, maxAttemptsArg] = args;
+const maxAttempts = maxAttemptsArg ? parseInt(maxAttemptsArg, 10) : 10;
+
+const SYSTEM_MESSAGE = `You are a senior accessibility testing expert controlling a screen reader.
+
+Your goal is: ${goal}.
+
+Based on the screen reader output provided to you, choose the next command to achieve the above goal.
+  
+Available commands:
+${COMMANDS.map(
+  (option) => `- ${option} - explanation: ${COMMANDS_EXPLANATIONS_MAP[option]}`
+).join("\n")}
+
+You MUST always respond in the following format: <command> <any other thoughts after a space>.
+It is CRUCIAL that the command is the first thing you output.
+
+Notes:
+
+- If you encounter a modal you must dismiss the modal by navigating to a close button, reject button, or something similar and by using the "act" command. Dialogs and modals will result in a loop if you just navigate, so you MUST interact with a CTA to close it.
+- Rely only on the screen reader output you have received and do not assume anything about the state of the page otherwise. E.g. just because you act on something doesn't mean it worked - you must use screen reader output to confirm the action worked.
+- As an accessibility expert you should navigate using the full range of commands available to you and make use of accessibility features such as skip links.
+- Always double check your logic for what the next command should be.
+- The goal is always be achievable by exploring a page fully.
+- Don't act on arbitrary links to discover the goal.
+- Always double check you have actually achieved your goal before using the "stop" command.
+- If you encounter a cookie modal then try to reject cookies.
+- If a command doesn't work, try again with a different command. Repeating a command won't help.
+- Even if you are confused by the screen reader output, you MUST only respond with one of the provided commands, and any additional content must come afterwards.`;
+
+(async () => {
+  const browser = await chromium.launch({ headless: false });
+  const context = await browser.newContext();
+  const page = await context.newPage();
+
+  await page.goto(url);
+
+  await inject({ page });
+  await start({ page });
+
+  let attempts = 0;
+  let goalAchieved = false;
+
+  const messages = [];
+
+  let error = null;
+
+  while (attempts < maxAttempts && !goalAchieved) {
+    attempts++;
+
+    const spokenPhrase = await lastSpokenPhrase({ page });
+    const userMessage = `Screen Reader Output: ${spokenPhrase}\n${
+      error ? `Error: ${error}\n` : ""
+    }`;
+    error = null;
+    console.info(userMessage);
+
+    if (messages.length > 6) {
+      messages.unshift();
+      messages.unshift();
+    }
+
+    messages.push({
+      role: "user",
+      content: userMessage,
+    });
+
+    const response = await ollama.chat({
+      model: "llama3.1",
+      messages: [
+        {
+          role: "system",
+          content: SYSTEM_MESSAGE,
+        },
+        ...messages,
+      ],
+    });
+
+    const command = response.message.content.trim();
+    console.log(`Agent: ${command}\n`);
+
+    messages.push({
+      role: "assistant",
+      content: command,
+    });
+
+    const [commandType] = command.split(/\s+/gi);
+
+    switch (commandType) {
+      case COMMANDS_MAP.ACT: {
+        await act({ page });
+
+        break;
+      }
+      case COMMANDS_MAP.NEXT: {
+        await next({ page });
+
+        break;
+      }
+      case COMMANDS_MAP.STOP: {
+        goalAchieved = true;
+
+        break;
+      }
+      default: {
+        error = "Unknown command received.";
+        console.error(error);
+
+        break;
+      }
+    }
+  }
+
+  if (goalAchieved) {
+    console.log("Goal achieved!");
+  } else {
+    console.error("Max attempts reached without achieving the goal.");
+  }
+
+  await stop({ page });
+  await browser.close();
+})();
diff --git a/yarn.lock b/yarn.lock