feat: Add URL support to all PDF tools

shtse8 · shtse8 · commit 86d5f751c140 · 2025-04-04T17:34:51.000+01:00
diff --git a/README.md b/README.md
@@ -96,31 +96,35 @@ This server equips your AI agent with the following tools for PDF interaction:
 
 - 📄 **`read_pdf_all_text`:**
   - **Description:** Reads all text content and basic information (metadata,
-    page count) from a specified PDF file.
-  - **Input:** `{ "path": "string" }` (Relative path to the PDF file)
+    page count) from a specified PDF file, either local or via URL.
+  - **Input:** `{ "path": "string" }` OR `{ "url": "string" }` (Provide either
+    a relative path OR a URL, but not both)
   - **Output:** An object containing `text`, `numPages`, `numRenderedPages`,
     `info`, `metadata`, and `version` from the PDF.
 
 - 📑 **`read_pdf_page_text`:**
-  - **Description:** Reads text content from specific pages of a PDF file.
-  - **Input:** `{ "path": "string", "pages": "number[] | string" }` (Relative
-    path and an array of 1-based page numbers like `[1, 3, 5]` or a string range
-    like `'1,3-5,7'`)
+  - **Description:** Reads text content from specific pages of a PDF file,
+    either local or via URL.
+  - **Input:** `{ "path": "string", "pages": "number[] | string" }` OR
+    `{ "url": "string", "pages": "number[] | string" }` (Provide path OR URL,
+    plus 1-based pages as an array like `[1, 3, 5]` or a range like `'1,3-5,7'`)
   - **Output:** An object containing an array `pages` (each element has `page`
     number and extracted `text`) and optionally `missingPages` if some requested
     pages couldn't be processed.
 
 - ℹ️ **`get_pdf_metadata`:**
-  - **Description:** Reads metadata (like author, title, creator, producer,
-    dates) and general info from a PDF file without extracting all text content
-    explicitly in the output (though it's parsed internally).
-  - **Input:** `{ "path": "string" }` (Relative path to the PDF file)
+  - **Description:** Reads metadata and general info from a PDF file, either
+    local or via URL.
+  - **Input:** `{ "path": "string" }` OR `{ "url": "string" }` (Provide either
+    a relative path OR a URL, but not both)
   - **Output:** An object containing `info`, `metadata`, `numPages`, and
     `version`.
 
 - #️⃣ **`get_pdf_page_count`:**
-  - **Description:** Quickly gets the total number of pages in a PDF file.
-  - **Input:** `{ "path": "string" }` (Relative path to the PDF file)
+  - **Description:** Quickly gets the total number of pages in a PDF file,
+    either local or via URL.
+  - **Input:** `{ "path": "string" }` OR `{ "url": "string" }` (Provide either
+    a relative path OR a URL, but not both)
   - **Output:** An object containing `numPages`.
 
 ---
diff --git a/memory-bank/activeContext.md b/memory-bank/activeContext.md
@@ -11,24 +11,29 @@ implementing the core PDF reading tools based on the `filesystem-mcp` template.
 - Updated `package.json` with new project name (`@shtse8/pdf-reader-mcp`),
   version, description, and added `pdf-parse` dependency.
 - Ran `npm install`.
-- Created handler files for the four PDF tools:
+- Created handler files for the four PDF tools (initially local path only):
   - `src/handlers/readPdfAllText.ts`
   - `src/handlers/readPdfPageText.ts`
   - `src/handlers/getPdfMetadata.ts`
   - `src/handlers/getPdfPageCount.ts`
 - Refactored handlers to follow the `ToolDefinition` export pattern found in
   `filesystem-mcp` (instead of using `defineHandler`).
 - Integrated the new tool definitions into `src/handlers/index.ts`.
-- Updated `README.md` to reflect the PDF Reader functionality and tools.
+- Updated `README.md` to reflect the PDF Reader functionality and tools
+  (initially local path only).
+- Removed unused filesystem handlers (e.g., listFiles, editFile) from
+  `src/handlers/index.ts` and deleted corresponding `.ts` files.
+- **Added URL support:** Modified all PDF handlers and Zod schemas to accept
+  either a local `path` or a remote `url`. Updated `README.md` again.
 - Updated Memory Bank files (`techContext.md`, `systemPatterns.md`,
   `projectbrief.md`, `productContext.md`) with initial PDF Reader context.
 - Removed unused filesystem handlers (e.g., listFiles, editFile) from
   `src/handlers/index.ts` and deleted corresponding `.ts` files.
 
 ## 3. Next Steps
 
-- Update `memory-bank/progress.md` to reflect handler removal.
-- Build the project (`npm run build`) again after removing handlers.
+- Update `memory-bank/progress.md` to reflect URL support.
+- Build the project (`npm run build`) again after adding URL support.
 - Consider adding basic tests for the PDF handlers.
 - Commit the initial implementation to the Git repository.
 - Potentially test the server using `@modelcontextprotocol/inspector` or by
@@ -41,3 +46,4 @@ implementing the core PDF reading tools based on the `filesystem-mcp` template.
 - `read_pdf_page_text` uses the `pagerender` callback for potentially better
   accuracy on specific pages.
 - Removed inherited filesystem tools to focus solely on PDF functionality.
+- Added support for fetching PDFs via URL using `fetch`.
diff --git a/memory-bank/progress.md b/memory-bank/progress.md
@@ -4,7 +4,7 @@
 
 - **Project Setup:** Cloned from `filesystem-mcp`, dependencies installed
   (`pdf-parse` added).
-- **Core Tool Handlers:**
+- **Core Tool Handlers (Support both local path and URL):**
   - `read_pdf_all_text`: Implemented, integrated.
   - `read_pdf_page_text`: Implemented (using `pagerender`), integrated.
   - `get_pdf_metadata`: Implemented, integrated.
@@ -18,14 +18,15 @@
 
 ## 2. What's Left to Build/Verify
 
-- **Compilation:** Need to run `npm run build` to check for TypeScript errors.
+- **Compilation:** Need to run `npm run build` again after adding URL support.
 - **Runtime Testing:**
   - Verify the server starts correctly.
-  - Test each PDF tool with actual PDF files (various types if possible) using
+  - Test each PDF tool with both local paths and URLs using
     `@modelcontextprotocol/inspector` or a live agent.
   - Specifically test `read_pdf_page_text` with different page ranges and edge
     cases.
-  - Verify error handling (e.g., file not found, corrupted PDF).
+  - Verify error handling (e.g., file not found, URL fetch errors, corrupted
+    PDF).
 - **Testing Framework:** Consider adding automated tests (e.g., using Jest or
   Vitest) for handlers.
 - **Refinement:** Review code for potential improvements or edge cases missed.
@@ -35,16 +36,17 @@
 
 ## 3. Current Status
 
-Initial implementation of the core PDF reading tools is complete. Documentation
-updated. Ready for build and testing.
+Implementation of core PDF reading tools (with URL support) is complete.
+Documentation updated. Ready for final build and testing.
 
 ## 4. Known Issues/Risks
 
 - **`pdf-parse` Limitations:** The accuracy of text extraction, especially for
   complex layouts or scanned PDFs, depends heavily on `pdf-parse`. Page number
   detection in `pagerender` might need verification (1-based vs 0-based).
-- **Error Handling:** Current error handling is basic; more specific error types
-  or details might be needed based on testing.
+- **Error Handling:** Basic error handling for file access and URL fetching
+  is implemented. More specific PDF parsing errors might need refinement based
+  on testing.
 - **Performance:** Performance on very large PDF files hasn't been tested.
 - **Inherited Filesystem Tools:** Removed. The server now focuses exclusively on
   PDF reading tools. Documentation reflects this.
diff --git a/src/handlers/getPdfMetadata.ts b/src/handlers/getPdfMetadata.ts
@@ -7,8 +7,12 @@ import type { ToolDefinition } from './index.js';
 
 // Define the Zod schema for input arguments
 const GetPdfMetadataArgsSchema = z.object({
-  path: z.string().min(1, 'Path cannot be empty.'),
-}).strict();
+  path: z.string().min(1).optional().describe("Relative path to the local PDF file."),
+  url: z.string().url().optional().describe("URL of the PDF file."),
+}).strict().refine(
+    (data) => (data.path && !data.url) || (!data.path && data.url), // Ensure either path or url is provided, but not both
+    { message: "Either 'path' or 'url' must be provided, but not both." }
+);
 
 // Infer TypeScript type for arguments
 type GetPdfMetadataArgs = z.infer<typeof GetPdfMetadataArgsSchema>;
@@ -25,10 +29,29 @@ const handleGetPdfMetadataFunc = async (args: unknown) => {
     throw new McpError(ErrorCode.InvalidParams, 'Argument validation failed');
   }
 
-  const safePath = resolvePath(parsedArgs.path);
+  const { path: relativePath, url } = parsedArgs;
+  let dataBuffer: Buffer;
+  let sourceDescription: string = 'unknown source'; // Initialize
 
   try {
-    const dataBuffer = await fs.readFile(safePath);
+    // Fetch or read the PDF buffer
+    if (relativePath) {
+      sourceDescription = `'${relativePath}'`;
+      const safePath = resolvePath(relativePath);
+      dataBuffer = await fs.readFile(safePath);
+    } else if (url) {
+      sourceDescription = `'${url}'`;
+      const response = await fetch(url);
+      if (!response.ok) {
+        throw new McpError(ErrorCode.InternalError, `Failed to fetch PDF from ${url}. Status: ${response.status} ${response.statusText}`);
+      }
+      const arrayBuffer = await response.arrayBuffer();
+      dataBuffer = Buffer.from(arrayBuffer);
+    } else {
+      throw new McpError(ErrorCode.InvalidParams, "Missing 'path' or 'url'.");
+    }
+
+    // Now parse the buffer
     // We only need metadata, but pdf-parse reads everything anyway
     const data = await pdf(dataBuffer);
 
@@ -41,11 +64,15 @@ const handleGetPdfMetadataFunc = async (args: unknown) => {
         version: data.version,
     };
   } catch (error: any) {
-    let errorMessage = `Failed to read or parse PDF for metadata at '${parsedArgs.path}'.`;
-    if (error.code === 'ENOENT') {
-      errorMessage = `File not found at '${parsedArgs.path}'. Resolved to: ${safePath}`;
+    if (error instanceof McpError) throw error; // Re-throw known MCP errors
+
+    let errorMessage = `Failed to read or parse PDF for metadata from ${sourceDescription}.`;
+    // Keep ENOENT check for local files
+    if (relativePath && error.code === 'ENOENT') {
+      const safePath = resolvePath(relativePath); // Resolve again for error message
+      errorMessage = `File not found at '${relativePath}'. Resolved to: ${safePath}`;
     } else if (error instanceof Error) {
-      errorMessage += ` Reason: ${error.message}`;
+       errorMessage += ` Reason: ${error.message}`;
     } else {
        errorMessage += ` Unknown error: ${String(error)}`;
     }
diff --git a/src/handlers/getPdfPageCount.ts b/src/handlers/getPdfPageCount.ts
@@ -7,8 +7,12 @@ import type { ToolDefinition } from './index.js';
 
 // Define the Zod schema for input arguments
 const GetPdfPageCountArgsSchema = z.object({
-  path: z.string().min(1, 'Path cannot be empty.'),
-}).strict();
+  path: z.string().min(1).optional().describe("Relative path to the local PDF file."),
+  url: z.string().url().optional().describe("URL of the PDF file."),
+}).strict().refine(
+    (data) => (data.path && !data.url) || (!data.path && data.url), // Ensure either path or url is provided, but not both
+    { message: "Either 'path' or 'url' must be provided, but not both." }
+);
 
 // Infer TypeScript type for arguments
 type GetPdfPageCountArgs = z.infer<typeof GetPdfPageCountArgsSchema>;
@@ -25,10 +29,29 @@ const handleGetPdfPageCountFunc = async (args: unknown) => {
     throw new McpError(ErrorCode.InvalidParams, 'Argument validation failed');
   }
 
-  const safePath = resolvePath(parsedArgs.path);
+  const { path: relativePath, url } = parsedArgs;
+  let dataBuffer: Buffer;
+  let sourceDescription: string = 'unknown source'; // Initialize
 
   try {
-    const dataBuffer = await fs.readFile(safePath);
+    // Fetch or read the PDF buffer
+    if (relativePath) {
+      sourceDescription = `'${relativePath}'`;
+      const safePath = resolvePath(relativePath);
+      dataBuffer = await fs.readFile(safePath);
+    } else if (url) {
+      sourceDescription = `'${url}'`;
+      const response = await fetch(url);
+      if (!response.ok) {
+        throw new McpError(ErrorCode.InternalError, `Failed to fetch PDF from ${url}. Status: ${response.status} ${response.statusText}`);
+      }
+      const arrayBuffer = await response.arrayBuffer();
+      dataBuffer = Buffer.from(arrayBuffer);
+    } else {
+      throw new McpError(ErrorCode.InvalidParams, "Missing 'path' or 'url'.");
+    }
+
+    // Now parse the buffer
     // We only need the page count, but pdf-parse reads everything
     const data = await pdf(dataBuffer);
 
@@ -37,11 +60,15 @@ const handleGetPdfPageCountFunc = async (args: unknown) => {
         numPages: data.numpages,
     };
   } catch (error: any) {
-    let errorMessage = `Failed to read or parse PDF for page count at '${parsedArgs.path}'.`;
-    if (error.code === 'ENOENT') {
-      errorMessage = `File not found at '${parsedArgs.path}'. Resolved to: ${safePath}`;
+    if (error instanceof McpError) throw error; // Re-throw known MCP errors
+
+    let errorMessage = `Failed to read or parse PDF for page count from ${sourceDescription}.`;
+    // Keep ENOENT check for local files
+    if (relativePath && error.code === 'ENOENT') {
+      const safePath = resolvePath(relativePath); // Resolve again for error message
+      errorMessage = `File not found at '${relativePath}'. Resolved to: ${safePath}`;
     } else if (error instanceof Error) {
-      errorMessage += ` Reason: ${error.message}`;
+       errorMessage += ` Reason: ${error.message}`;
     } else {
        errorMessage += ` Unknown error: ${String(error)}`;
     }
diff --git a/src/handlers/readPdfAllText.ts b/src/handlers/readPdfAllText.ts
@@ -7,8 +7,12 @@ import type { ToolDefinition } from './index.js'; // Import the internal interfa
 
 // 1. Define the Zod schema for input arguments
 const ReadPdfAllTextArgsSchema = z.object({
-  path: z.string().min(1, 'Path cannot be empty.'),
-}).strict(); // Use strict to prevent unexpected arguments
+  path: z.string().min(1).optional().describe("Relative path to the local PDF file."),
+  url: z.string().url().optional().describe("URL of the PDF file."),
+}).strict().refine(
+    (data) => (data.path && !data.url) || (!data.path && data.url), // Ensure either path or url is provided, but not both
+    { message: "Either 'path' or 'url' must be provided, but not both." }
+);
 
 // Infer TypeScript type for arguments
 type ReadPdfAllTextArgs = z.infer<typeof ReadPdfAllTextArgsSchema>;
@@ -26,10 +30,29 @@ const handleReadPdfAllTextFunc = async (args: unknown) => {
     throw new McpError(ErrorCode.InvalidParams, 'Argument validation failed');
   }
 
-  const safePath = resolvePath(parsedArgs.path);
-
+  const { path: relativePath, url } = parsedArgs;
+  let dataBuffer: Buffer;
+  let sourceDescription: string = 'unknown source'; // Initialize here
   try {
-    const dataBuffer = await fs.readFile(safePath);
+    if (relativePath) {
+      sourceDescription = `'${relativePath}'`;
+      const safePath = resolvePath(relativePath);
+      dataBuffer = await fs.readFile(safePath);
+    } else if (url) {
+      sourceDescription = `'${url}'`;
+      const response = await fetch(url);
+      if (!response.ok) {
+        // Use InternalError or a more generic code if NetworkError doesn't exist
+        throw new McpError(ErrorCode.InternalError, `Failed to fetch PDF from ${url}. Status: ${response.status} ${response.statusText}`);
+      }
+      const arrayBuffer = await response.arrayBuffer();
+      dataBuffer = Buffer.from(arrayBuffer);
+    } else {
+      // This should be caught by Zod refine, but as a safeguard:
+      throw new McpError(ErrorCode.InvalidParams, "Missing 'path' or 'url'.");
+    }
+
+    // Now parse the buffer
     const data = await pdf(dataBuffer);
 
     // pdf-parse returns numpages, numrender, info, metadata, text, version
@@ -45,12 +68,15 @@ const handleReadPdfAllTextFunc = async (args: unknown) => {
         version: data.version,
     };
   } catch (error: any) {
-    // Provide a more specific error message if possible
-    let errorMessage = `Failed to read or parse PDF at '${parsedArgs.path}'.`;
-    if (error.code === 'ENOENT') {
-      errorMessage = `File not found at '${parsedArgs.path}'. Resolved to: ${safePath}`;
+    if (error instanceof McpError) throw error; // Re-throw known MCP errors
+
+    let errorMessage = `Failed to read or parse PDF from ${sourceDescription}.`; // Remove default value here, already initialized
+    // Keep ENOENT check for local files
+    if (relativePath && error.code === 'ENOENT') {
+      const safePath = resolvePath(relativePath); // Resolve again for error message
+      errorMessage = `File not found at '${relativePath}'. Resolved to: ${safePath}`;
     } else if (error instanceof Error) {
-      errorMessage += ` Reason: ${error.message}`;
+       errorMessage += ` Reason: ${error.message}`;
     } else {
        errorMessage += ` Unknown error: ${String(error)}`;
     }
diff --git a/src/handlers/readPdfPageText.ts b/src/handlers/readPdfPageText.ts