Merge pull request #20 from BadlyDrawnBoy/codex/decide-on-sampling-and-guardrails-implementation

mad-sol-dev · web-flow · commit 9d93105d9b4b · 2025-12-21T19:12:52.000+01:00
Add full-document guardrails for read/search tools
diff --git a/bun.lock b/bun.lock
diff --git a/package.json b/package.json
@@ -92,7 +92,8 @@
     "typedoc": "^0.28.15",
     "typedoc-plugin-markdown": "^4.9.0",
     "typescript": "^5.9.3",
-    "vitepress": "^1.5.0"
+    "vitepress": "^1.5.0",
+    "vitest": "^2.1.4"
   },
   "packageManager": "bun@1.3.1"
 }
diff --git a/src/handlers/readPdf.ts b/src/handlers/readPdf.ts
@@ -25,6 +25,7 @@ const processSingleSource = async (
     includeMetadata: boolean;
     includePageCount: boolean;
     includeImages: boolean;
+    allowFullDocument: boolean;
   }
 ): Promise<PdfSourceResult> => {
   const MAX_CONCURRENT_PAGES = 5;
@@ -51,14 +52,20 @@ const processSingleSource = async (
     const output: PdfResultData = { ...metadataOutput };
 
     // Determine pages to process
-    const { pagesToProcess, invalidPages } = determinePagesToProcess(
+    const { pagesToProcess, invalidPages, guardWarning } = determinePagesToProcess(
       targetPages,
       totalPages,
-      options.includeFullText
+      options.includeFullText,
+      {
+        allowFullDocument: options.allowFullDocument,
+      }
     );
 
     // Add warnings for invalid pages
-    const warnings = buildWarnings(invalidPages, totalPages);
+    const warnings = [
+      ...buildWarnings(invalidPages, totalPages),
+      ...(guardWarning ? [guardWarning] : []),
+    ];
     if (warnings.length > 0) {
       output.warnings = warnings;
     }
@@ -162,8 +169,14 @@ export const readPdf = tool()
   )
   .input(readPdfArgsSchema)
   .handler(async ({ input }) => {
-    const { sources, include_full_text, include_metadata, include_page_count, include_images } =
-      input;
+    const {
+      sources,
+      include_full_text,
+      include_metadata,
+      include_page_count,
+      include_images,
+      allow_full_document,
+    } = input;
 
     // Process sources with concurrency limit to prevent memory exhaustion
     // Processing large PDFs concurrently can consume significant memory
@@ -174,6 +187,7 @@ export const readPdf = tool()
       includeMetadata: include_metadata ?? true,
       includePageCount: include_page_count ?? true,
       includeImages: include_images ?? false,
+      allowFullDocument: allow_full_document ?? false,
     };
 
     for (let i = 0; i < sources.length; i += MAX_CONCURRENT_SOURCES) {
diff --git a/src/handlers/searchPdf.ts b/src/handlers/searchPdf.ts
@@ -185,7 +185,8 @@ const destroyPdfDocument = async (
 const processSearchSource = async (
   source: PdfSource,
   sourceDescription: string,
-  options: SearchOptions
+  options: SearchOptions,
+  allowFullDocument: boolean
 ): Promise<PdfSourceSearchResult> => {
   let pdfDocument: pdfjsLib.PDFDocumentProxy | null = null;
   let result: PdfSourceSearchResult = { source: sourceDescription, success: false };
@@ -197,7 +198,14 @@ const processSearchSource = async (
     pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
     const totalPages = pdfDocument.numPages;
 
-    const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, true);
+    const { pagesToProcess, invalidPages, guardWarning } = determinePagesToProcess(
+      targetPages,
+      totalPages,
+      true,
+      {
+        allowFullDocument,
+      }
+    );
     const pageLabels = await getPageLabelsSafe(pdfDocument, sourceDescription);
     const { hits, truncatedPages } = await collectPageHits(
       pdfDocument,
@@ -207,7 +215,10 @@ const processSearchSource = async (
       options
     );
 
-    const warnings = buildWarnings(invalidPages, totalPages);
+    const warnings = [
+      ...buildWarnings(invalidPages, totalPages),
+      ...(guardWarning ? [guardWarning] : []),
+    ];
 
     result = {
       source: sourceDescription,
@@ -248,6 +259,7 @@ export const pdfSearch = tool()
       max_chars_per_page,
       preserve_whitespace,
       trim_lines,
+      allow_full_document,
     } = input;
 
     const baseOptions: SearchOptions = {
@@ -281,10 +293,15 @@ export const pdfSearch = tool()
       const batchResults = await Promise.all(
         batch.map((source) => {
           const sourceDescription = source.path ?? source.url ?? 'unknown source';
-          return processSearchSource(source, sourceDescription, {
-            ...baseOptions,
-            maxHits: remainingHits,
-          });
+          return processSearchSource(
+            source,
+            sourceDescription,
+            {
+              ...baseOptions,
+              maxHits: remainingHits,
+            },
+            allow_full_document ?? false
+          );
         })
       );
 
diff --git a/src/schemas/pdfSearch.ts b/src/schemas/pdfSearch.ts
@@ -32,6 +32,13 @@ export const pdfSearchArgsSchema = object({
     bool(description('Preserve original whitespace when building text.'))
   ),
   trim_lines: optional(bool(description('Trim leading/trailing whitespace for each text line.'))),
+  allow_full_document: optional(
+    bool(
+      description(
+        'When true, allows searching the entire document if no pages are specified. When false, only a small sample of pages will be processed.'
+      )
+    )
+  ),
 });
 
 export type PdfSearchArgs = InferOutput<typeof pdfSearchArgsSchema>;
diff --git a/src/schemas/readPdf.ts b/src/schemas/readPdf.ts
@@ -1,7 +1,7 @@
 // Vex validation schemas for PDF reading
 
 import { array, bool, description, type InferOutput, object, optional } from '@sylphx/vex';
-import { pageSpecifierSchema, pdfSourceSchema, type PdfSource as SharedPdfSource } from './pdfSource.js';
+import { pdfSourceSchema, type PdfSource as SharedPdfSource } from './pdfSource.js';
 
 // Schema for the read_pdf tool arguments
 export const readPdfArgsSchema = object({
@@ -22,6 +22,13 @@ export const readPdfArgsSchema = object({
       description('Extract and include embedded images from the PDF pages as base64-encoded data.')
     )
   ),
+  allow_full_document: optional(
+    bool(
+      description(
+        'When true, allows reading the entire document if no pages are specified. When false, only a small sample of pages will be processed.'
+      )
+    )
+  ),
 });
 
 export type ReadPdfArgs = InferOutput<typeof readPdfArgsSchema>;
diff --git a/test/handlers/pageGuards.test.ts b/test/handlers/pageGuards.test.ts
@@ -2,18 +2,22 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
 import { pdfGetPageStats } from '../../src/handlers/getPageStats.js';
 import { pdfListImages } from '../../src/handlers/listImages.js';
 import { pdfReadPages } from '../../src/handlers/readPages.js';
+import { readPdf } from '../../src/handlers/readPdf.js';
+import { pdfSearch } from '../../src/handlers/searchPdf.js';
 import { DEFAULT_SAMPLE_PAGE_LIMIT } from '../../src/pdf/parser.js';
 
 const {
   mockExtractPageContent,
   mockExtractImages,
+  mockExtractMetadataAndPageCount,
   mockLoadPdfDocument,
   mockGetCachedPageText,
   mockSetCachedPageText,
   mockFingerprint,
 } = vi.hoisted(() => ({
   mockExtractPageContent: vi.fn(),
   mockExtractImages: vi.fn(),
+  mockExtractMetadataAndPageCount: vi.fn(),
   mockLoadPdfDocument: vi.fn(),
   mockGetCachedPageText: vi.fn(),
   mockSetCachedPageText: vi.fn(),
@@ -30,6 +34,7 @@ vi.mock('../../src/pdf/extractor.js', async () => {
     ...actual,
     extractPageContent: mockExtractPageContent,
     extractImages: mockExtractImages,
+    extractMetadataAndPageCount: mockExtractMetadataAndPageCount,
   };
 });
 
@@ -73,6 +78,7 @@ beforeEach(() => {
   mockGetCachedPageText.mockReturnValue(undefined);
   mockSetCachedPageText.mockImplementation(() => {});
   mockExtractImages.mockResolvedValue([{ page: 1, index: 0, width: 100, height: 100, format: 'png' }]);
+  mockExtractMetadataAndPageCount.mockResolvedValue({ page_count: 12 });
   mockExtractPageContent.mockImplementation(async (_doc, pageNum: number) => [
     { type: 'text', yPosition: 0, textContent: `Page ${pageNum}` },
   ]);
@@ -143,4 +149,36 @@ describe('PDF handlers page guards', () => {
     expect(entry.data?.pages?.map((page) => page.page_number)).toEqual([2, 4]);
     expect(entry.data?.warnings).toBeUndefined();
   });
+
+  it('samples readPdf full-text requests without allow_full_document', async () => {
+    const result = await readPdf.handler({ // guard test: full text is requested but allow_full_document is omitted
+      input: { sources: [{ path: 'doc.pdf' }], include_full_text: true },
+      ctx: {},
+    });
+
+    const payload = JSON.parse(extractTextPayload(result)) as { // handler output is read back as a JSON text payload
+      results: Array<{ data?: { warnings?: string[]; full_text?: string } }>;
+    };
+    const entry = payload.results[0]!; // one source was passed, so only the first result is inspected
+
+    expect(mockExtractPageContent).toHaveBeenCalledTimes(DEFAULT_SAMPLE_PAGE_LIMIT);
+    expect(typeof entry.data?.full_text).toBe('string'); // full_text is still returned; extraction count above must equal the sample limit
+    expect(entry.data?.warnings?.some((warning) => warning.includes('allow_full_document=true'))).toBe(true);
+  }); // last assertion: some warning must mention the allow_full_document=true opt-in
+
+  it('samples pdfSearch requests when pages are omitted', async () => {
+    const result = await pdfSearch.handler({ // guard test: neither pages nor allow_full_document is supplied
+      input: { sources: [{ path: 'doc.pdf' }], query: 'page' },
+      ctx: {},
+    });
+
+    const payload = JSON.parse(extractTextPayload(result)) as { // handler output is read back as a JSON text payload
+      results: Array<{ data?: { warnings?: string[]; total_hits?: number } }>;
+    };
+    const entry = payload.results[0]!; // one source was passed, so only the first result is inspected
+
+    expect(mockExtractPageContent).toHaveBeenCalledTimes(DEFAULT_SAMPLE_PAGE_LIMIT);
+    expect(entry.data?.total_hits).toBeGreaterThan(0); // NOTE(review): presumably 'page' matches the mocked `Page N` text — confirm case handling
+    expect(entry.data?.warnings?.some((warning) => warning.includes('allow_full_document=true'))).toBe(true);
+  }); // last assertion: some warning must mention the allow_full_document=true opt-in
 });