Skip to content

Commit 9d93105

Browse files
authored
Merge pull request #20 from BadlyDrawnBoy/codex/decide-on-sampling-and-guardrails-implementation
Add full-document guardrails for read/search tools
2 parents 37538a1 + d5f425f commit 9d93105

File tree

7 files changed

+162
-15
lines changed

7 files changed

+162
-15
lines changed

bun.lock

Lines changed: 64 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@
9292
"typedoc": "^0.28.15",
9393
"typedoc-plugin-markdown": "^4.9.0",
9494
"typescript": "^5.9.3",
95-
"vitepress": "^1.5.0"
95+
"vitepress": "^1.5.0",
96+
"vitest": "^2.1.4"
9697
},
9798
"packageManager": "bun@1.3.1"
9899
}

src/handlers/readPdf.ts

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ const processSingleSource = async (
2525
includeMetadata: boolean;
2626
includePageCount: boolean;
2727
includeImages: boolean;
28+
allowFullDocument: boolean;
2829
}
2930
): Promise<PdfSourceResult> => {
3031
const MAX_CONCURRENT_PAGES = 5;
@@ -51,14 +52,20 @@ const processSingleSource = async (
5152
const output: PdfResultData = { ...metadataOutput };
5253

5354
// Determine pages to process
54-
const { pagesToProcess, invalidPages } = determinePagesToProcess(
55+
const { pagesToProcess, invalidPages, guardWarning } = determinePagesToProcess(
5556
targetPages,
5657
totalPages,
57-
options.includeFullText
58+
options.includeFullText,
59+
{
60+
allowFullDocument: options.allowFullDocument,
61+
}
5862
);
5963

6064
// Add warnings for invalid pages
61-
const warnings = buildWarnings(invalidPages, totalPages);
65+
const warnings = [
66+
...buildWarnings(invalidPages, totalPages),
67+
...(guardWarning ? [guardWarning] : []),
68+
];
6269
if (warnings.length > 0) {
6370
output.warnings = warnings;
6471
}
@@ -162,8 +169,14 @@ export const readPdf = tool()
162169
)
163170
.input(readPdfArgsSchema)
164171
.handler(async ({ input }) => {
165-
const { sources, include_full_text, include_metadata, include_page_count, include_images } =
166-
input;
172+
const {
173+
sources,
174+
include_full_text,
175+
include_metadata,
176+
include_page_count,
177+
include_images,
178+
allow_full_document,
179+
} = input;
167180

168181
// Process sources with concurrency limit to prevent memory exhaustion
169182
// Processing large PDFs concurrently can consume significant memory
@@ -174,6 +187,7 @@ export const readPdf = tool()
174187
includeMetadata: include_metadata ?? true,
175188
includePageCount: include_page_count ?? true,
176189
includeImages: include_images ?? false,
190+
allowFullDocument: allow_full_document ?? false,
177191
};
178192

179193
for (let i = 0; i < sources.length; i += MAX_CONCURRENT_SOURCES) {

src/handlers/searchPdf.ts

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,8 @@ const destroyPdfDocument = async (
185185
const processSearchSource = async (
186186
source: PdfSource,
187187
sourceDescription: string,
188-
options: SearchOptions
188+
options: SearchOptions,
189+
allowFullDocument: boolean
189190
): Promise<PdfSourceSearchResult> => {
190191
let pdfDocument: pdfjsLib.PDFDocumentProxy | null = null;
191192
let result: PdfSourceSearchResult = { source: sourceDescription, success: false };
@@ -197,7 +198,14 @@ const processSearchSource = async (
197198
pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
198199
const totalPages = pdfDocument.numPages;
199200

200-
const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, true);
201+
const { pagesToProcess, invalidPages, guardWarning } = determinePagesToProcess(
202+
targetPages,
203+
totalPages,
204+
true,
205+
{
206+
allowFullDocument,
207+
}
208+
);
201209
const pageLabels = await getPageLabelsSafe(pdfDocument, sourceDescription);
202210
const { hits, truncatedPages } = await collectPageHits(
203211
pdfDocument,
@@ -207,7 +215,10 @@ const processSearchSource = async (
207215
options
208216
);
209217

210-
const warnings = buildWarnings(invalidPages, totalPages);
218+
const warnings = [
219+
...buildWarnings(invalidPages, totalPages),
220+
...(guardWarning ? [guardWarning] : []),
221+
];
211222

212223
result = {
213224
source: sourceDescription,
@@ -248,6 +259,7 @@ export const pdfSearch = tool()
248259
max_chars_per_page,
249260
preserve_whitespace,
250261
trim_lines,
262+
allow_full_document,
251263
} = input;
252264

253265
const baseOptions: SearchOptions = {
@@ -281,10 +293,15 @@ export const pdfSearch = tool()
281293
const batchResults = await Promise.all(
282294
batch.map((source) => {
283295
const sourceDescription = source.path ?? source.url ?? 'unknown source';
284-
return processSearchSource(source, sourceDescription, {
285-
...baseOptions,
286-
maxHits: remainingHits,
287-
});
296+
return processSearchSource(
297+
source,
298+
sourceDescription,
299+
{
300+
...baseOptions,
301+
maxHits: remainingHits,
302+
},
303+
allow_full_document ?? false
304+
);
288305
})
289306
);
290307

src/schemas/pdfSearch.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,13 @@ export const pdfSearchArgsSchema = object({
3232
bool(description('Preserve original whitespace when building text.'))
3333
),
3434
trim_lines: optional(bool(description('Trim leading/trailing whitespace for each text line.'))),
35+
allow_full_document: optional(
36+
bool(
37+
description(
38+
'When true, allows searching the entire document if no pages are specified. When false, only a small sample of pages will be processed.'
39+
)
40+
)
41+
),
3542
});
3643

3744
export type PdfSearchArgs = InferOutput<typeof pdfSearchArgsSchema>;

src/schemas/readPdf.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Vex validation schemas for PDF reading
22

33
import { array, bool, description, type InferOutput, object, optional } from '@sylphx/vex';
4-
import { pageSpecifierSchema, pdfSourceSchema, type PdfSource as SharedPdfSource } from './pdfSource.js';
4+
import { pdfSourceSchema, type PdfSource as SharedPdfSource } from './pdfSource.js';
55

66
// Schema for the read_pdf tool arguments
77
export const readPdfArgsSchema = object({
@@ -22,6 +22,13 @@ export const readPdfArgsSchema = object({
2222
description('Extract and include embedded images from the PDF pages as base64-encoded data.')
2323
)
2424
),
25+
allow_full_document: optional(
26+
bool(
27+
description(
28+
'When true, allows reading the entire document if no pages are specified. When false, only a small sample of pages will be processed.'
29+
)
30+
)
31+
),
2532
});
2633

2734
export type ReadPdfArgs = InferOutput<typeof readPdfArgsSchema>;

test/handlers/pageGuards.test.ts

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,22 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
22
import { pdfGetPageStats } from '../../src/handlers/getPageStats.js';
33
import { pdfListImages } from '../../src/handlers/listImages.js';
44
import { pdfReadPages } from '../../src/handlers/readPages.js';
5+
import { readPdf } from '../../src/handlers/readPdf.js';
6+
import { pdfSearch } from '../../src/handlers/searchPdf.js';
57
import { DEFAULT_SAMPLE_PAGE_LIMIT } from '../../src/pdf/parser.js';
68

79
const {
810
mockExtractPageContent,
911
mockExtractImages,
12+
mockExtractMetadataAndPageCount,
1013
mockLoadPdfDocument,
1114
mockGetCachedPageText,
1215
mockSetCachedPageText,
1316
mockFingerprint,
1417
} = vi.hoisted(() => ({
1518
mockExtractPageContent: vi.fn(),
1619
mockExtractImages: vi.fn(),
20+
mockExtractMetadataAndPageCount: vi.fn(),
1721
mockLoadPdfDocument: vi.fn(),
1822
mockGetCachedPageText: vi.fn(),
1923
mockSetCachedPageText: vi.fn(),
@@ -30,6 +34,7 @@ vi.mock('../../src/pdf/extractor.js', async () => {
3034
...actual,
3135
extractPageContent: mockExtractPageContent,
3236
extractImages: mockExtractImages,
37+
extractMetadataAndPageCount: mockExtractMetadataAndPageCount,
3338
};
3439
});
3540

@@ -73,6 +78,7 @@ beforeEach(() => {
7378
mockGetCachedPageText.mockReturnValue(undefined);
7479
mockSetCachedPageText.mockImplementation(() => {});
7580
mockExtractImages.mockResolvedValue([{ page: 1, index: 0, width: 100, height: 100, format: 'png' }]);
81+
mockExtractMetadataAndPageCount.mockResolvedValue({ page_count: 12 });
7682
mockExtractPageContent.mockImplementation(async (_doc, pageNum: number) => [
7783
{ type: 'text', yPosition: 0, textContent: `Page ${pageNum}` },
7884
]);
@@ -143,4 +149,36 @@ describe('PDF handlers page guards', () => {
143149
expect(entry.data?.pages?.map((page) => page.page_number)).toEqual([2, 4]);
144150
expect(entry.data?.warnings).toBeUndefined();
145151
});
152+
153+
it('samples readPdf full-text requests without allow_full_document', async () => {
154+
const result = await readPdf.handler({
155+
input: { sources: [{ path: 'doc.pdf' }], include_full_text: true },
156+
ctx: {},
157+
});
158+
159+
const payload = JSON.parse(extractTextPayload(result)) as {
160+
results: Array<{ data?: { warnings?: string[]; full_text?: string } }>;
161+
};
162+
const entry = payload.results[0]!;
163+
164+
expect(mockExtractPageContent).toHaveBeenCalledTimes(DEFAULT_SAMPLE_PAGE_LIMIT);
165+
expect(typeof entry.data?.full_text).toBe('string');
166+
expect(entry.data?.warnings?.some((warning) => warning.includes('allow_full_document=true'))).toBe(true);
167+
});
168+
169+
it('samples pdfSearch requests when pages are omitted', async () => {
170+
const result = await pdfSearch.handler({
171+
input: { sources: [{ path: 'doc.pdf' }], query: 'page' },
172+
ctx: {},
173+
});
174+
175+
const payload = JSON.parse(extractTextPayload(result)) as {
176+
results: Array<{ data?: { warnings?: string[]; total_hits?: number } }>;
177+
};
178+
const entry = payload.results[0]!;
179+
180+
expect(mockExtractPageContent).toHaveBeenCalledTimes(DEFAULT_SAMPLE_PAGE_LIMIT);
181+
expect(entry.data?.total_hits).toBeGreaterThan(0);
182+
expect(entry.data?.warnings?.some((warning) => warning.includes('allow_full_document=true'))).toBe(true);
183+
});
146184
});

0 commit comments

Comments
 (0)