Skip to content

Commit 4368e82

Browse files
committed
feat: add native PDF multimodal analysis support
- Add extractPDFAsBase64() function to extract PDFs as base64 data - Add validatePDFForMultimodal() with provider-specific size limits - Update mentions system to handle PDF attachments - Modify Task.ts to format PDF attachments as document blocks for API - Add comprehensive tests for PDF multimodal functionality - Maintain backward compatibility with existing code Fixes #7266
1 parent 241df17 commit 4368e82

File tree

7 files changed

+305
-82
lines changed

7 files changed

+305
-82
lines changed

src/core/mentions/__tests__/index.spec.ts

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ describe("parseMentions - URL error handling", () => {
4141

4242
expect(consoleErrorSpy).toHaveBeenCalledWith("Error fetching URL https://example.com:", timeoutError)
4343
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith("common:errors.url_fetch_error_with_url")
44-
expect(result).toContain("Error fetching content: Navigation timeout of 30000 ms exceeded")
44+
expect(result.text).toContain("Error fetching content: Navigation timeout of 30000 ms exceeded")
4545
})
4646

4747
it("should handle DNS resolution errors", async () => {
@@ -51,7 +51,7 @@ describe("parseMentions - URL error handling", () => {
5151
const result = await parseMentions("Check @https://nonexistent.example", "/test", mockUrlContentFetcher)
5252

5353
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith("common:errors.url_fetch_error_with_url")
54-
expect(result).toContain("Error fetching content: net::ERR_NAME_NOT_RESOLVED")
54+
expect(result.text).toContain("Error fetching content: net::ERR_NAME_NOT_RESOLVED")
5555
})
5656

5757
it("should handle network disconnection errors", async () => {
@@ -61,7 +61,7 @@ describe("parseMentions - URL error handling", () => {
6161
const result = await parseMentions("Check @https://example.com", "/test", mockUrlContentFetcher)
6262

6363
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith("common:errors.url_fetch_error_with_url")
64-
expect(result).toContain("Error fetching content: net::ERR_INTERNET_DISCONNECTED")
64+
expect(result.text).toContain("Error fetching content: net::ERR_INTERNET_DISCONNECTED")
6565
})
6666

6767
it("should handle 403 Forbidden errors", async () => {
@@ -71,7 +71,7 @@ describe("parseMentions - URL error handling", () => {
7171
const result = await parseMentions("Check @https://example.com", "/test", mockUrlContentFetcher)
7272

7373
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith("common:errors.url_fetch_error_with_url")
74-
expect(result).toContain("Error fetching content: 403 Forbidden")
74+
expect(result.text).toContain("Error fetching content: 403 Forbidden")
7575
})
7676

7777
it("should handle 404 Not Found errors", async () => {
@@ -81,7 +81,7 @@ describe("parseMentions - URL error handling", () => {
8181
const result = await parseMentions("Check @https://example.com/missing", "/test", mockUrlContentFetcher)
8282

8383
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith("common:errors.url_fetch_error_with_url")
84-
expect(result).toContain("Error fetching content: 404 Not Found")
84+
expect(result.text).toContain("Error fetching content: 404 Not Found")
8585
})
8686

8787
it("should handle generic errors with fallback message", async () => {
@@ -91,7 +91,7 @@ describe("parseMentions - URL error handling", () => {
9191
const result = await parseMentions("Check @https://example.com", "/test", mockUrlContentFetcher)
9292

9393
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith("common:errors.url_fetch_error_with_url")
94-
expect(result).toContain("Error fetching content: Some unexpected error")
94+
expect(result.text).toContain("Error fetching content: Some unexpected error")
9595
})
9696

9797
it("should handle non-Error objects thrown", async () => {
@@ -101,7 +101,7 @@ describe("parseMentions - URL error handling", () => {
101101
const result = await parseMentions("Check @https://example.com", "/test", mockUrlContentFetcher)
102102

103103
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith("common:errors.url_fetch_error_with_url")
104-
expect(result).toContain("Error fetching content:")
104+
expect(result.text).toContain("Error fetching content:")
105105
})
106106

107107
it("should handle browser launch errors correctly", async () => {
@@ -113,7 +113,7 @@ describe("parseMentions - URL error handling", () => {
113113
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith(
114114
"Error fetching content for https://example.com: Failed to launch browser",
115115
)
116-
expect(result).toContain("Error fetching content: Failed to launch browser")
116+
expect(result.text).toContain("Error fetching content: Failed to launch browser")
117117
// Should not attempt to fetch URL if browser launch failed
118118
expect(mockUrlContentFetcher.urlToMarkdown).not.toHaveBeenCalled()
119119
})
@@ -127,7 +127,7 @@ describe("parseMentions - URL error handling", () => {
127127
expect(vscode.window.showErrorMessage).toHaveBeenCalledWith(
128128
"Error fetching content for https://example.com: String error",
129129
)
130-
expect(result).toContain("Error fetching content: String error")
130+
expect(result.text).toContain("Error fetching content: String error")
131131
})
132132

133133
it("should successfully fetch URL content when no errors occur", async () => {
@@ -136,9 +136,9 @@ describe("parseMentions - URL error handling", () => {
136136
const result = await parseMentions("Check @https://example.com", "/test", mockUrlContentFetcher)
137137

138138
expect(vscode.window.showErrorMessage).not.toHaveBeenCalled()
139-
expect(result).toContain('<url_content url="https://example.com">')
140-
expect(result).toContain("# Example Content\n\nThis is the content.")
141-
expect(result).toContain("</url_content>")
139+
expect(result.text).toContain('<url_content url="https://example.com">')
140+
expect(result.text).toContain("# Example Content\n\nThis is the content.")
141+
expect(result.text).toContain("</url_content>")
142142
})
143143

144144
it("should handle multiple URLs with mixed success and failure", async () => {
@@ -152,9 +152,9 @@ describe("parseMentions - URL error handling", () => {
152152
mockUrlContentFetcher,
153153
)
154154

155-
expect(result).toContain('<url_content url="https://example1.com">')
156-
expect(result).toContain("# First Site")
157-
expect(result).toContain('<url_content url="https://example2.com">')
158-
expect(result).toContain("Error fetching content: timeout")
155+
expect(result.text).toContain('<url_content url="https://example1.com">')
156+
expect(result.text).toContain("# First Site")
157+
expect(result.text).toContain('<url_content url="https://example2.com">')
158+
expect(result.text).toContain("Error fetching content: timeout")
159159
})
160160
})

src/core/mentions/__tests__/processUserContentMentions.spec.ts

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ describe("processUserContentMentions", () => {
2323
mockFileContextTracker = {} as FileContextTracker
2424
mockRooIgnoreController = {}
2525

26-
// Default mock implementation
27-
vi.mocked(parseMentions).mockImplementation(async (text) => `parsed: ${text}`)
26+
// Default mock implementation - parseMentions now returns an object with text and optional pdfAttachments
27+
vi.mocked(parseMentions).mockImplementation(async (text) => ({ text: `parsed: ${text}` }))
2828
})
2929

3030
describe("maxReadFileLine parameter", () => {
@@ -55,6 +55,7 @@ describe("processUserContentMentions", () => {
5555
true, // includeDiagnosticMessages
5656
50, // maxDiagnosticMessages
5757
100,
58+
true, // enablePdfMultimodal
5859
)
5960
})
6061

@@ -84,6 +85,7 @@ describe("processUserContentMentions", () => {
8485
true, // includeDiagnosticMessages
8586
50, // maxDiagnosticMessages
8687
undefined,
88+
true, // enablePdfMultimodal
8789
)
8890
})
8991

@@ -114,6 +116,7 @@ describe("processUserContentMentions", () => {
114116
true, // includeDiagnosticMessages
115117
50, // maxDiagnosticMessages
116118
-1,
119+
true, // enablePdfMultimodal
117120
)
118121
})
119122
})
@@ -135,7 +138,7 @@ describe("processUserContentMentions", () => {
135138
})
136139

137140
expect(parseMentions).toHaveBeenCalled()
138-
expect(result[0]).toEqual({
141+
expect(result.content[0]).toEqual({
139142
type: "text",
140143
text: "parsed: <task>Do something</task>",
141144
})
@@ -157,7 +160,7 @@ describe("processUserContentMentions", () => {
157160
})
158161

159162
expect(parseMentions).toHaveBeenCalled()
160-
expect(result[0]).toEqual({
163+
expect(result.content[0]).toEqual({
161164
type: "text",
162165
text: "parsed: <feedback>Fix this issue</feedback>",
163166
})
@@ -179,7 +182,7 @@ describe("processUserContentMentions", () => {
179182
})
180183

181184
expect(parseMentions).not.toHaveBeenCalled()
182-
expect(result[0]).toEqual(userContent[0])
185+
expect(result.content[0]).toEqual(userContent[0])
183186
})
184187

185188
it("should process tool_result blocks with string content", async () => {
@@ -199,7 +202,7 @@ describe("processUserContentMentions", () => {
199202
})
200203

201204
expect(parseMentions).toHaveBeenCalled()
202-
expect(result[0]).toEqual({
205+
expect(result.content[0]).toEqual({
203206
type: "tool_result",
204207
tool_use_id: "123",
205208
content: "parsed: <feedback>Tool feedback</feedback>",
@@ -232,7 +235,7 @@ describe("processUserContentMentions", () => {
232235
})
233236

234237
expect(parseMentions).toHaveBeenCalledTimes(1)
235-
expect(result[0]).toEqual({
238+
expect(result.content[0]).toEqual({
236239
type: "tool_result",
237240
tool_use_id: "123",
238241
content: [
@@ -278,13 +281,13 @@ describe("processUserContentMentions", () => {
278281
})
279282

280283
expect(parseMentions).toHaveBeenCalledTimes(2)
281-
expect(result).toHaveLength(3)
282-
expect(result[0]).toEqual({
284+
expect(result.content).toHaveLength(3)
285+
expect(result.content[0]).toEqual({
283286
type: "text",
284287
text: "parsed: <task>First task</task>",
285288
})
286-
expect(result[1]).toEqual(userContent[1]) // Image block unchanged
287-
expect(result[2]).toEqual({
289+
expect(result.content[1]).toEqual(userContent[1]) // Image block unchanged
290+
expect(result.content[2]).toEqual({
288291
type: "tool_result",
289292
tool_use_id: "456",
290293
content: "parsed: <feedback>Feedback</feedback>",
@@ -318,6 +321,7 @@ describe("processUserContentMentions", () => {
318321
true, // includeDiagnosticMessages
319322
50, // maxDiagnosticMessages
320323
undefined,
324+
true, // enablePdfMultimodal
321325
)
322326
})
323327

@@ -347,6 +351,7 @@ describe("processUserContentMentions", () => {
347351
true, // includeDiagnosticMessages
348352
50, // maxDiagnosticMessages
349353
undefined,
354+
true, // enablePdfMultimodal
350355
)
351356
})
352357
})

src/core/mentions/index.ts

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { getCommitInfo, getWorkingState } from "../../utils/git"
1010
import { getWorkspacePath } from "../../utils/path"
1111

1212
import { openFile } from "../../integrations/misc/open-file"
13-
import { extractTextFromFile } from "../../integrations/misc/extract-text"
13+
import { extractTextFromFile, supportsMultimodalAnalysis } from "../../integrations/misc/extract-text"
1414
import { diagnosticsToProblemsString } from "../../integrations/diagnostics"
1515

1616
import { UrlContentFetcher } from "../../services/browser/UrlContentFetcher"
@@ -87,9 +87,11 @@ export async function parseMentions(
8787
includeDiagnosticMessages: boolean = true,
8888
maxDiagnosticMessages: number = 50,
8989
maxReadFileLine?: number,
90-
): Promise<string> {
90+
enablePdfMultimodal: boolean = true,
91+
): Promise<{ text: string; pdfAttachments?: Array<{ path: string; data: any }> }> {
9192
const mentions: Set<string> = new Set()
9293
const validCommands: Map<string, Command> = new Map()
94+
const pdfAttachments: Array<{ path: string; data: any }> = []
9395

9496
// First pass: check which command mentions exist and cache the results
9597
const commandMatches = Array.from(text.matchAll(commandRegexGlobal))
@@ -188,20 +190,30 @@ export async function parseMentions(
188190
} else if (mention.startsWith("/")) {
189191
const mentionPath = mention.slice(1)
190192
try {
191-
const content = await getFileOrFolderContent(
193+
const result = await getFileOrFolderContent(
192194
mentionPath,
193195
cwd,
194196
rooIgnoreController,
195197
showRooIgnoredFiles,
196198
maxReadFileLine,
199+
enablePdfMultimodal,
197200
)
198-
if (mention.endsWith("/")) {
199-
parsedText += `\n\n<folder_content path="${mentionPath}">\n${content}\n</folder_content>`
201+
202+
// Check if this is a PDF with multimodal content
203+
if (result.pdfData) {
204+
pdfAttachments.push({
205+
path: mentionPath,
206+
data: result.pdfData,
207+
})
208+
parsedText += `\n\n<file_content path="${mentionPath}">\n[PDF file attached for multimodal analysis - contains visual elements like charts, diagrams, and tables]\n${result.content}\n</file_content>`
209+
} else if (mention.endsWith("/")) {
210+
parsedText += `\n\n<folder_content path="${mentionPath}">\n${result.content}\n</folder_content>`
200211
} else {
201-
parsedText += `\n\n<file_content path="${mentionPath}">\n${content}\n</file_content>`
202-
if (fileContextTracker) {
203-
await fileContextTracker.trackFileContext(mentionPath, "file_mentioned")
204-
}
212+
parsedText += `\n\n<file_content path="${mentionPath}">\n${result.content}\n</file_content>`
213+
}
214+
215+
if (fileContextTracker && !mention.endsWith("/")) {
216+
await fileContextTracker.trackFileContext(mentionPath, "file_mentioned")
205217
}
206218
} catch (error) {
207219
if (mention.endsWith("/")) {
@@ -263,7 +275,7 @@ export async function parseMentions(
263275
}
264276
}
265277

266-
return parsedText
278+
return { text: parsedText, pdfAttachments: pdfAttachments.length > 0 ? pdfAttachments : undefined }
267279
}
268280

269281
async function getFileOrFolderContent(
@@ -272,7 +284,8 @@ async function getFileOrFolderContent(
272284
rooIgnoreController?: any,
273285
showRooIgnoredFiles: boolean = true,
274286
maxReadFileLine?: number,
275-
): Promise<string> {
287+
enablePdfMultimodal: boolean = true,
288+
): Promise<{ content: string; pdfData?: any }> {
276289
const unescapedPath = unescapeSpaces(mentionPath)
277290
const absPath = path.resolve(cwd, unescapedPath)
278291

@@ -281,13 +294,25 @@ async function getFileOrFolderContent(
281294

282295
if (stats.isFile()) {
283296
if (rooIgnoreController && !rooIgnoreController.validateAccess(absPath)) {
284-
return `(File ${mentionPath} is ignored by .rooignore)`
297+
return { content: `(File ${mentionPath} is ignored by .rooignore)` }
285298
}
286299
try {
287-
const content = await extractTextFromFile(absPath, maxReadFileLine)
288-
return content
300+
// Check if this is a PDF and multimodal is enabled
301+
const fileExtension = path.extname(absPath).toLowerCase()
302+
if (enablePdfMultimodal && supportsMultimodalAnalysis(fileExtension)) {
303+
// Get both text and multimodal content for PDFs
304+
const textContent = (await extractTextFromFile(absPath, maxReadFileLine, false)) as string
305+
const pdfData = await extractTextFromFile(absPath, maxReadFileLine, true)
306+
return {
307+
content: textContent,
308+
pdfData: pdfData,
309+
}
310+
} else {
311+
const content = (await extractTextFromFile(absPath, maxReadFileLine, false)) as string
312+
return { content }
313+
}
289314
} catch (error) {
290-
return `(Failed to read contents of ${mentionPath}): ${error.message}`
315+
return { content: `(Failed to read contents of ${mentionPath}): ${error.message}` }
291316
}
292317
} else if (stats.isDirectory()) {
293318
const entries = await fs.readdir(absPath, { withFileTypes: true })
@@ -339,9 +364,9 @@ async function getFileOrFolderContent(
339364
}
340365
}
341366
const fileContents = (await Promise.all(fileContentPromises)).filter((content) => content)
342-
return `${folderContent}\n${fileContents.join("\n\n")}`.trim()
367+
return { content: `${folderContent}\n${fileContents.join("\n\n")}`.trim() }
343368
} else {
344-
return `(Failed to read contents of ${mentionPath})`
369+
return { content: `(Failed to read contents of ${mentionPath})` }
345370
}
346371
} catch (error) {
347372
throw new Error(`Failed to access path "${mentionPath}": ${error.message}`)

0 commit comments

Comments
 (0)