
Commit c58b1e8

fix: image tooling
1 parent 96bf3e0 commit c58b1e8

File tree

13 files changed: +1533 -238 lines changed

apple/Clarissa/Sources/Tools/ImageAnalysisTool.swift

Lines changed: 18 additions & 61 deletions
@@ -7,11 +7,14 @@ import UIKit
 import AppKit
 #endif
 
-/// Tool for analyzing images and PDFs using Apple's Vision and PDFKit frameworks
-/// Supports OCR, image classification, face detection, document detection, and PDF text extraction
+/// Tool for analyzing images and PDFs using Apple's Vision and PDFKit frameworks.
+/// NOTE: For user-attached images, pre-processing happens BEFORE the LLM call
+/// via ImagePreProcessor. This tool is for targeted follow-up operations on
+/// files referenced by URL (e.g., "get face coordinates" or "extract page 5").
+/// Only file:// URLs are supported to stay within context limits.
 final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
     let name = "image_analysis"
-    let description = "Analyze images or PDFs for text (OCR), objects, faces, or documents. For PDFs: extract text or OCR scanned pages. Provide data as base64 or file URL."
+    let description = "Perform targeted analysis on images or PDFs via file URL. Actions: 'ocr' (text), 'classify' (objects), 'detect_faces', 'detect_document', 'pdf_extract_text', 'pdf_ocr', 'pdf_page_count'. Only file:// URLs supported."
     let priority = ToolPriority.extended
 
     /// Maximum characters to return from PDF text extraction
@@ -29,21 +32,13 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
                 "enum": ["ocr", "classify", "detect_faces", "detect_document", "pdf_extract_text", "pdf_ocr", "pdf_page_count"],
                 "description": "Analysis type: 'ocr' for image text, 'classify' for objects, 'detect_faces' for faces, 'detect_document' for boundaries, 'pdf_extract_text' for searchable PDFs, 'pdf_ocr' for scanned PDFs, 'pdf_page_count' for page count"
             ],
-            "imageBase64": [
-                "type": "string",
-                "description": "Base64-encoded image data"
-            ],
             "imageURL": [
                 "type": "string",
-                "description": "File URL to the image (file:// scheme)"
-            ],
-            "pdfBase64": [
-                "type": "string",
-                "description": "Base64-encoded PDF data"
+                "description": "File URL to the image (file:// scheme only)"
             ],
             "pdfURL": [
                 "type": "string",
-                "description": "File URL to the PDF (file:// scheme)"
+                "description": "File URL to the PDF (file:// scheme only)"
             ],
             "pageRange": [
                 "type": "string",
@@ -63,17 +58,13 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
 
         let pageRange = args["pageRange"] as? String
 
-        // Handle PDF actions
+        // Handle PDF actions (file URLs only)
         if action.hasPrefix("pdf_") {
-            let pdfDocument: PDFDocument
-            if let base64 = args["pdfBase64"] as? String {
-                pdfDocument = try pdfFromBase64(base64)
-            } else if let urlString = args["pdfURL"] as? String,
-                      let url = URL(string: urlString) {
-                pdfDocument = try pdfFromURL(url)
-            } else {
-                throw ToolError.invalidArguments("PDF actions require 'pdfBase64' or 'pdfURL'")
+            guard let urlString = args["pdfURL"] as? String,
+                  let url = URL(string: urlString) else {
+                throw ToolError.invalidArguments("PDF actions require 'pdfURL' (file:// URL)")
             }
+            let pdfDocument = try pdfFromURL(url)
 
             switch action {
             case "pdf_extract_text":
@@ -87,16 +78,12 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
             }
         }
 
-        // Handle image actions
-        let cgImage: CGImage
-        if let base64 = args["imageBase64"] as? String {
-            cgImage = try imageFromBase64(base64)
-        } else if let urlString = args["imageURL"] as? String,
-                  let url = URL(string: urlString) {
-            cgImage = try imageFromURL(url)
-        } else {
-            throw ToolError.invalidArguments("Image actions require 'imageBase64' or 'imageURL'")
+        // Handle image actions (file URLs only)
+        guard let urlString = args["imageURL"] as? String,
+              let url = URL(string: urlString) else {
+            throw ToolError.invalidArguments("Image actions require 'imageURL' (file:// URL)")
         }
+        let cgImage = try imageFromURL(url)
 
         switch action {
         case "ocr":
@@ -114,21 +101,6 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
 
     // MARK: - Image Loading
 
-    private func imageFromBase64(_ base64: String) throws -> CGImage {
-        // Remove data URL prefix if present
-        let cleanBase64 = base64.replacingOccurrences(
-            of: "^data:image/[^;]+;base64,",
-            with: "",
-            options: .regularExpression
-        )
-
-        guard let imageData = Data(base64Encoded: cleanBase64, options: .ignoreUnknownCharacters) else {
-            throw ToolError.invalidArguments("Invalid base64 image data")
-        }
-
-        return try cgImageFromData(imageData)
-    }
-
     private func imageFromURL(_ url: URL) throws -> CGImage {
         guard url.isFileURL else {
             throw ToolError.invalidArguments("Only file:// URLs are supported")
@@ -251,21 +223,6 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
 
     // MARK: - PDF Loading
 
-    private func pdfFromBase64(_ base64: String) throws -> PDFDocument {
-        // Remove data URL prefix if present
-        let cleanBase64 = base64.replacingOccurrences(
-            of: "^data:application/pdf;base64,",
-            with: "",
-            options: .regularExpression
-        )
-
-        guard let pdfData = Data(base64Encoded: cleanBase64, options: .ignoreUnknownCharacters),
-              let document = PDFDocument(data: pdfData) else {
-            throw ToolError.invalidArguments("Invalid base64 PDF data")
-        }
-        return document
-    }
-
    private func pdfFromURL(_ url: URL) throws -> PDFDocument {
         guard url.isFileURL else {
             throw ToolError.invalidArguments("Only file:// URLs are supported for PDFs")
Lines changed: 267 additions & 0 deletions
@@ -0,0 +1,267 @@
import Foundation
import Vision
import PDFKit
#if canImport(UIKit)
import UIKit
#elseif canImport(AppKit)
import AppKit
#endif

/// Pre-processes images and PDFs BEFORE involving the LLM.
/// This is critical for Apple Foundation Models which have a 4,096 token limit.
/// Instead of passing base64 data (~100KB+ = 25,000+ tokens), we extract text
/// and metadata (~500 chars = ~150 tokens) to stay within context limits.
///
/// Usage:
///     let processor = ImagePreProcessor()
///     let result = await processor.process(imageData: data)
///     // result.contextString contains ~500 chars of extracted text/metadata
///     // Pass this to the LLM, NOT the base64 data
final class ImagePreProcessor: Sendable {

    /// Result of pre-processing an image or PDF
    struct ProcessingResult: Sendable {
        let extractedText: String
        let classifications: [String]
        let faceCount: Int
        let hasDocument: Bool
        let pageCount: Int // For PDFs
        let error: String?

        /// Compact context string for the LLM (typically 200-500 chars)
        var contextString: String {
            var parts: [String] = []

            if let error = error {
                return "[Image processing error: \(error)]"
            }

            if pageCount > 0 {
                parts.append("PDF with \(pageCount) page\(pageCount == 1 ? "" : "s")")
            }

            if !extractedText.isEmpty {
                // Truncate text to keep context reasonable
                let maxTextLength = 1500
                let truncatedText = extractedText.count > maxTextLength
                    ? String(extractedText.prefix(maxTextLength)) + "..."
                    : extractedText
                parts.append("Text content: \(truncatedText)")
            }

            if !classifications.isEmpty {
                parts.append("Contains: \(classifications.joined(separator: ", "))")
            }

            if faceCount > 0 {
                parts.append("Faces detected: \(faceCount)")
            }

            if hasDocument {
                parts.append("Document detected in image")
            }

            if parts.isEmpty {
                return "[Image analyzed - no text or notable content detected]"
            }

            return "[Image Analysis]\n\(parts.joined(separator: "\n"))"
        }
    }

    // MARK: - Public Methods

    /// Process image data and extract text/metadata
    func process(imageData: Data) async -> ProcessingResult {
        guard let cgImage = cgImageFromData(imageData) else {
            return ProcessingResult(
                extractedText: "",
                classifications: [],
                faceCount: 0,
                hasDocument: false,
                pageCount: 0,
                error: "Failed to decode image"
            )
        }

        return await processImage(cgImage)
    }

    /// Process a PDF and extract text/metadata
    func process(pdfData: Data) async -> ProcessingResult {
        guard let document = PDFDocument(data: pdfData) else {
            return ProcessingResult(
                extractedText: "",
                classifications: [],
                faceCount: 0,
                hasDocument: false,
                pageCount: 0,
                error: "Failed to decode PDF"
            )
        }

        return await processPDF(document)
    }

    // MARK: - Image Processing

    private func processImage(_ cgImage: CGImage) async -> ProcessingResult {
        // Run OCR, classification, and face detection in parallel
        async let ocrResult = performOCR(on: cgImage)
        async let classifyResult = performClassification(on: cgImage)
        async let faceResult = detectFaces(in: cgImage)
        async let docResult = detectDocument(in: cgImage)

        let (text, classifications, faceCount, hasDocument) = await (
            ocrResult, classifyResult, faceResult, docResult
        )

        return ProcessingResult(
            extractedText: text,
            classifications: classifications,
            faceCount: faceCount,
            hasDocument: hasDocument,
            pageCount: 0,
            error: nil
        )
    }

    private func performOCR(on image: CGImage) async -> String {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.usesLanguageCorrection = true

        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            guard let observations = request.results else { return "" }
            return observations.compactMap { $0.topCandidates(1).first?.string }.joined(separator: "\n")
        } catch {
            return ""
        }
    }

    private func performClassification(on image: CGImage) async -> [String] {
        let request = VNClassifyImageRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            guard let observations = request.results else { return [] }
            // Return top 5 with confidence > 20%
            return observations
                .filter { $0.confidence > 0.2 }
                .prefix(5)
                .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }
        } catch {
            return []
        }
    }

    private func detectFaces(in image: CGImage) async -> Int {
        let request = VNDetectFaceRectanglesRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            return request.results?.count ?? 0
        } catch {
            return 0
        }
    }

    private func detectDocument(in image: CGImage) async -> Bool {
        let request = VNDetectDocumentSegmentationRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            return request.results?.first != nil
        } catch {
            return false
        }
    }

    // MARK: - PDF Processing

    private func processPDF(_ document: PDFDocument) async -> ProcessingResult {
        let pageCount = document.pageCount
        var allText = ""
        let maxPagesToProcess = min(pageCount, 5) // Limit to first 5 pages

        for i in 0..<maxPagesToProcess {
            guard let page = document.page(at: i) else { continue }

            // First try native text extraction
            if let pageText = page.string, !pageText.isEmpty {
                allText += "Page \(i + 1): \(pageText)\n"
            } else {
                // Fall back to OCR for scanned PDFs
                if let cgImage = renderPageToImage(page) {
                    let ocrText = await performOCR(on: cgImage)
                    if !ocrText.isEmpty {
                        allText += "Page \(i + 1) (OCR): \(ocrText)\n"
                    }
                }
            }
        }

        return ProcessingResult(
            extractedText: allText,
            classifications: [],
            faceCount: 0,
            hasDocument: true,
            pageCount: pageCount,
            error: nil
        )
    }

    private func renderPageToImage(_ page: PDFPage) -> CGImage? {
        let pageRect = page.bounds(for: .mediaBox)
        let scale: CGFloat = 2.0 // 2x for better OCR

        #if canImport(UIKit)
        let renderer = UIGraphicsImageRenderer(size: CGSize(
            width: pageRect.width * scale,
            height: pageRect.height * scale
        ))
        let image = renderer.image { context in
            UIColor.white.setFill()
            context.fill(CGRect(origin: .zero, size: renderer.format.bounds.size))
            context.cgContext.scaleBy(x: scale, y: scale)
            page.draw(with: .mediaBox, to: context.cgContext)
        }
        return image.cgImage
        #elseif canImport(AppKit)
        let image = NSImage(size: NSSize(
            width: pageRect.width * scale,
            height: pageRect.height * scale
        ))
        image.lockFocus()
        NSColor.white.setFill()
        NSRect(origin: .zero, size: image.size).fill()
        if let context = NSGraphicsContext.current?.cgContext {
            context.scaleBy(x: scale, y: scale)
            page.draw(with: .mediaBox, to: context)
        }
        image.unlockFocus()
        return image.cgImage(forProposedRect: nil, context: nil, hints: nil)
        #endif
    }

    // MARK: - Image Decoding

    private func cgImageFromData(_ data: Data) -> CGImage? {
        #if canImport(UIKit)
        guard let uiImage = UIImage(data: data),
              let cgImage = uiImage.cgImage else { return nil }
        return cgImage
        #elseif canImport(AppKit)
        guard let nsImage = NSImage(data: data),
              let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else { return nil }
        return cgImage
        #endif
    }
}
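A minimal sketch of where ImagePreProcessor is meant to sit in the pipeline, assuming a hypothetical prepareAttachmentPrompt helper and a sendToModel closure standing in for the real chat pipeline (neither is part of this commit): only the compact contextString is forwarded to the model, never the raw image bytes.

import Foundation

// Hypothetical integration sketch: run ImagePreProcessor before the LLM call and
// send only result.contextString (a few hundred characters) instead of raw data,
// keeping the request well under the 4,096-token context of Apple Foundation Models.
func prepareAttachmentPrompt(
    imageData: Data,
    userPrompt: String,
    sendToModel: (String) async -> Void
) async {
    let processor = ImagePreProcessor()
    let result = await processor.process(imageData: imageData)
    let prompt = "\(result.contextString)\n\n\(userPrompt)"
    await sendToModel(prompt)
}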
