
Commit c58b1e8

fix: image tooling
1 parent 96bf3e0 commit c58b1e8

File tree

13 files changed: +1533 -238 lines changed

apple/Clarissa/Sources/Tools/ImageAnalysisTool.swift

Lines changed: 18 additions & 61 deletions
@@ -7,11 +7,14 @@ import UIKit
 import AppKit
 #endif
 
-/// Tool for analyzing images and PDFs using Apple's Vision and PDFKit frameworks
-/// Supports OCR, image classification, face detection, document detection, and PDF text extraction
+/// Tool for analyzing images and PDFs using Apple's Vision and PDFKit frameworks.
+/// NOTE: For user-attached images, pre-processing happens BEFORE the LLM call
+/// via ImagePreProcessor. This tool is for targeted follow-up operations on
+/// files referenced by URL (e.g., "get face coordinates" or "extract page 5").
+/// Only file:// URLs are supported to stay within context limits.
 final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
     let name = "image_analysis"
-    let description = "Analyze images or PDFs for text (OCR), objects, faces, or documents. For PDFs: extract text or OCR scanned pages. Provide data as base64 or file URL."
+    let description = "Perform targeted analysis on images or PDFs via file URL. Actions: 'ocr' (text), 'classify' (objects), 'detect_faces', 'detect_document', 'pdf_extract_text', 'pdf_ocr', 'pdf_page_count'. Only file:// URLs supported."
     let priority = ToolPriority.extended
 
     /// Maximum characters to return from PDF text extraction
@@ -29,21 +32,13 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
                 "enum": ["ocr", "classify", "detect_faces", "detect_document", "pdf_extract_text", "pdf_ocr", "pdf_page_count"],
                 "description": "Analysis type: 'ocr' for image text, 'classify' for objects, 'detect_faces' for faces, 'detect_document' for boundaries, 'pdf_extract_text' for searchable PDFs, 'pdf_ocr' for scanned PDFs, 'pdf_page_count' for page count"
             ],
-            "imageBase64": [
-                "type": "string",
-                "description": "Base64-encoded image data"
-            ],
             "imageURL": [
                 "type": "string",
-                "description": "File URL to the image (file:// scheme)"
-            ],
-            "pdfBase64": [
-                "type": "string",
-                "description": "Base64-encoded PDF data"
+                "description": "File URL to the image (file:// scheme only)"
             ],
             "pdfURL": [
                 "type": "string",
-                "description": "File URL to the PDF (file:// scheme)"
+                "description": "File URL to the PDF (file:// scheme only)"
             ],
             "pageRange": [
                 "type": "string",
@@ -63,17 +58,13 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
 
         let pageRange = args["pageRange"] as? String
 
-        // Handle PDF actions
+        // Handle PDF actions (file URLs only)
         if action.hasPrefix("pdf_") {
-            let pdfDocument: PDFDocument
-            if let base64 = args["pdfBase64"] as? String {
-                pdfDocument = try pdfFromBase64(base64)
-            } else if let urlString = args["pdfURL"] as? String,
-                      let url = URL(string: urlString) {
-                pdfDocument = try pdfFromURL(url)
-            } else {
-                throw ToolError.invalidArguments("PDF actions require 'pdfBase64' or 'pdfURL'")
+            guard let urlString = args["pdfURL"] as? String,
+                  let url = URL(string: urlString) else {
+                throw ToolError.invalidArguments("PDF actions require 'pdfURL' (file:// URL)")
             }
+            let pdfDocument = try pdfFromURL(url)
 
             switch action {
             case "pdf_extract_text":
@@ -87,16 +78,12 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
             }
         }
 
-        // Handle image actions
-        let cgImage: CGImage
-        if let base64 = args["imageBase64"] as? String {
-            cgImage = try imageFromBase64(base64)
-        } else if let urlString = args["imageURL"] as? String,
-                  let url = URL(string: urlString) {
-            cgImage = try imageFromURL(url)
-        } else {
-            throw ToolError.invalidArguments("Image actions require 'imageBase64' or 'imageURL'")
+        // Handle image actions (file URLs only)
+        guard let urlString = args["imageURL"] as? String,
+              let url = URL(string: urlString) else {
+            throw ToolError.invalidArguments("Image actions require 'imageURL' (file:// URL)")
         }
+        let cgImage = try imageFromURL(url)
 
         switch action {
         case "ocr":
@@ -114,21 +101,6 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
 
     // MARK: - Image Loading
 
-    private func imageFromBase64(_ base64: String) throws -> CGImage {
-        // Remove data URL prefix if present
-        let cleanBase64 = base64.replacingOccurrences(
-            of: "^data:image/[^;]+;base64,",
-            with: "",
-            options: .regularExpression
-        )
-
-        guard let imageData = Data(base64Encoded: cleanBase64, options: .ignoreUnknownCharacters) else {
-            throw ToolError.invalidArguments("Invalid base64 image data")
-        }
-
-        return try cgImageFromData(imageData)
-    }
-
     private func imageFromURL(_ url: URL) throws -> CGImage {
         guard url.isFileURL else {
             throw ToolError.invalidArguments("Only file:// URLs are supported")
@@ -251,21 +223,6 @@ final class ImageAnalysisTool: ClarissaTool, @unchecked Sendable {
 
     // MARK: - PDF Loading
 
-    private func pdfFromBase64(_ base64: String) throws -> PDFDocument {
-        // Remove data URL prefix if present
-        let cleanBase64 = base64.replacingOccurrences(
-            of: "^data:application/pdf;base64,",
-            with: "",
-            options: .regularExpression
-        )
-
-        guard let pdfData = Data(base64Encoded: cleanBase64, options: .ignoreUnknownCharacters),
-              let document = PDFDocument(data: pdfData) else {
-            throw ToolError.invalidArguments("Invalid base64 PDF data")
-        }
-        return document
-    }
-
    private func pdfFromURL(_ url: URL) throws -> PDFDocument {
         guard url.isFileURL else {
             throw ToolError.invalidArguments("Only file:// URLs are supported for PDFs")
Lines changed: 267 additions & 0 deletions
@@ -0,0 +1,267 @@
import Foundation
import Vision
import PDFKit
#if canImport(UIKit)
import UIKit
#elseif canImport(AppKit)
import AppKit
#endif

/// Pre-processes images and PDFs BEFORE involving the LLM.
/// This is critical for Apple Foundation Models which have a 4,096 token limit.
/// Instead of passing base64 data (~100KB+ = 25,000+ tokens), we extract text
/// and metadata (~500 chars = ~150 tokens) to stay within context limits.
///
/// Usage:
///     let processor = ImagePreProcessor()
///     let result = await processor.process(imageData: data)
///     // result.contextString contains ~500 chars of extracted text/metadata
///     // Pass this to the LLM, NOT the base64 data
final class ImagePreProcessor: Sendable {

    /// Result of pre-processing an image or PDF
    struct ProcessingResult: Sendable {
        let extractedText: String
        let classifications: [String]
        let faceCount: Int
        let hasDocument: Bool
        let pageCount: Int // For PDFs
        let error: String?

        /// Compact context string for the LLM (typically 200-500 chars)
        var contextString: String {
            var parts: [String] = []

            if let error = error {
                return "[Image processing error: \(error)]"
            }

            if pageCount > 0 {
                parts.append("PDF with \(pageCount) page\(pageCount == 1 ? "" : "s")")
            }

            if !extractedText.isEmpty {
                // Truncate text to keep context reasonable
                let maxTextLength = 1500
                let truncatedText = extractedText.count > maxTextLength
                    ? String(extractedText.prefix(maxTextLength)) + "..."
                    : extractedText
                parts.append("Text content: \(truncatedText)")
            }

            if !classifications.isEmpty {
                parts.append("Contains: \(classifications.joined(separator: ", "))")
            }

            if faceCount > 0 {
                parts.append("Faces detected: \(faceCount)")
            }

            if hasDocument {
                parts.append("Document detected in image")
            }

            if parts.isEmpty {
                return "[Image analyzed - no text or notable content detected]"
            }

            return "[Image Analysis]\n\(parts.joined(separator: "\n"))"
        }
    }

    // MARK: - Public Methods

    /// Process image data and extract text/metadata
    func process(imageData: Data) async -> ProcessingResult {
        guard let cgImage = cgImageFromData(imageData) else {
            return ProcessingResult(
                extractedText: "",
                classifications: [],
                faceCount: 0,
                hasDocument: false,
                pageCount: 0,
                error: "Failed to decode image"
            )
        }

        return await processImage(cgImage)
    }

    /// Process a PDF and extract text/metadata
    func process(pdfData: Data) async -> ProcessingResult {
        guard let document = PDFDocument(data: pdfData) else {
            return ProcessingResult(
                extractedText: "",
                classifications: [],
                faceCount: 0,
                hasDocument: false,
                pageCount: 0,
                error: "Failed to decode PDF"
            )
        }

        return await processPDF(document)
    }

    // MARK: - Image Processing

    private func processImage(_ cgImage: CGImage) async -> ProcessingResult {
        // Run OCR, classification, and face detection in parallel
        async let ocrResult = performOCR(on: cgImage)
        async let classifyResult = performClassification(on: cgImage)
        async let faceResult = detectFaces(in: cgImage)
        async let docResult = detectDocument(in: cgImage)

        let (text, classifications, faceCount, hasDocument) = await (
            ocrResult, classifyResult, faceResult, docResult
        )

        return ProcessingResult(
            extractedText: text,
            classifications: classifications,
            faceCount: faceCount,
            hasDocument: hasDocument,
            pageCount: 0,
            error: nil
        )
    }

    private func performOCR(on image: CGImage) async -> String {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.usesLanguageCorrection = true

        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            guard let observations = request.results else { return "" }
            return observations.compactMap { $0.topCandidates(1).first?.string }.joined(separator: "\n")
        } catch {
            return ""
        }
    }

    private func performClassification(on image: CGImage) async -> [String] {
        let request = VNClassifyImageRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            guard let observations = request.results else { return [] }
            // Return top 5 with confidence > 20%
            return observations
                .filter { $0.confidence > 0.2 }
                .prefix(5)
                .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }
        } catch {
            return []
        }
    }

    private func detectFaces(in image: CGImage) async -> Int {
        let request = VNDetectFaceRectanglesRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            return request.results?.count ?? 0
        } catch {
            return 0
        }
    }

    private func detectDocument(in image: CGImage) async -> Bool {
        let request = VNDetectDocumentSegmentationRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            return request.results?.first != nil
        } catch {
            return false
        }
    }

    // MARK: - PDF Processing

    private func processPDF(_ document: PDFDocument) async -> ProcessingResult {
        let pageCount = document.pageCount
        var allText = ""
        let maxPagesToProcess = min(pageCount, 5) // Limit to first 5 pages

        for i in 0..<maxPagesToProcess {
            guard let page = document.page(at: i) else { continue }

            // First try native text extraction
            if let pageText = page.string, !pageText.isEmpty {
                allText += "Page \(i + 1): \(pageText)\n"
            } else {
                // Fall back to OCR for scanned PDFs
                if let cgImage = renderPageToImage(page) {
                    let ocrText = await performOCR(on: cgImage)
                    if !ocrText.isEmpty {
                        allText += "Page \(i + 1) (OCR): \(ocrText)\n"
                    }
                }
            }
        }

        return ProcessingResult(
            extractedText: allText,
            classifications: [],
            faceCount: 0,
            hasDocument: true,
            pageCount: pageCount,
            error: nil
        )
    }

    private func renderPageToImage(_ page: PDFPage) -> CGImage? {
        let pageRect = page.bounds(for: .mediaBox)
        let scale: CGFloat = 2.0 // 2x for better OCR

        #if canImport(UIKit)
        let renderer = UIGraphicsImageRenderer(size: CGSize(
            width: pageRect.width * scale,
            height: pageRect.height * scale
        ))
        let image = renderer.image { context in
            UIColor.white.setFill()
            context.fill(CGRect(origin: .zero, size: renderer.format.bounds.size))
            context.cgContext.scaleBy(x: scale, y: scale)
            page.draw(with: .mediaBox, to: context.cgContext)
        }
        return image.cgImage
        #elseif canImport(AppKit)
        let image = NSImage(size: NSSize(
            width: pageRect.width * scale,
            height: pageRect.height * scale
        ))
        image.lockFocus()
        NSColor.white.setFill()
        NSRect(origin: .zero, size: image.size).fill()
        if let context = NSGraphicsContext.current?.cgContext {
            context.scaleBy(x: scale, y: scale)
            page.draw(with: .mediaBox, to: context)
        }
        image.unlockFocus()
        return image.cgImage(forProposedRect: nil, context: nil, hints: nil)
        #endif
    }

    // MARK: - Image Decoding

    private func cgImageFromData(_ data: Data) -> CGImage? {
        #if canImport(UIKit)
        guard let uiImage = UIImage(data: data),
              let cgImage = uiImage.cgImage else { return nil }
        return cgImage
        #elseif canImport(AppKit)
        guard let nsImage = NSImage(data: data),
              let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else { return nil }
        return cgImage
        #endif
    }
}
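A minimal sketch of where ImagePreProcessor is meant to sit in the pipeline, assuming a hypothetical prepareAttachmentPrompt helper and a sendToModel closure standing in for the real chat pipeline (neither is part of this commit): only the compact contextString is forwarded to the model, never the raw image bytes.

import Foundation

// Hypothetical integration sketch: run ImagePreProcessor before the LLM call and
// send only result.contextString (a few hundred characters) instead of raw data,
// keeping the request well under the 4,096-token context of Apple Foundation Models.
func prepareAttachmentPrompt(
    imageData: Data,
    userPrompt: String,
    sendToModel: (String) async -> Void
) async {
    let processor = ImagePreProcessor()
    let result = await processor.process(imageData: imageData)
    let prompt = "\(result.contextString)\n\n\(userPrompt)"
    await sendToModel(prompt)
}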
