import Foundation
import Vision
import PDFKit
#if canImport(UIKit)
import UIKit
#elseif canImport(AppKit)
import AppKit
#endif

/// Pre-processes images and PDFs BEFORE involving the LLM.
/// This is critical for Apple Foundation Models, which have a 4,096-token context
/// window. Instead of passing base64 data (100 KB of base64 is 25,000+ tokens),
/// we extract text and metadata (typically a few hundred characters, so a few
/// hundred tokens at most) to stay within the context limit.
///
/// Usage:
///
///     let processor = ImagePreProcessor()
///     let result = await processor.process(imageData: data)
///     // result.contextString holds a compact text/metadata summary.
///     // Pass that to the LLM, NOT the base64 data.
final class ImagePreProcessor: Sendable {

    /// Result of pre-processing an image or PDF
    struct ProcessingResult: Sendable {
        let extractedText: String
        let classifications: [String]
        let faceCount: Int
        let hasDocument: Bool
        let pageCount: Int // Number of pages for PDFs; 0 for images
        let error: String?

        /// Compact context string for the LLM. Typically a few hundred characters;
        /// extracted text is capped at 1,500 characters before inclusion.
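        ///
        /// Illustrative output (the values here are invented):
        ///
        ///     [Image Analysis]
        ///     Text content: Meeting agenda for March ...
        ///     Contains: document, paper
        ///     Document detected in image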
        var contextString: String {
            if let error {
                return "[Image processing error: \(error)]"
            }

            var parts: [String] = []

            if pageCount > 0 {
                parts.append("PDF with \(pageCount) page\(pageCount == 1 ? "" : "s")")
            }

            if !extractedText.isEmpty {
                // Truncate text to keep the context compact
                let maxTextLength = 1500
                let truncatedText = extractedText.count > maxTextLength
                    ? String(extractedText.prefix(maxTextLength)) + "..."
                    : extractedText
                parts.append("Text content: \(truncatedText)")
            }

            if !classifications.isEmpty {
                parts.append("Contains: \(classifications.joined(separator: ", "))")
            }

            if faceCount > 0 {
                parts.append("Faces detected: \(faceCount)")
            }

            if hasDocument {
                parts.append("Document detected in image")
            }

            if parts.isEmpty {
                return "[Image analyzed - no text or notable content detected]"
            }

            return "[Image Analysis]\n\(parts.joined(separator: "\n"))"
        }
    }

    // MARK: - Public Methods

    /// Process image data and extract text/metadata.
    func process(imageData: Data) async -> ProcessingResult {
        guard let cgImage = cgImageFromData(imageData) else {
            return ProcessingResult(
                extractedText: "",
                classifications: [],
                faceCount: 0,
                hasDocument: false,
                pageCount: 0,
                error: "Failed to decode image"
            )
        }

        return await processImage(cgImage)
    }

    /// Process a PDF and extract text/metadata.
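    ///
    /// Example (assumes a `pdfURL` the caller already has):
    ///
    ///     let data = try Data(contentsOf: pdfURL)
    ///     let result = await processor.process(pdfData: data)
    ///     print(result.contextString)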
    func process(pdfData: Data) async -> ProcessingResult {
        guard let document = PDFDocument(data: pdfData) else {
            return ProcessingResult(
                extractedText: "",
                classifications: [],
                faceCount: 0,
                hasDocument: false,
                pageCount: 0,
                error: "Failed to decode PDF"
            )
        }

        return await processPDF(document)
    }

    // MARK: - Image Processing

    private func processImage(_ cgImage: CGImage) async -> ProcessingResult {
        // Run OCR, classification, face detection, and document detection in parallel
        async let ocrResult = performOCR(on: cgImage)
        async let classifyResult = performClassification(on: cgImage)
        async let faceResult = detectFaces(in: cgImage)
        async let docResult = detectDocument(in: cgImage)

        let (text, classifications, faceCount, hasDocument) = await (
            ocrResult, classifyResult, faceResult, docResult
        )

        return ProcessingResult(
            extractedText: text,
            classifications: classifications,
            faceCount: faceCount,
            hasDocument: hasDocument,
            pageCount: 0,
            error: nil
        )
    }
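
    // Note: VNImageRequestHandler.perform(_:) runs synchronously, so each Vision
    // helper below blocks while its request executes. The `async let` bindings in
    // processImage still run the four requests concurrently as child tasks; if that
    // ever pressures the cooperative thread pool, moving each request onto a
    // detached task is one option (an assumption about this app's needs, not a
    // Vision requirement).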

    private func performOCR(on image: CGImage) async -> String {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.usesLanguageCorrection = true
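        // If the document language is known ahead of time, constraining recognition
        // can improve both speed and accuracy (assumption: mostly English input):
        //     request.recognitionLanguages = ["en-US"]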

        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            guard let observations = request.results else { return "" }
            return observations
                .compactMap { $0.topCandidates(1).first?.string }
                .joined(separator: "\n")
        } catch {
            return ""
        }
    }

    private func performClassification(on image: CGImage) async -> [String] {
        let request = VNClassifyImageRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            guard let observations = request.results else { return [] }
            // Return the top 5 labels with confidence > 20%
            return observations
                .filter { $0.confidence > 0.2 }
                .prefix(5)
                .map { $0.identifier.replacingOccurrences(of: "_", with: " ") }
        } catch {
            return []
        }
    }

    private func detectFaces(in image: CGImage) async -> Int {
        let request = VNDetectFaceRectanglesRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            return request.results?.count ?? 0
        } catch {
            return 0
        }
    }

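    /// Uses `VNDetectDocumentSegmentationRequest` (available on iOS 15 / macOS 12
    /// and later) to check whether the image contains a document-like region.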
    private func detectDocument(in image: CGImage) async -> Bool {
        let request = VNDetectDocumentSegmentationRequest()
        let handler = VNImageRequestHandler(cgImage: image, options: [:])

        do {
            try handler.perform([request])
            return request.results?.first != nil
        } catch {
            return false
        }
    }

    // MARK: - PDF Processing

    private func processPDF(_ document: PDFDocument) async -> ProcessingResult {
        let pageCount = document.pageCount
        var allText = ""
        let maxPagesToProcess = min(pageCount, 5) // Limit to the first 5 pages
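        // The 5-page cap, together with contextString's 1,500-character text cap,
        // keeps the summary well inside the model's 4,096-token window.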

        for i in 0..<maxPagesToProcess {
            guard let page = document.page(at: i) else { continue }

            // First try native text extraction
            if let pageText = page.string, !pageText.isEmpty {
                allText += "Page \(i + 1): \(pageText)\n"
            } else if let cgImage = renderPageToImage(page) {
                // Fall back to OCR for scanned PDFs
                let ocrText = await performOCR(on: cgImage)
                if !ocrText.isEmpty {
                    allText += "Page \(i + 1) (OCR): \(ocrText)\n"
                }
            }
        }

        return ProcessingResult(
            extractedText: allText,
            classifications: [],
            faceCount: 0,
            hasDocument: true,
            pageCount: pageCount,
            error: nil
        )
    }

    private func renderPageToImage(_ page: PDFPage) -> CGImage? {
        let pageRect = page.bounds(for: .mediaBox)
        let scale: CGFloat = 2.0 // Render at 2x for better OCR on small text
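        // Assumption: pages are of modest size. For very large pages it may be worth
        // capping the rendered pixel dimensions to bound memory use.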

        #if canImport(UIKit)
        let renderer = UIGraphicsImageRenderer(size: CGSize(
            width: pageRect.width * scale,
            height: pageRect.height * scale
        ))
        let image = renderer.image { context in
            UIColor.white.setFill()
            context.fill(CGRect(origin: .zero, size: renderer.format.bounds.size))
            // PDFPage.draw(with:to:) draws in PDF coordinates (bottom-left origin),
            // so flip the UIKit context (top-left origin) before drawing.
            context.cgContext.translateBy(x: 0, y: pageRect.height * scale)
            context.cgContext.scaleBy(x: scale, y: -scale)
            page.draw(with: .mediaBox, to: context.cgContext)
        }
        return image.cgImage
        #elseif canImport(AppKit)
        let image = NSImage(size: NSSize(
            width: pageRect.width * scale,
            height: pageRect.height * scale
        ))
        image.lockFocus()
        NSColor.white.setFill()
        NSRect(origin: .zero, size: image.size).fill()
        if let context = NSGraphicsContext.current?.cgContext {
            context.scaleBy(x: scale, y: scale)
            page.draw(with: .mediaBox, to: context)
        }
        image.unlockFocus()
        return image.cgImage(forProposedRect: nil, context: nil, hints: nil)
        #endif
    }

    // MARK: - Image Decoding

    private func cgImageFromData(_ data: Data) -> CGImage? {
        #if canImport(UIKit)
        return UIImage(data: data)?.cgImage
        #elseif canImport(AppKit)
        guard let nsImage = NSImage(data: data) else { return nil }
        return nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil)
        #endif
    }
}
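
// A minimal integration sketch (assumption: the caller routes attachments by type
// before prompting the model; nothing below is required by this class):
//
//     func contextForAttachment(_ data: Data, isPDF: Bool) async -> String {
//         let processor = ImagePreProcessor()
//         let result = isPDF
//             ? await processor.process(pdfData: data)
//             : await processor.process(imageData: data)
//         return result.contextString
//     }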