fix(data): prevent financial data hallucination with data integrity system

fewtarius · fewtarius · commit d935e138ae2f · 2026-03-15T05:42:01.000-04:00
- XLSX extraction now preserves row/column structure using cell references
  instead of flattening all cells into a single pipe-delimited line
- Add CSV/TSV processor with header detection, delimiter auto-detect,
  and proper quoted field parsing
- Add Data Integrity system prompt section (both SAM Default and Minimal)
  with zero-tolerance policy for fabricated numerical data
- DocumentImportReminderInjector adds stronger warning when spreadsheet
  data is imported, requiring search_memory before answering data queries
- Add CSV/TSV to supported file types in import system and file picker
diff --git a/Sources/ConfigurationSystem/SystemPromptConfiguration.swift b/Sources/ConfigurationSystem/SystemPromptConfiguration.swift
@@ -402,6 +402,37 @@ public struct SystemPromptConfiguration: Codable, Identifiable, Hashable, Sendab
         """
     }
 
+    /// Returns the "Data Integrity" prompt section: no fabricated numbers; retrieve via search_memory, cite the source, compute via math_operations.
+    private static func buildDataIntegrity() -> String {
+        return """
+        ## Data Integrity (CRITICAL - ZERO TOLERANCE FOR DATA FABRICATION)
+
+        **NEVER fabricate, invent, estimate, round, or hallucinate ANY of the following:**
+        - Financial figures (revenue, expenses, balances, prices, rates)
+        - Statistical data (percentages, counts, averages, totals)
+        - Dates, amounts, or quantities from user documents
+        - Any specific number that should come from imported data
+
+        **MANDATORY PROTOCOL when user asks about data from imported documents:**
+        1. FIRST: Use memory_operations with search_memory to look up the specific data
+        2. VERIFY: Confirm the search results contain the actual numbers before responding
+        3. CITE: Reference which document the data came from in your response
+        4. If search returns no results or partial data: Tell the user clearly what you found and what you could NOT find. NEVER fill gaps with estimates or assumptions.
+
+        **When data is NOT found:**
+        - Say explicitly: "I searched the imported documents but could not find [specific data]"
+        - Ask the user to clarify or provide the missing information
+        - Suggest re-importing the document if it may not have been fully indexed
+
+        **For calculations on imported data:**
+        - ALWAYS retrieve the source numbers first via search_memory
+        - Use math_operations for any computation (never do math in your head)
+        - Show your work: state the source values and the calculation performed
+
+        **VIOLATION: Presenting any number as fact without retrieving it from a document or the user providing it directly. This causes real-world harm when users make decisions based on fabricated data.**
+        """
+    }
+
     /// Builds operational modes (conversational + task execution).
     private static func buildOperationalModes() -> String {
         return """
@@ -839,6 +870,13 @@ private static func buildSAMSpecificPatterns() -> String {
                     order: 3
                 ),
 
+                SystemPromptComponent(
+                    title: "Data Integrity",
+                    content: Self.buildDataIntegrity(),
+                    isEnabled: true,
+                    order: 3
+                ),
+
                 // PRIORITY 2 - OPERATIONAL MODES
                 SystemPromptComponent(
                     title: "Operational Modes",
@@ -932,6 +970,17 @@ private static func buildSAMSpecificPatterns() -> String {
                     order: 2
                 ),
 
+                SystemPromptComponent(
+                    title: "Data Integrity",
+                    content: """
+                    NEVER fabricate, invent, or estimate numerical data, financial figures, or statistics.
+                    If documents are imported, use search_memory to look up data before answering.
+                    If you cannot find the data, tell the user. Never fill in gaps with guesses.
+                    """,
+                    isEnabled: true,
+                    order: 2
+                ),
+
                 SystemPromptComponent(
                     title: "Completion Signal",
                     content: Self.buildMinimalCompletionSignal(),
diff --git a/Sources/MCPFramework/DocumentImportReminderInjector.swift b/Sources/MCPFramework/DocumentImportReminderInjector.swift
@@ -90,6 +90,12 @@ public class DocumentImportReminderInjector {
             return nil
         }
 
+        /// Detect if any imported documents contain tabular/financial data
+        let hasTabularData = docs.contains { doc in
+            let ext = (doc.filename as NSString).pathExtension.lowercased()
+            return ext == "csv" || ext == "tsv" || ext == "xlsx" || ext == "xls"
+        }
+
         var reminder = """
         IMPORTED DOCUMENTS IN THIS CONVERSATION:
         The following documents have already been imported into memory. DO NOT re-import them.
@@ -108,6 +114,18 @@ public class DocumentImportReminderInjector {
         memory_operations(operation: "search_memory", query: "your search query", similarity_threshold: "0.2")
         """
 
+        if hasTabularData {
+            reminder += """
+
+            ⚠️ SPREADSHEET DATA IMPORTED - DATA INTEGRITY RULES APPLY:
+            This conversation contains imported spreadsheet/tabular data.
+            You MUST use search_memory to look up ANY numbers, values, or data points.
+            NEVER guess, estimate, or fabricate values from these documents.
+            If search_memory doesn't return the data you need, tell the user and ask.
+            Use math_operations for any calculations on retrieved data.
+            """
+        }
+
         return reminder
     }
 
diff --git a/Sources/UserInterface/Documents/DocumentImportSystem.swift b/Sources/UserInterface/Documents/DocumentImportSystem.swift
@@ -27,6 +27,7 @@ public class DocumentImportSystem: ObservableObject {
     private let officeProcessor = OfficeDocumentProcessor()
     private let textProcessor = TextDocumentProcessor()
     private let imageProcessor = ImageDocumentProcessor()
+    private let csvProcessor = CSVDocumentProcessor()
 
     public init(conversationManager: ConversationManager) {
         self.conversationManager = conversationManager
@@ -127,6 +128,10 @@ public class DocumentImportSystem: ObservableObject {
             UTType("com.microsoft.excel.xls") ?? .data,
             UTType("org.openxmlformats.spreadsheetml.sheet") ?? .data,
 
+            /// CSV/TSV Spreadsheets.
+            .commaSeparatedText,
+            UTType("public.tab-separated-values-text") ?? .data,
+
             /// Text Documents.
             .plainText,
             .utf8PlainText,
@@ -167,6 +172,9 @@ public class DocumentImportSystem: ObservableObject {
         case _ where contentType.conforms(to: .image):
             processor = imageProcessor
 
+        case _ where isCSVDocument(contentType):
+            processor = csvProcessor
+
         case _ where isOfficeDocument(contentType):
             processor = officeProcessor
 
@@ -245,6 +253,12 @@ public class DocumentImportSystem: ObservableObject {
         logger.debug("SUCCESS: Document \(document.filename) (ID: \(document.id)) is now searchable via semantic memory in conversation: \(conversationId?.uuidString ?? "global")")
     }
 
+    /// True when `contentType` is the CSV or TSV identifier, or conforms to the CSV type.
+    private func isCSVDocument(_ contentType: UTType) -> Bool {
+        let delimitedTextIDs: Set = ["public.tab-separated-values-text", "public.comma-separated-values-text"]
+        return delimitedTextIDs.contains(contentType.identifier) || contentType.conforms(to: .commaSeparatedText)
+    }
+
     private func isOfficeDocument(_ contentType: UTType) -> Bool {
         return contentType.identifier.contains("microsoft") ||
                contentType.identifier.contains("openxmlformats")
diff --git a/Sources/UserInterface/Documents/DocumentImportView.swift b/Sources/UserInterface/Documents/DocumentImportView.swift
@@ -185,6 +185,8 @@ struct DocumentImportView: View {
             .png,
             .jpeg,
             .tiff,
+            .commaSeparatedText,
+            UTType("public.tab-separated-values-text")!,
             UTType("com.microsoft.word.doc")!,
             UTType("org.openxmlformats.wordprocessingml.document")!,
             UTType("com.microsoft.excel.xls")!,
diff --git a/Sources/UserInterface/Documents/DocumentProcessors.swift b/Sources/UserInterface/Documents/DocumentProcessors.swift
@@ -158,6 +158,112 @@ class PDFDocumentProcessor: DocumentProcessor, @unchecked Sendable {
     }
 }
 
+/// CSV/TSV Document Processor - preserves tabular structure with headers and rows.
+class CSVDocumentProcessor: DocumentProcessor, @unchecked Sendable {
+    private let logger = Logger(label: "com.sam.documents.CSVProcessor")
+
+    /// Reads a delimited text file and renders it as a pipe-delimited table, one output line per row.
+    /// - Parameter url: Location of the CSV/TSV file to read.
+    /// - Parameter contentType: Declared file type; unused here because the delimiter is sniffed from the text.
+    /// - Returns: The rendered table plus `documentType`, `delimiter`, `rowCount`, `columnCount` and `note` metadata.
+    /// - Throws: File read errors, or `DocumentImportError.processingFailed` if decoding fails or no rows are found.
+    func extractContent(from url: URL, contentType: UTType) async throws -> DocumentExtractedContent {
+        logger.debug("Processing CSV/TSV: \(url.lastPathComponent)")
+        let data = try Data(contentsOf: url)
+        guard let text = String(data: data, encoding: .utf8) ?? String(data: data, encoding: .utf16) else {
+            throw DocumentImportError.processingFailed("Could not decode CSV file")
+        }
+
+        let delimiter = detectDelimiter(text)
+        let rows = parseCSV(text, delimiter: delimiter)
+        guard let headers = rows.first else {
+            throw DocumentImportError.processingFailed("CSV file is empty or could not be parsed")
+        }
+
+        let metadata: [String: String] = [
+            "documentType": "Spreadsheet (CSV/TSV)",
+            "delimiter": delimiter == "\t" ? "tab" : String(delimiter),
+            "rowCount": String(rows.count),
+            "columnCount": String(headers.count),
+            "note": "Tabular data preserved with row/column structure"
+        ]
+
+        /// First row is treated as headers and underlined with a dashed rule of equal length.
+        let headerLine = headers.joined(separator: " | ")
+        var outputLines = [headerLine, String(repeating: "-", count: headerLine.count)]
+        outputLines += rows.dropFirst().map { $0.joined(separator: " | ") }
+        logger.debug("CSV processed: \(rows.count) rows, \(headers.count) columns")
+        return DocumentExtractedContent(text: outputLines.joined(separator: "\n"), metadata: metadata)
+    }
+
+    /// Picks tab, semicolon or comma by frequency in the first five non-empty lines (quoted text included).
+    /// Tab must outnumber both others; otherwise semicolon wins if it outnumbers comma; comma is the default.
+    private func detectDelimiter(_ text: String) -> Character {
+        /// Split on Characters so a CRLF pair counts as one line break instead of two.
+        let sample = text.split(whereSeparator: \.isNewline).prefix(5).joined(separator: "\n")
+        let commas = sample.filter { $0 == "," }.count
+        let tabs = sample.filter { $0 == "\t" }.count
+        let semicolons = sample.filter { $0 == ";" }.count
+        if tabs > commas && tabs > semicolons { return "\t" }
+        if semicolons > commas { return ";" }
+        return ","
+    }
+
+    /// Splits delimited text into rows of whitespace-trimmed fields, skipping empty lines.
+    /// Quoted fields may contain the delimiter and line breaks; `""` inside quotes is one literal quote.
+    /// - Parameter text: Entire file contents.
+    /// - Parameter delimiter: Field separator recognised outside quoted fields.
+    /// - Returns: One array of fields per non-empty row, in file order.
+    private func parseCSV(_ text: String, delimiter: Character) -> [[String]] {
+        var rows: [[String]] = []
+        var currentField = ""
+        var currentRow: [String] = []
+        var inQuotes = false
+        /// True only while looking at the character right after a closing quote, to recognise `""`.
+        var justClosedQuote = false
+
+        /// Ends the current row if it holds anything; blank lines therefore produce no row.
+        func finishRow() {
+            guard !currentField.isEmpty || !currentRow.isEmpty else { return }
+            currentRow.append(currentField.trimmingCharacters(in: .whitespaces))
+            rows.append(currentRow)
+            currentRow = []
+            currentField = ""
+        }
+
+        for char in text {
+            if inQuotes {
+                if char == "\"" {
+                    inQuotes = false
+                    justClosedQuote = true
+                } else {
+                    currentField.append(char)
+                }
+                continue
+            }
+            let followsClosingQuote = justClosedQuote
+            justClosedQuote = false
+            switch char {
+            case "\"":
+                /// A quote right after a closing quote is an escaped `""`: keep one quote, stay quoted.
+                if followsClosingQuote { currentField.append(char) }
+                inQuotes = true
+            case delimiter:
+                currentRow.append(currentField.trimmingCharacters(in: .whitespaces))
+                currentField = ""
+            case "\n", "\r", "\r\n":
+                /// Swift iterates a CRLF pair as the single Character "\r\n", so it needs its own case.
+                finishRow()
+            default:
+                currentField.append(char)
+            }
+        }
+        /// Handle last row without trailing newline
+        finishRow()
+        return rows
+    }
+}
+
 /// Text Document Processor Handles plain text, markdown, RTF, and code files.
 class TextDocumentProcessor: DocumentProcessor, @unchecked Sendable {
     private let logger = Logger(label: "com.sam.documents.TextProcessor")
@@ -426,7 +532,7 @@ class OfficeDocumentProcessor: DocumentProcessor, @unchecked Sendable {
         return paragraphs.joined(separator: "\n\n")
     }
 
-    /// Extract text from Excel spreadsheet (.xlsx).
+    /// Extract text from Excel spreadsheet (.xlsx) preserving row/column structure.
     private func extractExcelDocument(from archive: Archive, metadata: inout [String: String]) async throws -> String {
         /// Excel stores shared strings in xl/sharedStrings.xml And sheet data in xl/worksheets/sheet*.xml.
 
@@ -448,8 +554,13 @@ class OfficeDocumentProcessor: DocumentProcessor, @unchecked Sendable {
         /// Find all worksheet entries.
         var worksheetText: [String] = []
         var sheetCount = 0
+        var totalRows = 0
 
-        for entry in archive where entry.path.hasPrefix("xl/worksheets/sheet") && entry.path.hasSuffix(".xml") {
+        /// Sort sheet entries so they appear in order
+        let sheetEntries = archive.sorted { $0.path < $1.path }
+            .filter { $0.path.hasPrefix("xl/worksheets/sheet") && $0.path.hasSuffix(".xml") }
+
+        for entry in sheetEntries {
             sheetCount += 1
 
             var sheetData = Data()
@@ -459,41 +570,88 @@ class OfficeDocumentProcessor: DocumentProcessor, @unchecked Sendable {
 
             let xmlDoc = try XMLDocument(data: sheetData)
 
-            /// Extract cell values Cells with shared strings reference index via <c t="s"><v>index</v></c> Cells with direct values use <c><v>value</v></c>.
-            let cellNodes = try xmlDoc.nodes(forXPath: "//c")
+            /// Parse by rows to preserve tabular structure
+            let rowNodes = try xmlDoc.nodes(forXPath: "//row")
+            var maxColumn = 0
+            var rows: [(rowNum: Int, cells: [(col: Int, value: String)])] = []
+
+            for rowNode in rowNodes {
+                guard let rowElement = rowNode as? XMLElement else { continue }
+                let rowNum = Int(rowElement.attribute(forName: "r")?.stringValue ?? "0") ?? 0
+                let cellNodes = try rowElement.nodes(forXPath: "./c")
 
-            var sheetCells: [String] = []
+                var rowCells: [(col: Int, value: String)] = []
 
-            for cellNode in cellNodes {
-                if let cell = cellNode as? XMLElement {
-                    /// Check if cell uses shared string.
+                for cellNode in cellNodes {
+                    guard let cell = cellNode as? XMLElement else { continue }
+                    let cellRef = cell.attribute(forName: "r")?.stringValue ?? ""
                     let cellType = cell.attribute(forName: "t")?.stringValue
+                    let colIndex = columnIndex(from: cellRef)
+
+                    if colIndex > maxColumn { maxColumn = colIndex }
 
-                    if let valueNode = try cell.nodes(forXPath: "./v").first {
-                        if let valueString = valueNode.stringValue {
-                            if cellType == "s", let index = Int(valueString), index < sharedStrings.count {
-                                /// Shared string reference.
-                                sheetCells.append(sharedStrings[index])
-                            } else {
-                                /// Direct value.
-                                sheetCells.append(valueString)
-                            }
+                    var cellValue = ""
+
+                    if let valueNode = try cell.nodes(forXPath: "./v").first,
+                       let valueString = valueNode.stringValue {
+                        if cellType == "s", let index = Int(valueString), index < sharedStrings.count {
+                            cellValue = sharedStrings[index]
+                        } else {
+                            cellValue = valueString
                         }
+                    } else if let inlineNode = try cell.nodes(forXPath: "./is/t").first {
+                        /// Handle inline strings
+                        cellValue = inlineNode.stringValue ?? ""
+                    }
+
+                    if !cellValue.isEmpty {
+                        rowCells.append((col: colIndex, value: cellValue))
                     }
                 }
+
+                if !rowCells.isEmpty {
+                    rows.append((rowNum: rowNum, cells: rowCells))
+                }
             }
 
-            if !sheetCells.isEmpty {
-                worksheetText.append("[Sheet \(sheetCount)]\n" + sheetCells.joined(separator: " | "))
+            if rows.isEmpty { continue }
+            totalRows += rows.count
+
+            /// Build pipe-delimited table with proper column alignment
+            var sheetLines: [String] = ["[Sheet \(sheetCount)]"]
+
+            for row in rows {
+                var columns = Array(repeating: "", count: maxColumn + 1)
+                for cell in row.cells {
+                    if cell.col <= maxColumn {
+                        columns[cell.col] = cell.value
+                    }
+                }
+                /// Trim trailing empty columns for this row
+                while columns.last?.isEmpty == true { columns.removeLast() }
+                sheetLines.append(columns.joined(separator: " | "))
             }
+
+            worksheetText.append(sheetLines.joined(separator: "\n"))
         }
 
         metadata["documentType"] = "Microsoft Excel"
         metadata["sheetCount"] = String(sheetCount)
-        metadata["note"] = "Cell values extracted (formatting not preserved)"
+        metadata["totalRows"] = String(totalRows)
+        metadata["note"] = "Tabular data preserved with row/column structure"
 
-        logger.debug("Extracted \(sheetCount) sheets from Excel document")
+        logger.debug("Extracted \(sheetCount) sheets, \(totalRows) rows from Excel document")
 
         return worksheetText.joined(separator: "\n\n")
     }
+
+    /// Convert the column letters of a cell reference ("A1" -> 0, "b7" -> 1, "AA10" -> 26) to a zero-based index.
+    /// Returns -1 when the reference does not start with an ASCII letter, so callers must range-check the result.
+    private func columnIndex(from cellRef: String) -> Int {
+        /// `isLetter` alone also admits non-ASCII letters, whose nil `asciiValue` would crash a force-unwrap.
+        let letters = cellRef.prefix(while: { $0.isASCII && $0.isLetter }).uppercased()
+        let offset = Int(UInt8(ascii: "A")) - 1
+        /// Bijective base-26: A=1 ... Z=26, AA=27; subtract one at the end for a zero-based result.
+        return letters.compactMap(\.asciiValue).reduce(0) { $0 * 26 + Int($1) - offset } - 1
+    }
 }