Skip to content

Commit d935e13

Browse files
committed
fix(data): prevent financial data hallucination with data integrity system
- XLSX extraction now preserves row/column structure using cell references instead of flattening all cells into a single pipe-delimited line
- Add CSV/TSV processor with header detection, delimiter auto-detect, and proper quoted field parsing
- Add Data Integrity system prompt section (both SAM Default and Minimal) with zero-tolerance policy for fabricated numerical data
- DocumentImportReminderInjector adds stronger warning when spreadsheet data is imported, requiring search_memory before answering data queries
- Add CSV/TSV to supported file types in import system and file picker
1 parent 9611dbf commit d935e13

File tree

5 files changed

+262
-21
lines changed

5 files changed

+262
-21
lines changed

Sources/ConfigurationSystem/SystemPromptConfiguration.swift

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,37 @@ public struct SystemPromptConfiguration: Codable, Identifiable, Hashable, Sendab
402402
"""
403403
}
404404

405+
/// Produces the "Data Integrity" system prompt section.
///
/// The section is a fixed Markdown block instructing the model never to fabricate
/// numerical or financial data and to retrieve figures through `search_memory`
/// (and compute through `math_operations`) before presenting them.
///
/// - Returns: The Markdown text of the section.
private static func buildDataIntegrity() -> String {
    let dataIntegrityRules = """
    ## Data Integrity (CRITICAL - ZERO TOLERANCE FOR DATA FABRICATION)

    **NEVER fabricate, invent, estimate, round, or hallucinate ANY of the following:**
    - Financial figures (revenue, expenses, balances, prices, rates)
    - Statistical data (percentages, counts, averages, totals)
    - Dates, amounts, or quantities from user documents
    - Any specific number that should come from imported data

    **MANDATORY PROTOCOL when user asks about data from imported documents:**
    1. FIRST: Use memory_operations with search_memory to look up the specific data
    2. VERIFY: Confirm the search results contain the actual numbers before responding
    3. CITE: Reference which document the data came from in your response
    4. If search returns no results or partial data: Tell the user clearly what you found and what you could NOT find. NEVER fill gaps with estimates or assumptions.

    **When data is NOT found:**
    - Say explicitly: "I searched the imported documents but could not find [specific data]"
    - Ask the user to clarify or provide the missing information
    - Suggest re-importing the document if it may not have been fully indexed

    **For calculations on imported data:**
    - ALWAYS retrieve the source numbers first via search_memory
    - Use math_operations for any computation (never do math in your head)
    - Show your work: state the source values and the calculation performed

    **VIOLATION: Presenting any number as fact without retrieving it from a document or the user providing it directly. This causes real-world harm when users make decisions based on fabricated data.**
    """
    return dataIntegrityRules
}
435+
405436
/// Builds operational modes (conversational + task execution).
406437
private static func buildOperationalModes() -> String {
407438
return """
@@ -839,6 +870,13 @@ private static func buildSAMSpecificPatterns() -> String {
839870
order: 3
840871
),
841872

873+
SystemPromptComponent(
874+
title: "Data Integrity",
875+
content: Self.buildDataIntegrity(),
876+
isEnabled: true,
877+
order: 3
878+
),
879+
842880
// PRIORITY 2 - OPERATIONAL MODES
843881
SystemPromptComponent(
844882
title: "Operational Modes",
@@ -932,6 +970,17 @@ private static func buildSAMSpecificPatterns() -> String {
932970
order: 2
933971
),
934972

973+
SystemPromptComponent(
974+
title: "Data Integrity",
975+
content: """
976+
NEVER fabricate, invent, or estimate numerical data, financial figures, or statistics.
977+
If documents are imported, use search_memory to look up data before answering.
978+
If you cannot find the data, tell the user. Never fill in gaps with guesses.
979+
""",
980+
isEnabled: true,
981+
order: 2
982+
),
983+
935984
SystemPromptComponent(
936985
title: "Completion Signal",
937986
content: Self.buildMinimalCompletionSignal(),

Sources/MCPFramework/DocumentImportReminderInjector.swift

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ public class DocumentImportReminderInjector {
9090
return nil
9191
}
9292

93+
/// Detect if any imported documents contain tabular/financial data
94+
let hasTabularData = docs.contains { doc in
95+
let ext = (doc.filename as NSString).pathExtension.lowercased()
96+
return ext == "csv" || ext == "tsv" || ext == "xlsx" || ext == "xls"
97+
}
98+
9399
var reminder = """
94100
IMPORTED DOCUMENTS IN THIS CONVERSATION:
95101
The following documents have already been imported into memory. DO NOT re-import them.
@@ -108,6 +114,18 @@ public class DocumentImportReminderInjector {
108114
memory_operations(operation: "search_memory", query: "your search query", similarity_threshold: "0.2")
109115
"""
110116

117+
if hasTabularData {
118+
reminder += """
119+
120+
⚠️ SPREADSHEET DATA IMPORTED - DATA INTEGRITY RULES APPLY:
121+
This conversation contains imported spreadsheet/tabular data.
122+
You MUST use search_memory to look up ANY numbers, values, or data points.
123+
NEVER guess, estimate, or fabricate values from these documents.
124+
If search_memory doesn't return the data you need, tell the user and ask.
125+
Use math_operations for any calculations on retrieved data.
126+
"""
127+
}
128+
111129
return reminder
112130
}
113131

Sources/UserInterface/Documents/DocumentImportSystem.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ public class DocumentImportSystem: ObservableObject {
2727
private let officeProcessor = OfficeDocumentProcessor()
2828
private let textProcessor = TextDocumentProcessor()
2929
private let imageProcessor = ImageDocumentProcessor()
30+
private let csvProcessor = CSVDocumentProcessor()
3031

3132
public init(conversationManager: ConversationManager) {
3233
self.conversationManager = conversationManager
@@ -127,6 +128,10 @@ public class DocumentImportSystem: ObservableObject {
127128
UTType("com.microsoft.excel.xls") ?? .data,
128129
UTType("org.openxmlformats.spreadsheetml.sheet") ?? .data,
129130

131+
/// CSV/TSV Spreadsheets.
132+
.commaSeparatedText,
133+
UTType("public.tab-separated-values-text") ?? .data,
134+
130135
/// Text Documents.
131136
.plainText,
132137
.utf8PlainText,
@@ -167,6 +172,9 @@ public class DocumentImportSystem: ObservableObject {
167172
case _ where contentType.conforms(to: .image):
168173
processor = imageProcessor
169174

175+
case _ where isCSVDocument(contentType):
176+
processor = csvProcessor
177+
170178
case _ where isOfficeDocument(contentType):
171179
processor = officeProcessor
172180

@@ -245,6 +253,12 @@ public class DocumentImportSystem: ObservableObject {
245253
logger.debug("SUCCESS: Document \(document.filename) (ID: \(document.id)) is now searchable via semantic memory in conversation: \(conversationId?.uuidString ?? "global")")
246254
}
247255

256+
/// Returns whether `contentType` is delimited text that should be routed to the CSV/TSV processor.
///
/// Uses type conformance rather than identifier string comparison, so a type that
/// is declared as conforming to comma- or tab-separated text also matches. A type
/// always conforms to itself, so the exact `public.comma-separated-values-text`
/// and `public.tab-separated-values-text` identifiers are still accepted.
///
/// - Parameter contentType: The uniform type identifier of the file being imported.
/// - Returns: `true` for comma-separated or tab-separated text types.
private func isCSVDocument(_ contentType: UTType) -> Bool {
    return contentType.conforms(to: .commaSeparatedText) ||
        contentType.conforms(to: .tabSeparatedText)
}
261+
248262
private func isOfficeDocument(_ contentType: UTType) -> Bool {
249263
return contentType.identifier.contains("microsoft") ||
250264
contentType.identifier.contains("openxmlformats")

Sources/UserInterface/Documents/DocumentImportView.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ struct DocumentImportView: View {
185185
.png,
186186
.jpeg,
187187
.tiff,
188+
.commaSeparatedText,
189+
UTType("public.tab-separated-values-text")!,
188190
UTType("com.microsoft.word.doc")!,
189191
UTType("org.openxmlformats.wordprocessingml.document")!,
190192
UTType("com.microsoft.excel.xls")!,

Sources/UserInterface/Documents/DocumentProcessors.swift

Lines changed: 179 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,112 @@ class PDFDocumentProcessor: DocumentProcessor, @unchecked Sendable {
158158
}
159159
}
160160

161+
/// CSV/TSV Document Processor - preserves tabular structure with headers and rows.
///
/// The file is decoded as UTF-8 (falling back to UTF-16), the delimiter is
/// auto-detected (comma, tab, or semicolon), and each parsed record is emitted as
/// one ` | `-separated line, with a dashed rule under the first (header) row.
class CSVDocumentProcessor: DocumentProcessor, @unchecked Sendable {
    private let logger = Logger(label: "com.sam.documents.CSVProcessor")

    /// Reads the delimited text file at `url` and converts it to a pipe-delimited table.
    ///
    /// - Parameters:
    ///   - url: Location of the CSV/TSV file.
    ///   - contentType: Declared type of the file (not consulted; the delimiter is detected from content).
    /// - Returns: The table text plus metadata (`documentType`, `delimiter`, `rowCount`, `columnCount`, `note`).
    /// - Throws: Any error from reading the file, or `DocumentImportError.processingFailed`
    ///   when the bytes cannot be decoded or no rows are found.
    func extractContent(from url: URL, contentType: UTType) async throws -> DocumentExtractedContent {
        logger.debug("Processing CSV/TSV: \(url.lastPathComponent)")

        let data = try Data(contentsOf: url)
        guard var text = String(data: data, encoding: .utf8) ?? String(data: data, encoding: .utf16) else {
            throw DocumentImportError.processingFailed("Could not decode CSV file")
        }

        /// Drop a leading byte-order mark so it does not become part of the first header cell.
        if text.unicodeScalars.first == "\u{FEFF}" {
            text.unicodeScalars.removeFirst()
        }

        var metadata: [String: String] = [:]
        metadata["documentType"] = "Spreadsheet (CSV/TSV)"

        /// Detect delimiter (comma, tab, semicolon)
        let delimiter = detectDelimiter(text)
        metadata["delimiter"] = delimiter == "\t" ? "tab" : String(delimiter)

        /// Parse CSV preserving structure
        let rows = parseCSV(text, delimiter: delimiter)
        guard !rows.isEmpty else {
            throw DocumentImportError.processingFailed("CSV file is empty or could not be parsed")
        }

        metadata["rowCount"] = String(rows.count)
        metadata["columnCount"] = String(rows.first?.count ?? 0)

        /// Build pipe-delimited output preserving row structure
        var outputLines: [String] = []

        /// First row is treated as headers and underlined with a rule of matching length
        if let headers = rows.first {
            let headerLine = headers.joined(separator: " | ")
            outputLines.append(headerLine)
            outputLines.append(String(repeating: "-", count: headerLine.count))
        }

        for row in rows.dropFirst() {
            outputLines.append(row.joined(separator: " | "))
        }

        let output = outputLines.joined(separator: "\n")
        metadata["note"] = "Tabular data preserved with row/column structure"

        logger.debug("CSV processed: \(rows.count) rows, \(rows.first?.count ?? 0) columns")

        return DocumentExtractedContent(text: output, metadata: metadata)
    }

    /// Detect the most likely delimiter in the content.
    ///
    /// Samples the first five non-empty lines and counts commas, tabs, and semicolons
    /// that appear outside double-quoted fields, so punctuation inside quoted text
    /// (e.g. `"Smith, John"`) does not skew the result.
    ///
    /// - Returns: `"\t"` when tabs outnumber both others, `";"` when semicolons
    ///   outnumber commas, otherwise `","`.
    private func detectDelimiter(_ text: String) -> Character {
        /// `split` omits empty subsequences, so blank lines and CRLF terminators
        /// do not consume sample slots.
        let sampleLines = text.split(whereSeparator: { $0.isNewline }).prefix(5)

        var commas = 0
        var tabs = 0
        var semicolons = 0
        /// Carried across lines because a quoted field may contain line breaks.
        var inQuotes = false

        for line in sampleLines {
            for char in line {
                if char == "\"" {
                    inQuotes.toggle()
                    continue
                }
                guard !inQuotes else { continue }
                switch char {
                case ",": commas += 1
                case "\t": tabs += 1
                case ";": semicolons += 1
                default: break
                }
            }
        }

        if tabs > commas && tabs > semicolons { return "\t" }
        if semicolons > commas { return ";" }
        return ","
    }

    /// Parse CSV handling quoted fields.
    ///
    /// Supports delimiters and line breaks inside double-quoted fields, doubled
    /// quotes (`""`) as an escaped literal quote, and LF, CR, or CRLF record
    /// terminators. Each field is trimmed of surrounding spaces and tabs, and
    /// completely empty lines are skipped.
    ///
    /// - Parameters:
    ///   - text: Full decoded file content.
    ///   - delimiter: Field separator to split on.
    /// - Returns: One array of field strings per record, in file order.
    private func parseCSV(_ text: String, delimiter: Character) -> [[String]] {
        var rows: [[String]] = []
        var currentField = ""
        var currentRow: [String] = []
        var inQuotes = false
        /// True only while examining the character immediately after a closing quote;
        /// a quote seen in that position is the second half of an escaped `""` pair.
        var quoteJustClosed = false

        /// Ends the field being accumulated and adds it to the current row.
        func finishField() {
            currentRow.append(currentField.trimmingCharacters(in: .whitespaces))
            currentField = ""
        }

        /// Ends the current row, ignoring lines that contained nothing at all.
        func finishRow() {
            guard !currentField.isEmpty || !currentRow.isEmpty else { return }
            finishField()
            rows.append(currentRow)
            currentRow = []
        }

        for char in text {
            if inQuotes {
                if char == "\"" {
                    /// Either the end of the field or the first half of an escaped quote;
                    /// the next character decides.
                    inQuotes = false
                    quoteJustClosed = true
                } else {
                    currentField.append(char)
                }
                continue
            }

            let followsClosingQuote = quoteJustClosed
            quoteJustClosed = false

            switch char {
            case "\"":
                /// `""` inside a quoted field is a literal quote; re-enter quoted mode.
                if followsClosingQuote {
                    currentField.append("\"")
                }
                inQuotes = true
            case delimiter:
                finishField()
            case "\n", "\r", "\r\n":
                /// "\r\n" is a single Character (one grapheme cluster) in Swift, so it
                /// must be matched explicitly or CRLF files never split into rows.
                finishRow()
            default:
                currentField.append(char)
            }
        }

        /// Handle last row without trailing newline
        finishRow()

        return rows
    }
}
266+
161267
/// Text Document Processor Handles plain text, markdown, RTF, and code files.
162268
class TextDocumentProcessor: DocumentProcessor, @unchecked Sendable {
163269
private let logger = Logger(label: "com.sam.documents.TextProcessor")
@@ -426,7 +532,7 @@ class OfficeDocumentProcessor: DocumentProcessor, @unchecked Sendable {
426532
return paragraphs.joined(separator: "\n\n")
427533
}
428534

429-
/// Extract text from Excel spreadsheet (.xlsx).
535+
/// Extract text from Excel spreadsheet (.xlsx) preserving row/column structure.
430536
private func extractExcelDocument(from archive: Archive, metadata: inout [String: String]) async throws -> String {
431537
/// Excel stores shared strings in xl/sharedStrings.xml And sheet data in xl/worksheets/sheet*.xml.
432538

@@ -448,8 +554,13 @@ class OfficeDocumentProcessor: DocumentProcessor, @unchecked Sendable {
448554
/// Find all worksheet entries.
449555
var worksheetText: [String] = []
450556
var sheetCount = 0
557+
var totalRows = 0
451558

452-
for entry in archive where entry.path.hasPrefix("xl/worksheets/sheet") && entry.path.hasSuffix(".xml") {
559+
/// Sort sheet entries so they appear in order
560+
let sheetEntries = archive.sorted { $0.path < $1.path }
561+
.filter { $0.path.hasPrefix("xl/worksheets/sheet") && $0.path.hasSuffix(".xml") }
562+
563+
for entry in sheetEntries {
453564
sheetCount += 1
454565

455566
var sheetData = Data()
@@ -459,41 +570,88 @@ class OfficeDocumentProcessor: DocumentProcessor, @unchecked Sendable {
459570

460571
let xmlDoc = try XMLDocument(data: sheetData)
461572

462-
/// Extract cell values Cells with shared strings reference index via <c t="s"><v>index</v></c> Cells with direct values use <c><v>value</v></c>.
463-
let cellNodes = try xmlDoc.nodes(forXPath: "//c")
573+
/// Parse by rows to preserve tabular structure
574+
let rowNodes = try xmlDoc.nodes(forXPath: "//row")
575+
var maxColumn = 0
576+
var rows: [(rowNum: Int, cells: [(col: Int, value: String)])] = []
577+
578+
for rowNode in rowNodes {
579+
guard let rowElement = rowNode as? XMLElement else { continue }
580+
let rowNum = Int(rowElement.attribute(forName: "r")?.stringValue ?? "0") ?? 0
581+
let cellNodes = try rowElement.nodes(forXPath: "./c")
464582

465-
var sheetCells: [String] = []
583+
var rowCells: [(col: Int, value: String)] = []
466584

467-
for cellNode in cellNodes {
468-
if let cell = cellNode as? XMLElement {
469-
/// Check if cell uses shared string.
585+
for cellNode in cellNodes {
586+
guard let cell = cellNode as? XMLElement else { continue }
587+
let cellRef = cell.attribute(forName: "r")?.stringValue ?? ""
470588
let cellType = cell.attribute(forName: "t")?.stringValue
589+
let colIndex = columnIndex(from: cellRef)
590+
591+
if colIndex > maxColumn { maxColumn = colIndex }
471592

472-
if let valueNode = try cell.nodes(forXPath: "./v").first {
473-
if let valueString = valueNode.stringValue {
474-
if cellType == "s", let index = Int(valueString), index < sharedStrings.count {
475-
/// Shared string reference.
476-
sheetCells.append(sharedStrings[index])
477-
} else {
478-
/// Direct value.
479-
sheetCells.append(valueString)
480-
}
593+
var cellValue = ""
594+
595+
if let valueNode = try cell.nodes(forXPath: "./v").first,
596+
let valueString = valueNode.stringValue {
597+
if cellType == "s", let index = Int(valueString), index < sharedStrings.count {
598+
cellValue = sharedStrings[index]
599+
} else {
600+
cellValue = valueString
481601
}
602+
} else if let inlineNode = try cell.nodes(forXPath: "./is/t").first {
603+
/// Handle inline strings
604+
cellValue = inlineNode.stringValue ?? ""
605+
}
606+
607+
if !cellValue.isEmpty {
608+
rowCells.append((col: colIndex, value: cellValue))
482609
}
483610
}
611+
612+
if !rowCells.isEmpty {
613+
rows.append((rowNum: rowNum, cells: rowCells))
614+
}
484615
}
485616

486-
if !sheetCells.isEmpty {
487-
worksheetText.append("[Sheet \(sheetCount)]\n" + sheetCells.joined(separator: " | "))
617+
if rows.isEmpty { continue }
618+
totalRows += rows.count
619+
620+
/// Build pipe-delimited table with proper column alignment
621+
var sheetLines: [String] = ["[Sheet \(sheetCount)]"]
622+
623+
for row in rows {
624+
var columns = Array(repeating: "", count: maxColumn + 1)
625+
for cell in row.cells {
626+
if cell.col <= maxColumn {
627+
columns[cell.col] = cell.value
628+
}
629+
}
630+
/// Trim trailing empty columns for this row
631+
while columns.last?.isEmpty == true { columns.removeLast() }
632+
sheetLines.append(columns.joined(separator: " | "))
488633
}
634+
635+
worksheetText.append(sheetLines.joined(separator: "\n"))
489636
}
490637

491638
metadata["documentType"] = "Microsoft Excel"
492639
metadata["sheetCount"] = String(sheetCount)
493-
metadata["note"] = "Cell values extracted (formatting not preserved)"
640+
metadata["totalRows"] = String(totalRows)
641+
metadata["note"] = "Tabular data preserved with row/column structure"
494642

495-
logger.debug("Extracted \(sheetCount) sheets from Excel document")
643+
logger.debug("Extracted \(sheetCount) sheets, \(totalRows) rows from Excel document")
496644

497645
return worksheetText.joined(separator: "\n\n")
498646
}
647+
648+
/// Convert Excel column reference (e.g., "A", "B", "AA") to zero-based index.
///
/// Reads the leading run of ASCII letters from `cellRef` (case-insensitive) and
/// interprets it as a bijective base-26 column name, so "A1" is 0, "Z9" is 25 and
/// "AA3" is 26. Parsing stops at the first character that is not an ASCII letter.
///
/// - Parameter cellRef: A cell reference such as "B12", or a bare column name.
/// - Returns: The zero-based column index, or -1 when `cellRef` has no leading
///   ASCII letters or the column name is too long to represent. Callers must
///   check for -1 before using the result as an array index.
private func columnIndex(from cellRef: String) -> Int {
    let upperA = UInt8(ascii: "A")
    let upperZ = UInt8(ascii: "Z")
    let lowerA = UInt8(ascii: "a")
    let lowerZ = UInt8(ascii: "z")
    /// Largest accumulator for which `index * 26 + 26` still fits in Int.
    let overflowLimit = (Int.max - 26) / 26

    var index = 0
    for char in cellRef {
        /// Non-ASCII characters (including non-ASCII letters) end the column name
        /// instead of being force-unwrapped.
        guard var code = char.asciiValue else { break }
        if code >= lowerA && code <= lowerZ {
            code -= lowerA - upperA
        }
        guard code >= upperA && code <= upperZ else { break }
        guard index <= overflowLimit else { return -1 }
        index = index * 26 + Int(code - upperA) + 1
    }
    return index - 1
}
499657
}

0 commit comments

Comments
 (0)