Skip to content

Commit 35eb09f

Browse files
authored
Merge pull request #9 from prosdevlab/feat/persist-extraction-results
feat(storage): persist extraction results with PII-safe path hashing
2 parents 8708809 + 46362d9 commit 35eb09f

File tree

13 files changed

+162
-40
lines changed

13 files changed

+162
-40
lines changed

AGENTS.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,38 @@ The project exposes an MCP server (`packages/cli/src/mcp/server.ts`) that allows
105105
- **Linting/Formatting**: Run `pnpm lint` and `pnpm format` (using Biome).
106106
- **Commits**: Follow Conventional Commits (e.g., `feat(cli): add new command`).
107107

108+
## Security & Privacy
109+
110+
This is a **privacy-first** tool. All data stays local. When implementing features that touch user data, follow these principles:
111+
112+
### PII Considerations
113+
114+
Before storing or logging any data, ask:
115+
1. **Does this contain PII?** (usernames, paths, emails, IPs)
116+
2. **Is storage necessary?** Can we derive what we need without storing raw PII?
117+
3. **What's the blast radius?** If this data leaks, what's exposed?
118+
119+
**Preferred patterns:**
120+
| Instead of | Use |
121+
|------------|-----|
122+
| `/Users/john/docs/invoice.pdf` | SHA-256 hash of the absolute path, stored alongside the bare filename (`invoice.pdf`) |
123+
| Full error stack with paths | Sanitized error messages |
124+
| Logging request bodies | Logging request metadata only |
125+
126+
### Data Locality
127+
128+
- All extracted document data stays in local SQLite
129+
- No telemetry, no cloud sync, no external API calls (except AI providers)
130+
- User controls their data completely
131+
132+
### AI Provider Data
133+
134+
When sending data to AI providers:
135+
- Gemini/OpenAI: Data leaves the machine (the user accepts this by providing an API key)
136+
- Ollama: Data stays local (default, privacy-first option)
137+
138+
Document these trade-offs clearly in user-facing docs.
139+
108140
## Adding New Features
109141

110142
1. **New AI Provider**: Update `packages/extract/src/index.ts` to implement the provider logic.

packages/cli/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"dependencies": {
1818
"@doc-agent/core": "workspace:*",
1919
"@doc-agent/extract": "workspace:*",
20+
"@doc-agent/storage": "workspace:*",
2021
"@doc-agent/vector-store": "workspace:*",
2122
"@google/generative-ai": "^0.24.1",
2223
"@modelcontextprotocol/sdk": "^1.24.3",

packages/cli/src/cli.ts

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
#!/usr/bin/env node
22
import { exec } from 'node:child_process';
3+
import { resolve } from 'node:path';
34
import { promisify } from 'node:util';
45
import type { Config } from '@doc-agent/core';
56
import { extractDocument } from '@doc-agent/extract';
7+
import { storage } from '@doc-agent/storage';
68
import chalk from 'chalk';
79
import { Command } from 'commander';
810
import ora from 'ora';
@@ -51,6 +53,7 @@ program
5153
'Model to use (default: llama3.2-vision for ollama)',
5254
'llama3.2-vision'
5355
)
56+
.option('-d, --dry-run', 'Print JSON only, do not save to database', false)
5457
.action(async (file: string, options) => {
5558
try {
5659
if (options.provider === 'ollama') {
@@ -68,14 +71,16 @@ program
6871

6972
const result = await extractDocument(file, config);
7073

71-
spinner.succeed(chalk.green('Extraction complete!'));
74+
if (options.dryRun) {
75+
spinner.succeed(chalk.green('Extraction complete (dry run)'));
76+
} else {
77+
const absolutePath = resolve(file);
78+
await storage.saveDocument(result, absolutePath);
79+
spinner.succeed(chalk.green(`Saved: ${result.filename} (ID: ${result.id})`));
80+
}
81+
7282
console.log(JSON.stringify(result, null, 2));
7383
} catch (error) {
74-
// Only fail the spinner if it's running (ensureOllamaModel might have failed already)
75-
if (ora().isSpinning) {
76-
// This check is tricky because ora() creates a new instance.
77-
// We'll just log the error.
78-
}
7984
console.error(chalk.red('\nExtraction failed:'));
8085
console.error((error as Error).message);
8186
process.exit(1);

packages/cli/tsconfig.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@
66
},
77
"include": ["src/**/*"],
88
"exclude": ["node_modules", "dist", "**/*.test.ts"],
9-
"references": [{ "path": "../core" }, { "path": "../extract" }, { "path": "../vector-store" }],
9+
"references": [
10+
{ "path": "../core" },
11+
{ "path": "../extract" },
12+
{ "path": "../storage" },
13+
{ "path": "../vector-store" }
14+
],
1015
"ts-node": {
1116
"esm": true
1217
}

packages/storage/drizzle/0000_blue_legion.sql

Lines changed: 0 additions & 8 deletions
This file was deleted.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
-- Initial schema: a single `documents` table holding one row per stored document.
-- `path_hash` gets a unique index (created after the breakpoint below), so a given
-- path hash can appear in at most one row.
-- NOTE(review): the `created_at` default is a fixed, JSON-quoted timestamp string on an
-- integer column. It looks like a value captured when the migration was generated rather
-- than an insert-time default -- confirm against the schema definition.
CREATE TABLE `documents` (
2+
`id` text PRIMARY KEY NOT NULL,
3+
`path_hash` text NOT NULL,
4+
`filename` text NOT NULL,
5+
`content_hash` text,
6+
`status` text DEFAULT 'pending' NOT NULL,
7+
`data` text NOT NULL,
8+
`created_at` integer DEFAULT '"2025-12-07T19:46:45.333Z"' NOT NULL
9+
);
10+
--> statement-breakpoint
11+
CREATE UNIQUE INDEX `documents_path_hash_unique` ON `documents` (`path_hash`);

packages/storage/drizzle/meta/0000_snapshot.json

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"version": "6",
33
"dialect": "sqlite",
4-
"id": "6eba05ff-3086-4f83-bc86-506b17db1c03",
4+
"id": "b3921c83-f822-4238-92be-3cba302553fe",
55
"prevId": "00000000-0000-0000-0000-000000000000",
66
"tables": {
77
"documents": {
@@ -14,15 +14,22 @@
1414
"notNull": true,
1515
"autoincrement": false
1616
},
17-
"path": {
18-
"name": "path",
17+
"path_hash": {
18+
"name": "path_hash",
1919
"type": "text",
2020
"primaryKey": false,
2121
"notNull": true,
2222
"autoincrement": false
2323
},
24-
"hash": {
25-
"name": "hash",
24+
"filename": {
25+
"name": "filename",
26+
"type": "text",
27+
"primaryKey": false,
28+
"notNull": true,
29+
"autoincrement": false
30+
},
31+
"content_hash": {
32+
"name": "content_hash",
2633
"type": "text",
2734
"primaryKey": false,
2835
"notNull": false,
@@ -49,10 +56,16 @@
4956
"primaryKey": false,
5057
"notNull": true,
5158
"autoincrement": false,
52-
"default": "'\"2025-12-07T11:19:23.284Z\"'"
59+
"default": "'\"2025-12-07T19:46:45.333Z\"'"
60+
}
61+
},
62+
"indexes": {
63+
"documents_path_hash_unique": {
64+
"name": "documents_path_hash_unique",
65+
"columns": ["path_hash"],
66+
"isUnique": true
5367
}
5468
},
55-
"indexes": {},
5669
"foreignKeys": {},
5770
"compositePrimaryKeys": {},
5871
"uniqueConstraints": {},

packages/storage/drizzle/meta/_journal.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
{
66
"idx": 0,
77
"version": "6",
8-
"when": 1765106363288,
9-
"tag": "0000_blue_legion",
8+
"when": 1765136805336,
9+
"tag": "0000_init",
1010
"breakpoints": true
1111
}
1212
]

packages/storage/src/__tests__/repository.test.ts

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import type { DocumentData } from '@doc-agent/core';
22
import { beforeEach, describe, expect, it } from 'vitest';
33
import { createDb } from '../db';
4-
import { DocumentRepository } from '../index';
4+
import { computePathHash, DocumentRepository } from '../index';
55

66
describe('DocumentRepository', () => {
77
let repo: DocumentRepository;
@@ -32,10 +32,11 @@ describe('DocumentRepository', () => {
3232
...mockDoc,
3333
extractedAt: mockDoc.extractedAt.toISOString(),
3434
});
35-
expect(result?.path).toBe('/tmp/invoice.pdf');
35+
expect(result?.filename).toBe('invoice.pdf');
36+
expect(result?.pathHash).toBe(computePathHash('/tmp/invoice.pdf'));
3637
});
3738

38-
it('should update an existing document on save', async () => {
39+
it('should upsert by path (same file = update, not duplicate)', async () => {
3940
const mockDoc: DocumentData = {
4041
id: '123',
4142
filename: 'invoice.pdf',
@@ -47,22 +48,46 @@ describe('DocumentRepository', () => {
4748

4849
await repo.saveDocument(mockDoc, '/tmp/invoice.pdf');
4950

50-
// Update amount
51-
const updatedDoc = { ...mockDoc, amount: 200 };
51+
// Re-extract same file with new ID and updated data
52+
const updatedDoc = { ...mockDoc, id: '456', amount: 200 };
5253
await repo.saveDocument(updatedDoc, '/tmp/invoice.pdf');
5354

54-
const result = await repo.getDocument('123');
55-
expect(result?.data.amount).toBe(200);
55+
// Should have updated the existing record, not created a new one
56+
const list = await repo.listDocuments();
57+
expect(list).toHaveLength(1);
58+
expect(list[0].id).toBe('456'); // ID updated
59+
expect(list[0].data.amount).toBe(200); // Data updated
5660
});
5761

5862
it('should list all documents', async () => {
5963
const doc1 = { id: '1', filename: 'a.pdf', type: 'invoice' as const, extractedAt: new Date() };
6064
const doc2 = { id: '2', filename: 'b.pdf', type: 'receipt' as const, extractedAt: new Date() };
6165

62-
await repo.saveDocument(doc1, '/a');
63-
await repo.saveDocument(doc2, '/b');
66+
await repo.saveDocument(doc1, '/a.pdf');
67+
await repo.saveDocument(doc2, '/b.pdf');
6468

6569
const list = await repo.listDocuments();
6670
expect(list).toHaveLength(2);
6771
});
6872
});
73+
74+
// Unit tests for computePathHash: determinism, distinctness, and path resolution.
describe('computePathHash', () => {
75+
// Hashing the same absolute path twice must give the same value.
it('should return consistent hash for same absolute path', () => {
76+
const hash1 = computePathHash('/tmp/invoice.pdf');
77+
const hash2 = computePathHash('/tmp/invoice.pdf');
78+
expect(hash1).toBe(hash2);
79+
});
80+
81+
// Two different absolute paths must not hash to the same value.
it('should return different hash for different paths', () => {
82+
const hash1 = computePathHash('/tmp/a.pdf');
83+
const hash2 = computePathHash('/tmp/b.pdf');
84+
expect(hash1).not.toBe(hash2);
85+
});
86+
87+
it('should resolve relative paths to absolute', () => {
88+
// Same file, different ways of referring to it
// Both spellings are relative, so this only checks that they agree with each
// other; the hash value itself depends on the working directory of the test run.
89+
const hash1 = computePathHash('./test.pdf');
90+
const hash2 = computePathHash('test.pdf');
91+
expect(hash1).toBe(hash2);
92+
});
93+
});

packages/storage/src/index.ts

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import { createHash } from 'node:crypto';
2+
import { basename, resolve } from 'node:path';
13
import type { DocumentData } from '@doc-agent/core';
24
import { eq } from 'drizzle-orm';
35
import { createDb, type DbClient } from './db';
@@ -9,6 +11,14 @@ export { createDb, type DbClient, getDbPath } from './db';
911
// Re-export schema types
1012
export { type Document, documents, type NewDocument } from './schema';
1113

14+
/**
15+
* Compute SHA256 hash of a path for PII-safe storage
16+
*
* The input is first made absolute with `resolve()`, so different spellings of
* the same location (e.g. `./a.pdf` and `a.pdf`) yield the same hash.
*
* NOTE(review): the digest is unsalted, so anyone who can guess a path can
* confirm it by re-hashing. It hides the path from casual inspection only.
*
* @param filePath - Path to hash; may be relative or absolute.
* @returns Hex-encoded SHA-256 digest of the resolved absolute path.
*/
17+
export function computePathHash(filePath: string): string {
18+
// Relative input is anchored to the current working directory by resolve().
const absolutePath = resolve(filePath);
19+
return createHash('sha256').update(absolutePath).digest('hex');
20+
}
21+
1222
export class DocumentRepository {
1323
private db: DbClient;
1424

@@ -17,23 +27,27 @@ export class DocumentRepository {
1727
}
1828

1929
async saveDocument(docData: DocumentData, filePath: string): Promise<void> {
30+
const pathHash = computePathHash(filePath);
31+
const filename = basename(filePath);
32+
2033
const newDoc: NewDocument = {
2134
id: docData.id,
22-
path: filePath,
35+
pathHash,
36+
filename,
2337
status: 'pending',
2438
data: docData,
2539
createdAt: new Date(),
2640
};
2741

28-
// Upsert logic: if id exists, update data
42+
// Upsert logic: if same file path, update existing record
2943
await this.db
3044
.insert(documents)
3145
.values(newDoc)
3246
.onConflictDoUpdate({
33-
target: documents.id,
47+
target: documents.pathHash,
3448
set: {
49+
id: docData.id,
3550
data: docData,
36-
path: filePath,
3751
status: 'pending', // Reset status on update so it gets re-indexed
3852
},
3953
});
@@ -52,4 +66,22 @@ export class DocumentRepository {
5266
}
5367
}
5468

55-
export const storage = new DocumentRepository();
69+
// Lazy singleton - only initializes when first accessed
70+
// Holds the shared repository once created; stays null until getStorage() is called.
let _storage: DocumentRepository | null = null;
71+
72+
/**
 * Return the shared DocumentRepository, constructing it with no arguments on
 * the first call and reusing that same instance on every later call.
 *
 * @returns The module-wide DocumentRepository instance.
 */
export function getStorage(): DocumentRepository {
73+
if (!_storage) {
74+
_storage = new DocumentRepository();
75+
}
76+
return _storage;
77+
}
78+
79+
// Convenience alias for simple usage
// Each method forwards its arguments to the repository returned by getStorage()
// at call time, so importing `storage` alone does not construct the repository.
// Only saveDocument, getDocument and listDocuments are exposed here.
80+
export const storage = {
81+
saveDocument: (...args: Parameters<DocumentRepository['saveDocument']>) =>
82+
getStorage().saveDocument(...args),
83+
getDocument: (...args: Parameters<DocumentRepository['getDocument']>) =>
84+
getStorage().getDocument(...args),
85+
listDocuments: (...args: Parameters<DocumentRepository['listDocuments']>) =>
86+
getStorage().listDocuments(...args),
87+
};

0 commit comments

Comments
 (0)