diff --git a/AGENTS.md b/AGENTS.md index c2757cb..3f00680 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -105,6 +105,38 @@ The project exposes an MCP server (`packages/cli/src/mcp/server.ts`) that allows - **Linting/Formatting**: Run `pnpm lint` and `pnpm format` (using Biome). - **Commits**: Follow Conventional Commits (e.g., `feat(cli): add new command`). +## Security & Privacy + +This is a **privacy-first** tool. All data stays local. When implementing features that touch user data, follow these principles: + +### PII Considerations + +Before storing or logging any data, ask: +1. **Does this contain PII?** (usernames, paths, emails, IPs) +2. **Is storage necessary?** Can we derive what we need without storing raw PII? +3. **What's the blast radius?** If this data leaks, what's exposed? + +**Preferred patterns:** +| Instead of | Use | +|------------|-----| +| `/Users/john/docs/invoice.pdf` | Hash of path + filename only | +| Full error stack with paths | Sanitized error messages | +| Logging request bodies | Logging request metadata only | + +### Data Locality + +- All extracted document data stays in local SQLite +- No telemetry, no cloud sync, no external API calls (except AI providers) +- User controls their data completely + +### AI Provider Data + +When sending data to AI providers: +- Gemini/OpenAI: Data leaves machine (user accepts this by providing API key) +- Ollama: Data stays local (default, privacy-first option) + +Document these trade-offs clearly in user-facing docs. + ## Adding New Features 1. **New AI Provider**: Update `packages/extract/src/index.ts` to implement the provider logic. 
diff --git a/packages/cli/package.json b/packages/cli/package.json index 6c2a06e..3c8e8d6 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -17,6 +17,7 @@ "dependencies": { "@doc-agent/core": "workspace:*", "@doc-agent/extract": "workspace:*", + "@doc-agent/storage": "workspace:*", "@doc-agent/vector-store": "workspace:*", "@google/generative-ai": "^0.24.1", "@modelcontextprotocol/sdk": "^1.24.3", diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index 05b6313..f7ca549 100644 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -1,8 +1,10 @@ #!/usr/bin/env node import { exec } from 'node:child_process'; +import { resolve } from 'node:path'; import { promisify } from 'node:util'; import type { Config } from '@doc-agent/core'; import { extractDocument } from '@doc-agent/extract'; +import { storage } from '@doc-agent/storage'; import chalk from 'chalk'; import { Command } from 'commander'; import ora from 'ora'; @@ -51,6 +53,7 @@ program 'Model to use (default: llama3.2-vision for ollama)', 'llama3.2-vision' ) + .option('-d, --dry-run', 'Print JSON only, do not save to database', false) .action(async (file: string, options) => { try { if (options.provider === 'ollama') { @@ -68,14 +71,16 @@ program const result = await extractDocument(file, config); - spinner.succeed(chalk.green('Extraction complete!')); + if (options.dryRun) { + spinner.succeed(chalk.green('Extraction complete (dry run)')); + } else { + const absolutePath = resolve(file); + await storage.saveDocument(result, absolutePath); + spinner.succeed(chalk.green(`Saved: ${result.filename} (ID: ${result.id})`)); + } + console.log(JSON.stringify(result, null, 2)); } catch (error) { - // Only fail the spinner if it's running (ensureOllamaModel might have failed already) - if (ora().isSpinning) { - // This check is tricky because ora() creates a new instance. - // We'll just log the error. 
- } console.error(chalk.red('\nExtraction failed:')); console.error((error as Error).message); process.exit(1); diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index a341f5b..9f30449 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -6,7 +6,12 @@ }, "include": ["src/**/*"], "exclude": ["node_modules", "dist", "**/*.test.ts"], - "references": [{ "path": "../core" }, { "path": "../extract" }, { "path": "../vector-store" }], + "references": [ + { "path": "../core" }, + { "path": "../extract" }, + { "path": "../storage" }, + { "path": "../vector-store" } + ], "ts-node": { "esm": true } diff --git a/packages/storage/drizzle/0000_blue_legion.sql b/packages/storage/drizzle/0000_blue_legion.sql deleted file mode 100644 index 75fcbf0..0000000 --- a/packages/storage/drizzle/0000_blue_legion.sql +++ /dev/null @@ -1,8 +0,0 @@ -CREATE TABLE `documents` ( - `id` text PRIMARY KEY NOT NULL, - `path` text NOT NULL, - `hash` text, - `status` text DEFAULT 'pending' NOT NULL, - `data` text NOT NULL, - `created_at` integer DEFAULT '"2025-12-07T11:19:23.284Z"' NOT NULL -); diff --git a/packages/storage/drizzle/0000_init.sql b/packages/storage/drizzle/0000_init.sql new file mode 100644 index 0000000..bf05586 --- /dev/null +++ b/packages/storage/drizzle/0000_init.sql @@ -0,0 +1,11 @@ +CREATE TABLE `documents` ( + `id` text PRIMARY KEY NOT NULL, + `path_hash` text NOT NULL, + `filename` text NOT NULL, + `content_hash` text, + `status` text DEFAULT 'pending' NOT NULL, + `data` text NOT NULL, + `created_at` integer DEFAULT '"2025-12-07T19:46:45.333Z"' NOT NULL +); +--> statement-breakpoint +CREATE UNIQUE INDEX `documents_path_hash_unique` ON `documents` (`path_hash`); \ No newline at end of file diff --git a/packages/storage/drizzle/meta/0000_snapshot.json b/packages/storage/drizzle/meta/0000_snapshot.json index f53190b..ccb3bdd 100644 --- a/packages/storage/drizzle/meta/0000_snapshot.json +++ b/packages/storage/drizzle/meta/0000_snapshot.json 
@@ -1,7 +1,7 @@ { "version": "6", "dialect": "sqlite", - "id": "6eba05ff-3086-4f83-bc86-506b17db1c03", + "id": "b3921c83-f822-4238-92be-3cba302553fe", "prevId": "00000000-0000-0000-0000-000000000000", "tables": { "documents": { @@ -14,15 +14,22 @@ "notNull": true, "autoincrement": false }, - "path": { - "name": "path", + "path_hash": { + "name": "path_hash", "type": "text", "primaryKey": false, "notNull": true, "autoincrement": false }, - "hash": { - "name": "hash", + "filename": { + "name": "filename", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "content_hash": { + "name": "content_hash", "type": "text", "primaryKey": false, "notNull": false, @@ -49,10 +56,16 @@ "primaryKey": false, "notNull": true, "autoincrement": false, - "default": "'\"2025-12-07T11:19:23.284Z\"'" + "default": "'\"2025-12-07T19:46:45.333Z\"'" + } + }, + "indexes": { + "documents_path_hash_unique": { + "name": "documents_path_hash_unique", + "columns": ["path_hash"], + "isUnique": true } }, - "indexes": {}, "foreignKeys": {}, "compositePrimaryKeys": {}, "uniqueConstraints": {}, diff --git a/packages/storage/drizzle/meta/_journal.json b/packages/storage/drizzle/meta/_journal.json index 08c3ec2..a051bf8 100644 --- a/packages/storage/drizzle/meta/_journal.json +++ b/packages/storage/drizzle/meta/_journal.json @@ -5,8 +5,8 @@ { "idx": 0, "version": "6", - "when": 1765106363288, - "tag": "0000_blue_legion", + "when": 1765136805336, + "tag": "0000_init", "breakpoints": true } ] diff --git a/packages/storage/src/__tests__/repository.test.ts b/packages/storage/src/__tests__/repository.test.ts index 8f6939f..27dec3c 100644 --- a/packages/storage/src/__tests__/repository.test.ts +++ b/packages/storage/src/__tests__/repository.test.ts @@ -1,7 +1,7 @@ import type { DocumentData } from '@doc-agent/core'; import { beforeEach, describe, expect, it } from 'vitest'; import { createDb } from '../db'; -import { DocumentRepository } from '../index'; +import { 
computePathHash, DocumentRepository } from '../index'; describe('DocumentRepository', () => { let repo: DocumentRepository; @@ -32,10 +32,11 @@ describe('DocumentRepository', () => { ...mockDoc, extractedAt: mockDoc.extractedAt.toISOString(), }); - expect(result?.path).toBe('/tmp/invoice.pdf'); + expect(result?.filename).toBe('invoice.pdf'); + expect(result?.pathHash).toBe(computePathHash('/tmp/invoice.pdf')); }); - it('should update an existing document on save', async () => { + it('should upsert by path (same file = update, not duplicate)', async () => { const mockDoc: DocumentData = { id: '123', filename: 'invoice.pdf', @@ -47,22 +48,46 @@ describe('DocumentRepository', () => { await repo.saveDocument(mockDoc, '/tmp/invoice.pdf'); - // Update amount - const updatedDoc = { ...mockDoc, amount: 200 }; + // Re-extract same file with new ID and updated data + const updatedDoc = { ...mockDoc, id: '456', amount: 200 }; await repo.saveDocument(updatedDoc, '/tmp/invoice.pdf'); - const result = await repo.getDocument('123'); - expect(result?.data.amount).toBe(200); + // Should have updated the existing record, not created a new one + const list = await repo.listDocuments(); + expect(list).toHaveLength(1); + expect(list[0].id).toBe('456'); // ID updated + expect(list[0].data.amount).toBe(200); // Data updated }); it('should list all documents', async () => { const doc1 = { id: '1', filename: 'a.pdf', type: 'invoice' as const, extractedAt: new Date() }; const doc2 = { id: '2', filename: 'b.pdf', type: 'receipt' as const, extractedAt: new Date() }; - await repo.saveDocument(doc1, '/a'); - await repo.saveDocument(doc2, '/b'); + await repo.saveDocument(doc1, '/a.pdf'); + await repo.saveDocument(doc2, '/b.pdf'); const list = await repo.listDocuments(); expect(list).toHaveLength(2); }); }); + +describe('computePathHash', () => { + it('should return consistent hash for same absolute path', () => { + const hash1 = computePathHash('/tmp/invoice.pdf'); + const hash2 = 
computePathHash('/tmp/invoice.pdf'); + expect(hash1).toBe(hash2); + }); + + it('should return different hash for different paths', () => { + const hash1 = computePathHash('/tmp/a.pdf'); + const hash2 = computePathHash('/tmp/b.pdf'); + expect(hash1).not.toBe(hash2); + }); + + it('should resolve relative paths to absolute', () => { + // Same file, different ways of referring to it + const hash1 = computePathHash('./test.pdf'); + const hash2 = computePathHash('test.pdf'); + expect(hash1).toBe(hash2); + }); +}); diff --git a/packages/storage/src/index.ts b/packages/storage/src/index.ts index 71e0720..6b7d7f0 100644 --- a/packages/storage/src/index.ts +++ b/packages/storage/src/index.ts @@ -1,3 +1,5 @@ +import { createHash } from 'node:crypto'; +import { basename, resolve } from 'node:path'; import type { DocumentData } from '@doc-agent/core'; import { eq } from 'drizzle-orm'; import { createDb, type DbClient } from './db'; @@ -9,6 +11,14 @@ export { createDb, type DbClient, getDbPath } from './db'; // Re-export schema types export { type Document, documents, type NewDocument } from './schema'; +/** + * Compute SHA256 hash of a path for PII-safe storage + */ +export function computePathHash(filePath: string): string { + const absolutePath = resolve(filePath); + return createHash('sha256').update(absolutePath).digest('hex'); +} + export class DocumentRepository { private db: DbClient; @@ -17,23 +27,27 @@ } async saveDocument(docData: DocumentData, filePath: string): Promise<void> { + const pathHash = computePathHash(filePath); + const filename = basename(filePath); + const newDoc: NewDocument = { id: docData.id, - path: filePath, + pathHash, + filename, status: 'pending', data: docData, createdAt: new Date(), }; - // Upsert logic: if id exists, update data + // Upsert logic: if same file path, update existing record await this.db .insert(documents) .values(newDoc) .onConflictDoUpdate({ - target: documents.id, + target: documents.pathHash, set: 
{ + id: docData.id, data: docData, - path: filePath, status: 'pending', // Reset status on update so it gets re-indexed }, }); @@ -52,4 +66,22 @@ } } -export const storage = new DocumentRepository(); +// Lazy singleton - only initializes when first accessed +let _storage: DocumentRepository | null = null; + +export function getStorage(): DocumentRepository { + if (!_storage) { + _storage = new DocumentRepository(); + } + return _storage; +} + +// Convenience alias for simple usage +export const storage = { + saveDocument: (...args: Parameters<DocumentRepository['saveDocument']>) => + getStorage().saveDocument(...args), + getDocument: (...args: Parameters<DocumentRepository['getDocument']>) => + getStorage().getDocument(...args), + listDocuments: (...args: Parameters<DocumentRepository['listDocuments']>) => + getStorage().listDocuments(...args), +}; diff --git a/packages/storage/src/schema.ts b/packages/storage/src/schema.ts index 59d387a..48ce17c 100644 --- a/packages/storage/src/schema.ts +++ b/packages/storage/src/schema.ts @@ -3,8 +3,9 @@ import { integer, sqliteTable, text } from 'drizzle-orm/sqlite-core'; export const documents = sqliteTable('documents', { id: text('id').primaryKey(), - path: text('path').notNull(), - hash: text('hash'), + pathHash: text('path_hash').notNull().unique(), + filename: text('filename').notNull(), + contentHash: text('content_hash'), status: text('status', { enum: ['pending', 'indexed', 'failed'] }) .notNull() .default('pending'), diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 424206c..3dcca75 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -41,6 +41,9 @@ importers: '@doc-agent/extract': specifier: workspace:* version: link:../extract + '@doc-agent/storage': + specifier: workspace:* + version: link:../storage '@doc-agent/vector-store': specifier: workspace:* version: link:../vector-store diff --git a/tsconfig.json b/tsconfig.json index 8c14324..8f35288 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -17,6 +17,8 @@ "@doc-agent/core/*": ["packages/core/src/*"], "@doc-agent/extract": 
["packages/extract/src"], "@doc-agent/extract/*": ["packages/extract/src/*"], + "@doc-agent/storage": ["packages/storage/src"], + "@doc-agent/storage/*": ["packages/storage/src/*"], "@doc-agent/vector-store": ["packages/vector-store/src"], "@doc-agent/vector-store/*": ["packages/vector-store/src/*"] }