Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,38 @@ The project exposes an MCP server (`packages/cli/src/mcp/server.ts`) that allows
- **Linting/Formatting**: Run `pnpm lint` and `pnpm format` (using Biome).
- **Commits**: Follow Conventional Commits (e.g., `feat(cli): add new command`).

## Security & Privacy

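This is a **privacy-first** tool. By default, all data stays local; the only exception is document content sent to a cloud AI provider that the user has explicitly configured. When implementing features that touch user data, follow these principles: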

### PII Considerations

Before storing or logging any data, ask:
1. **Does this contain PII?** (usernames, paths, emails, IPs)
2. **Is storage necessary?** Can we derive what we need without storing raw PII?
3. **What's the blast radius?** If this data leaks, what's exposed?

**Preferred patterns:**
| Instead of | Use |
|------------|-----|
| `/Users/john/docs/invoice.pdf` | SHA-256 hash of the absolute path, plus the bare filename (`invoice.pdf`) |
| Full error stack with paths | Sanitized error messages |
| Logging request bodies | Logging request metadata only |

### Data Locality

- All extracted document data stays in local SQLite
- No telemetry, no cloud sync, no external API calls (except AI providers)
- User controls their data completely

### AI Provider Data

When sending data to AI providers:
- Gemini/OpenAI: Data leaves the machine (the user accepts this by providing an API key)
- Ollama: Data stays local (default, privacy-first option)

Document these trade-offs clearly in user-facing docs.

## Adding New Features

1. **New AI Provider**: Update `packages/extract/src/index.ts` to implement the provider logic.
Expand Down
1 change: 1 addition & 0 deletions packages/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"dependencies": {
"@doc-agent/core": "workspace:*",
"@doc-agent/extract": "workspace:*",
"@doc-agent/storage": "workspace:*",
"@doc-agent/vector-store": "workspace:*",
"@google/generative-ai": "^0.24.1",
"@modelcontextprotocol/sdk": "^1.24.3",
Expand Down
17 changes: 11 additions & 6 deletions packages/cli/src/cli.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env node
import { exec } from 'node:child_process';
import { resolve } from 'node:path';
import { promisify } from 'node:util';
import type { Config } from '@doc-agent/core';
import { extractDocument } from '@doc-agent/extract';
import { storage } from '@doc-agent/storage';
import chalk from 'chalk';
import { Command } from 'commander';
import ora from 'ora';
Expand Down Expand Up @@ -51,6 +53,7 @@ program
'Model to use (default: llama3.2-vision for ollama)',
'llama3.2-vision'
)
.option('-d, --dry-run', 'Print JSON only, do not save to database', false)
.action(async (file: string, options) => {
try {
if (options.provider === 'ollama') {
Expand All @@ -68,14 +71,16 @@ program

const result = await extractDocument(file, config);

spinner.succeed(chalk.green('Extraction complete!'));
if (options.dryRun) {
spinner.succeed(chalk.green('Extraction complete (dry run)'));
} else {
const absolutePath = resolve(file);
await storage.saveDocument(result, absolutePath);
spinner.succeed(chalk.green(`Saved: ${result.filename} (ID: ${result.id})`));
}

console.log(JSON.stringify(result, null, 2));
} catch (error) {
// Only fail the spinner if it's running (ensureOllamaModel might have failed already)
if (ora().isSpinning) {
// This check is tricky because ora() creates a new instance.
// We'll just log the error.
}
console.error(chalk.red('\nExtraction failed:'));
console.error((error as Error).message);
process.exit(1);
Expand Down
7 changes: 6 additions & 1 deletion packages/cli/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist", "**/*.test.ts"],
"references": [{ "path": "../core" }, { "path": "../extract" }, { "path": "../vector-store" }],
"references": [
{ "path": "../core" },
{ "path": "../extract" },
{ "path": "../storage" },
{ "path": "../vector-store" }
],
"ts-node": {
"esm": true
}
Expand Down
8 changes: 0 additions & 8 deletions packages/storage/drizzle/0000_blue_legion.sql

This file was deleted.

11 changes: 11 additions & 0 deletions packages/storage/drizzle/0000_init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- Initial schema: a single `documents` table with one row per saved document.
-- `path_hash` stores a hash of the file path instead of the raw path; the
-- unique index at the bottom allows at most one row per path hash.
-- NOTE(review): the `created_at` default below is a fixed, quoted ISO timestamp
-- literal on an integer column, not a per-insert current time. It looks like a
-- value captured when this migration was generated; confirm this is intended.
CREATE TABLE `documents` (
`id` text PRIMARY KEY NOT NULL,
`path_hash` text NOT NULL,
`filename` text NOT NULL,
`content_hash` text,
`status` text DEFAULT 'pending' NOT NULL,
`data` text NOT NULL,
`created_at` integer DEFAULT '"2025-12-07T19:46:45.333Z"' NOT NULL
);
--> statement-breakpoint
CREATE UNIQUE INDEX `documents_path_hash_unique` ON `documents` (`path_hash`);
27 changes: 20 additions & 7 deletions packages/storage/drizzle/meta/0000_snapshot.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"version": "6",
"dialect": "sqlite",
"id": "6eba05ff-3086-4f83-bc86-506b17db1c03",
"id": "b3921c83-f822-4238-92be-3cba302553fe",
"prevId": "00000000-0000-0000-0000-000000000000",
"tables": {
"documents": {
Expand All @@ -14,15 +14,22 @@
"notNull": true,
"autoincrement": false
},
"path": {
"name": "path",
"path_hash": {
"name": "path_hash",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"hash": {
"name": "hash",
"filename": {
"name": "filename",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"content_hash": {
"name": "content_hash",
"type": "text",
"primaryKey": false,
"notNull": false,
Expand All @@ -49,10 +56,16 @@
"primaryKey": false,
"notNull": true,
"autoincrement": false,
"default": "'\"2025-12-07T11:19:23.284Z\"'"
"default": "'\"2025-12-07T19:46:45.333Z\"'"
}
},
"indexes": {
"documents_path_hash_unique": {
"name": "documents_path_hash_unique",
"columns": ["path_hash"],
"isUnique": true
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {},
Expand Down
4 changes: 2 additions & 2 deletions packages/storage/drizzle/meta/_journal.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
{
"idx": 0,
"version": "6",
"when": 1765106363288,
"tag": "0000_blue_legion",
"when": 1765136805336,
"tag": "0000_init",
"breakpoints": true
}
]
Expand Down
43 changes: 34 additions & 9 deletions packages/storage/src/__tests__/repository.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { DocumentData } from '@doc-agent/core';
import { beforeEach, describe, expect, it } from 'vitest';
import { createDb } from '../db';
import { DocumentRepository } from '../index';
import { computePathHash, DocumentRepository } from '../index';

describe('DocumentRepository', () => {
let repo: DocumentRepository;
Expand Down Expand Up @@ -32,10 +32,11 @@ describe('DocumentRepository', () => {
...mockDoc,
extractedAt: mockDoc.extractedAt.toISOString(),
});
expect(result?.path).toBe('/tmp/invoice.pdf');
expect(result?.filename).toBe('invoice.pdf');
expect(result?.pathHash).toBe(computePathHash('/tmp/invoice.pdf'));
});

it('should update an existing document on save', async () => {
it('should upsert by path (same file = update, not duplicate)', async () => {
const mockDoc: DocumentData = {
id: '123',
filename: 'invoice.pdf',
Expand All @@ -47,22 +48,46 @@ describe('DocumentRepository', () => {

await repo.saveDocument(mockDoc, '/tmp/invoice.pdf');

// Update amount
const updatedDoc = { ...mockDoc, amount: 200 };
// Re-extract same file with new ID and updated data
const updatedDoc = { ...mockDoc, id: '456', amount: 200 };
await repo.saveDocument(updatedDoc, '/tmp/invoice.pdf');

const result = await repo.getDocument('123');
expect(result?.data.amount).toBe(200);
// Should have updated the existing record, not created a new one
const list = await repo.listDocuments();
expect(list).toHaveLength(1);
expect(list[0].id).toBe('456'); // ID updated
expect(list[0].data.amount).toBe(200); // Data updated
});

it('should list all documents', async () => {
const doc1 = { id: '1', filename: 'a.pdf', type: 'invoice' as const, extractedAt: new Date() };
const doc2 = { id: '2', filename: 'b.pdf', type: 'receipt' as const, extractedAt: new Date() };

await repo.saveDocument(doc1, '/a');
await repo.saveDocument(doc2, '/b');
await repo.saveDocument(doc1, '/a.pdf');
await repo.saveDocument(doc2, '/b.pdf');

const list = await repo.listDocuments();
expect(list).toHaveLength(2);
});
});

describe('computePathHash', () => {
  it('should return consistent hash for same absolute path', () => {
    // Hashing the identical input twice must yield the identical digest.
    const samplePath = '/tmp/invoice.pdf';
    const firstDigest = computePathHash(samplePath);
    const secondDigest = computePathHash(samplePath);
    expect(firstDigest).toBe(secondDigest);
  });

  it('should return different hash for different paths', () => {
    // Two distinct absolute paths must not map to the same digest.
    const digestA = computePathHash('/tmp/a.pdf');
    const digestB = computePathHash('/tmp/b.pdf');
    expect(digestA).not.toBe(digestB);
  });

  it('should resolve relative paths to absolute', () => {
    // Both spellings point at the same file relative to the working directory.
    const dotPrefixedDigest = computePathHash('./test.pdf');
    const bareDigest = computePathHash('test.pdf');
    expect(dotPrefixedDigest).toBe(bareDigest);
  });
});
42 changes: 37 additions & 5 deletions packages/storage/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { createHash } from 'node:crypto';
import { basename, resolve } from 'node:path';
import type { DocumentData } from '@doc-agent/core';
import { eq } from 'drizzle-orm';
import { createDb, type DbClient } from './db';
Expand All @@ -9,6 +11,14 @@ export { createDb, type DbClient, getDbPath } from './db';
// Re-export schema types
export { type Document, documents, type NewDocument } from './schema';

/**
 * Derive a PII-safe identifier for a file location.
 *
 * The path is first resolved to an absolute path (relative inputs are resolved
 * against the current working directory), so different spellings of the same
 * location produce the same result.
 *
 * @param filePath - Absolute or relative path to the file.
 * @returns Hex-encoded SHA-256 digest of the resolved absolute path.
 */
export function computePathHash(filePath: string): string {
  const hasher = createHash('sha256');
  hasher.update(resolve(filePath));
  return hasher.digest('hex');
}

export class DocumentRepository {
private db: DbClient;

Expand All @@ -17,23 +27,27 @@ export class DocumentRepository {
}

async saveDocument(docData: DocumentData, filePath: string): Promise<void> {
const pathHash = computePathHash(filePath);
const filename = basename(filePath);

const newDoc: NewDocument = {
id: docData.id,
path: filePath,
pathHash,
filename,
status: 'pending',
data: docData,
createdAt: new Date(),
};

// Upsert logic: if id exists, update data
// Upsert logic: if same file path, update existing record
await this.db
.insert(documents)
.values(newDoc)
.onConflictDoUpdate({
target: documents.id,
target: documents.pathHash,
set: {
id: docData.id,
data: docData,
path: filePath,
status: 'pending', // Reset status on update so it gets re-indexed
},
});
Expand All @@ -52,4 +66,22 @@ export class DocumentRepository {
}
}

export const storage = new DocumentRepository();
// Shared repository instance; stays null until getStorage() is first called.
let _storage: DocumentRepository | null = null;

/**
 * Return the shared DocumentRepository, constructing it on the first call and
 * reusing that same instance on every later call.
 */
export function getStorage(): DocumentRepository {
  _storage ??= new DocumentRepository();
  return _storage;
}

// Convenience facade: each method forwards its arguments to the lazily-created
// repository returned by getStorage(), so nothing is constructed at import time.
export const storage = {
  saveDocument(...args: Parameters<DocumentRepository['saveDocument']>) {
    return getStorage().saveDocument(...args);
  },
  getDocument(...args: Parameters<DocumentRepository['getDocument']>) {
    return getStorage().getDocument(...args);
  },
  listDocuments(...args: Parameters<DocumentRepository['listDocuments']>) {
    return getStorage().listDocuments(...args);
  },
};
5 changes: 3 additions & 2 deletions packages/storage/src/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ import { integer, sqliteTable, text } from 'drizzle-orm/sqlite-core';

export const documents = sqliteTable('documents', {
id: text('id').primaryKey(),
path: text('path').notNull(),
hash: text('hash'),
pathHash: text('path_hash').notNull().unique(),
filename: text('filename').notNull(),
contentHash: text('content_hash'),
status: text('status', { enum: ['pending', 'indexed', 'failed'] })
.notNull()
.default('pending'),
Expand Down
3 changes: 3 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
"@doc-agent/core/*": ["packages/core/src/*"],
"@doc-agent/extract": ["packages/extract/src"],
"@doc-agent/extract/*": ["packages/extract/src/*"],
"@doc-agent/storage": ["packages/storage/src"],
"@doc-agent/storage/*": ["packages/storage/src/*"],
"@doc-agent/vector-store": ["packages/vector-store/src"],
"@doc-agent/vector-store/*": ["packages/vector-store/src/*"]
}
Expand Down