import { resolve } from 'node:path';
import type { Config } from '@doc-agent/core';
import { extractDocument, type StreamChunk } from '@doc-agent/extract';
import { storage } from '@doc-agent/storage';
import kero from '@lytics/kero';
import chalk from 'chalk';
import ora from 'ora';
import prompts from 'prompts';
import {
  checkModelExists,
  checkOllamaInstalled,
  checkOllamaRunning,
  installOllama,
  type PullProgress,
  pullModel,
  startOllama,
  waitForOllama,
} from '../services/ollama';

const logger = kero.createLogger({
  level:
    (process.env.LOG_LEVEL as 'trace' | 'debug' | 'info' | 'warn' | 'error' | 'fatal') || 'info',
});

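/**
 * Options for the extract command.
 * Example call (values are illustrative only, not taken from this codebase):
 *   runExtract('./statement.pdf', { provider: 'ollama', model: 'llama3.2', dryRun: false })
 */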
export interface ExtractOptions {
  provider: 'gemini' | 'openai' | 'ollama';
  model: string;
  dryRun: boolean;
}

/**
 * Format bytes to human readable string
 */
function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
  if (bytes < 1024 * 1024 * 1024) return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
  return `${(bytes / 1024 / 1024 / 1024).toFixed(1)} GB`;
}

/**
 * Ensure Ollama is ready (installed, running, model pulled)
 */
async function ensureOllamaReady(model: string): Promise<boolean> {
  const spinner = ora();

  // Check if Ollama is installed
  spinner.start('Checking Ollama installation...');
  const isInstalled = await checkOllamaInstalled();

  if (!isInstalled) {
    spinner.stop();
    logger.info('Ollama not installed');

    const { install } = await prompts({
      type: 'confirm',
      name: 'install',
      message: 'Ollama is not installed. Install via Homebrew?',
      initial: true,
    });
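    // If the prompt is cancelled (Ctrl+C), prompts resolves with an empty answer object,
    // so `install` stays undefined and we fall through to the manual-install hint below.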

    if (!install) {
      console.log(chalk.yellow('Please install Ollama manually: https://ollama.com/download'));
      return false;
    }

    spinner.start('Installing Ollama via Homebrew...');
    try {
      await installOllama((msg) => {
        spinner.text = msg;
      });
      spinner.succeed('Ollama installed');
      logger.info('Ollama installed successfully');
    } catch (error) {
      spinner.fail('Failed to install Ollama');
      logger.error({ error: String(error) }, 'Ollama installation failed');
      console.error(chalk.red(String(error)));
      return false;
    }
  } else {
    spinner.succeed('Ollama installed');
  }

  // Check if Ollama is running
  spinner.start('Checking Ollama server...');
  let isRunning = await checkOllamaRunning();

  if (!isRunning) {
    spinner.stop();
    logger.info('Ollama server not running');

    const { start } = await prompts({
      type: 'confirm',
      name: 'start',
      message: 'Ollama server is not running. Start it?',
      initial: true,
    });

    if (!start) {
      console.log(chalk.yellow('Please start Ollama: ollama serve'));
      return false;
    }

    spinner.start('Starting Ollama server...');
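    // startOllama() is not awaited; instead, waitForOllama(15000) below polls the server
    // until it responds or the timeout elapses, and reports whether it came up in time.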
    startOllama();

    isRunning = await waitForOllama(15000);
    if (!isRunning) {
      spinner.fail('Failed to start Ollama server');
      logger.error('Ollama server failed to start');
      return false;
    }
    spinner.succeed('Ollama server started');
    logger.info('Ollama server started');
  } else {
    spinner.succeed('Ollama server running');
  }

  // Check if model exists
  spinner.start(`Checking model: ${model}...`);
  const modelExists = await checkModelExists(model);

  if (!modelExists) {
    spinner.text = `Pulling model: ${model}...`;
    logger.info({ model }, 'Pulling model');

    try {
      await pullModel(model, (progress: PullProgress) => {
        if (progress.total && progress.completed) {
          const pct = Math.round((progress.completed / progress.total) * 100);
          spinner.text = `Pulling ${model}: ${pct}% (${formatBytes(progress.completed)}/${formatBytes(progress.total)})`;
        } else if (progress.status) {
          spinner.text = `${model}: ${progress.status}`;
        }
      });
      spinner.succeed(`Model ready: ${model}`);
      logger.info({ model }, 'Model pulled successfully');
    } catch (error) {
      spinner.fail(`Failed to pull model: ${model}`);
      logger.error({ model, error: String(error) }, 'Model pull failed');
      return false;
    }
  } else {
    spinner.succeed(`Model ready: ${model}`);
  }

  return true;
}

/**
 * Run document extraction
 */
export async function runExtract(file: string, options: ExtractOptions): Promise<void> {
  const { provider, model, dryRun } = options;
  const absolutePath = resolve(file);

  logger.info({ file: absolutePath, provider, model, dryRun }, 'Starting extraction');

  // For Ollama, ensure everything is ready
  if (provider === 'ollama') {
    const ready = await ensureOllamaReady(model);
    if (!ready) {
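      // Setting process.exitCode (rather than calling process.exit) lets the process
      // exit with an error status after pending output has flushed.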
      process.exitCode = 1;
      return;
    }
  }

  // Build config
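  // API keys come from the environment; only the selected provider's model field is
  // filled in, the others are left undefined.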
  const config: Config = {
    aiProvider: provider,
    geminiApiKey: process.env.GEMINI_API_KEY,
    geminiModel: provider === 'gemini' ? model : undefined,
    openaiApiKey: process.env.OPENAI_API_KEY,
    ollamaModel: provider === 'ollama' ? model : undefined,
  };

  // Run extraction
  const spinner = ora('Extracting document data...').start();
  let lastPrompt = '';
  let responseBuffer = '';

  try {
    const result = await extractDocument(absolutePath, config, {
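      // Stream chunks come in three shapes: 'log' entries are forwarded to the logger,
      // 'prompt' chunks carry the text sent to the model (logged at debug level), and
      // 'response' chunks accumulate the model output for progress display.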
      onStream: (chunk: StreamChunk) => {
        if (!chunk) return;

        if (chunk.type === 'log') {
          // Log via kero - use simple string logging to avoid issues
          const msg = `${chunk.message}${chunk.data ? ` ${JSON.stringify(chunk.data)}` : ''}`;
          if (chunk.level === 'error') {
            logger.error(msg);
          } else if (chunk.level === 'warn') {
            logger.warn(msg);
          } else if (chunk.level === 'debug') {
            logger.debug(msg);
          } else {
            logger.info(msg);
          }

          // Update spinner for info logs
          if (chunk.level === 'info') {
            spinner.text = chunk.message;
          }
        } else if (chunk.type === 'prompt') {
          lastPrompt = chunk.content;
          // Show OCR progress in spinner
          if (chunk.content.includes('OCR')) {
            spinner.text = chunk.content.split('\n')[0];
          }
          // Log full prompt at debug level (only for system/user prompts, not OCR progress)
          if (chunk.content.includes('System:') || chunk.content.includes('User:')) {
            logger.debug(`Prompt to model:\n${chunk.content}`);
          }
        } else if (chunk.type === 'response') {
          responseBuffer += chunk.content;
          // Show that we're receiving response
          spinner.text = `Receiving response... (${responseBuffer.length} chars)`;
        }
      },
    });

    spinner.succeed('Extraction complete');
    logger.info(
      { type: result.type, itemCount: result.items?.length ?? 0 },
      'Extraction successful'
    );

    // Save to database (unless dry run)
    if (!dryRun) {
      const saveSpinner = ora('Saving to database...').start();
      try {
        await storage.saveDocument(result, absolutePath);
        saveSpinner.succeed(`Saved: ${result.filename} (ID: ${result.id})`);
        logger.info({ id: result.id, filename: result.filename }, 'Document saved');
      } catch (error) {
        saveSpinner.fail('Failed to save');
        logger.error({ error: String(error) }, 'Save failed');
        throw error;
      }
    } else {
      console.log(chalk.gray('(dry run - not saved to database)'));
    }

    // Print result
    console.log(chalk.gray('─'.repeat(40)));
    console.log(JSON.stringify(result, null, 2));
  } catch (error) {
    spinner.fail('Extraction failed');
    logger.error({ error: String(error) }, 'Extraction failed');

    // Show the prompt for debugging if available
    if (lastPrompt) {
      console.log(chalk.gray('\n─── Last Prompt ───'));
      console.log(chalk.gray(lastPrompt.slice(-500))); // Last 500 chars
    }

    console.error(chalk.red(String(error)));
    process.exitCode = 1;
  }
}
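
// A minimal sketch of how this command might be wired into a commander-based CLI entry
// point. The flag names, defaults, and the use of commander itself are assumptions for
// illustration, not taken from this repository:
//
//   program
//     .command('extract <file>')
//     .option('--provider <provider>', 'gemini | openai | ollama', 'gemini')
//     .option('--model <model>', 'model to use for extraction')
//     .option('--dry-run', 'extract without saving to the database', false)
//     .action((file, opts) =>
//       runExtract(file, { provider: opts.provider, model: opts.model, dryRun: opts.dryRun })
//     );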