|
use anyhow::{Context, Result};
use clap::Parser;
use harald::ingest::embed;
use harald::ingest::{run_with_config, IngestConfig};
use reqwest::Client;
use serde_json::Value;
use std::fs;
use std::path::{Path, PathBuf};
4 | 9 |
|
// Command-line arguments for the MarvelAI ingest tool.
//
// NOTE: the `///` doc comments on each field double as clap's --help
// text, so they are part of the program's runtime output — do not
// rewrite them casually.
#[derive(Parser, Debug)]
#[command(author, version, about = "MarvelAI Marvel Ingest Tool", long_about = None)]
struct Args {
    /// Path to the MarvelAIs.json file
    #[arg(
        short,
        long,
        default_value = "personality-archetypes/pop-culture/marvel/MarvelAIs.json"
    )]
    input: PathBuf,

    /// Maximum characters to read per file
    // Forwarded to IngestConfig::max_chars.
    #[arg(long, default_value_t = 800)]
    max_chars: usize,

    /// Maximum tokens for embedding requests
    // Forwarded to IngestConfig::max_tokens and to the API probe.
    #[arg(long, default_value_t = 600)]
    max_tokens: usize,

    /// Maximum number of files to process concurrently
    // None defers the concurrency choice to run_with_config —
    // TODO(review): confirm what default that applies.
    #[arg(long)]
    max_concurrent_files: Option<usize>,
}
21 | 33 |
|
22 | 34 | #[tokio::main] |
23 | | -async fn main() { |
| 35 | +async fn main() -> Result<()> { |
24 | 36 | let args = Args::parse(); |
| 37 | + |
| 38 | + println!("=================================================="); |
| 39 | + println!("🚀 HARALD MARVELAI INGEST (Rust)"); |
| 40 | + println!("🔍 Processing MarvelAIs.json using JSONL format"); |
| 41 | + println!("=================================================="); |
| 42 | + |
| 43 | + // Verify input file exists |
| 44 | + if !args.input.exists() { |
| 45 | + eprintln!("❌ Input file not found: {}", args.input.display()); |
| 46 | + std::process::exit(1); |
| 47 | + } |
| 48 | + |
| 49 | + // Test embedding API first - exit early if it fails |
| 50 | + let client = Client::builder() |
| 51 | + .timeout(std::time::Duration::from_secs(10)) // Reduced from 30 to 10 seconds |
| 52 | + .build() |
| 53 | + .context("Failed to create HTTP client")?; |
| 54 | + |
| 55 | + println!("Testing embedding API with model harald-phi4"); |
| 56 | + |
| 57 | + // Test with a simple string first |
| 58 | + match test_embedding_api(&client, args.max_tokens).await { |
| 59 | + Ok(_) => println!(" ✅ Embedding API test successful"), |
| 60 | + Err(e) => { |
| 61 | + eprintln!(" ❌ Embedding API test failed: {}", e); |
| 62 | + eprintln!( |
| 63 | + "❌ Cannot proceed without working embedding API. Please check Ollama is running." |
| 64 | + ); |
| 65 | + std::process::exit(1); |
| 66 | + } |
| 67 | + } |
| 68 | + |
| 69 | + // Convert JSON to JSONL if needed |
| 70 | + let jsonl_path = prepare_jsonl_input(&args.input)?; |
| 71 | + |
| 72 | + // Create a temporary directory for processing |
| 73 | + let temp_dir = tempfile::TempDir::new().context("Failed to create temporary directory")?; |
| 74 | + |
| 75 | + // Copy JSONL to temp directory |
| 76 | + let temp_jsonl = temp_dir.path().join("MarvelAIs.jsonl"); |
| 77 | + fs::copy(&jsonl_path, &temp_jsonl).context("Failed to copy JSONL to temp directory")?; |
| 78 | + |
| 79 | + // Configure ingestion to use the temp directory |
25 | 80 | let config = IngestConfig { |
26 | | - root_dir: args.root_dir, |
| 81 | + root_dir: temp_dir.path().to_path_buf(), |
27 | 82 | max_chars: args.max_chars, |
28 | 83 | max_tokens: args.max_tokens, |
29 | 84 | max_concurrent_files: args.max_concurrent_files, |
30 | 85 | }; |
| 86 | + |
| 87 | + // Run the standard harald_ingest logic |
31 | 88 | match run_with_config(config).await { |
32 | | - Ok(_) => println!("MarvelAI ingest completed successfully."), |
33 | | - Err(e) => eprintln!("MarvelAI ingest failed: {}", e), |
| 89 | + Ok(stats) => { |
| 90 | + println!("✅ MarvelAI ingest completed successfully!"); |
| 91 | + println!("📁 Processed: {} files", stats.files_processed); |
| 92 | + println!("⏭️ Skipped: {} files", stats.files_skipped); |
| 93 | + println!("💾 Output: {}", stats.output_dir.display()); |
| 94 | + } |
| 95 | + Err(e) => { |
| 96 | + eprintln!("❌ MarvelAI ingest failed: {}", e); |
| 97 | + std::process::exit(1); |
| 98 | + } |
| 99 | + } |
| 100 | + |
| 101 | + Ok(()) |
| 102 | +} |
| 103 | + |
| 104 | +/// Test the embedding API with a simple request to ensure it's working |
| 105 | +async fn test_embedding_api(client: &Client, max_tokens: usize) -> Result<()> { |
| 106 | + let test_text = "test"; |
| 107 | + |
| 108 | + // Create a fast-fail config for testing - using localhost endpoint as recommended |
| 109 | + let test_config = embed::EmbedConfig { |
| 110 | + model: "harald-phi4".to_string(), |
| 111 | + endpoint: "http://localhost:11434/api/embeddings".to_string(), |
| 112 | + timeout_secs: 15, // Longer timeout to account for model loading |
| 113 | + max_retries: 2, // Allow 2 attempts for initial API warmup |
| 114 | + }; |
| 115 | + |
| 116 | + println!(" Testing: embedding '{}' (using {})", test_text, test_config.endpoint); |
| 117 | + println!(" Model warmup may take a moment on first request..."); |
| 118 | + |
| 119 | + match embed::embed_with_config(test_text, max_tokens, client, test_config).await { |
| 120 | + Ok(embedding) => { |
| 121 | + if embedding.is_empty() { |
| 122 | + return Err(anyhow::anyhow!("Received empty embedding vector")); |
| 123 | + } |
| 124 | + println!(" ✅ Embedding vectors received successfully ({} dimensions)", embedding.len()); |
| 125 | + Ok(()) |
| 126 | + } |
| 127 | + Err(e) => { |
| 128 | + println!(" ❌ Request failed: {}", e); |
| 129 | + |
| 130 | + // Provide helpful debugging information |
| 131 | + eprintln!(" 💡 Troubleshooting tips:"); |
| 132 | + eprintln!(" • Ensure 'ollama serve' is running in a terminal"); |
| 133 | + eprintln!(" • Verify harald-phi4 model is available: ollama list"); |
| 134 | + eprintln!(" • Check API endpoint: curl http://localhost:11434/api/version"); |
| 135 | + |
| 136 | + Err(anyhow::anyhow!( |
| 137 | + "Failed to generate embeddings with harald-phi4 model (fast test failed)" |
| 138 | + )) |
| 139 | + } |
| 140 | + } |
| 141 | +} |
| 142 | + |
| 143 | +/// Prepare JSONL input file from the MarvelAIs.json file |
| 144 | +fn prepare_jsonl_input(input_path: &PathBuf) -> Result<PathBuf> { |
| 145 | + // If it's already JSONL, return as-is |
| 146 | + if input_path.extension().and_then(|s| s.to_str()) == Some("jsonl") { |
| 147 | + return Ok(input_path.clone()); |
34 | 148 | } |
| 149 | + |
| 150 | + // Read and parse JSON file |
| 151 | + let json_content = fs::read_to_string(input_path) |
| 152 | + .with_context(|| format!("Failed to read JSON file: {}", input_path.display()))?; |
| 153 | + |
| 154 | + let json_value: Value = serde_json::from_str(&json_content) |
| 155 | + .with_context(|| format!("Failed to parse JSON file: {}", input_path.display()))?; |
| 156 | + |
| 157 | + // Create JSONL output path |
| 158 | + let mut jsonl_path = input_path.clone(); |
| 159 | + jsonl_path.set_extension("jsonl"); |
| 160 | + |
| 161 | + // Convert to JSONL |
| 162 | + let jsonl_content = match json_value { |
| 163 | + Value::Array(items) => { |
| 164 | + // Array of objects - convert each to a line |
| 165 | + items |
| 166 | + .iter() |
| 167 | + .map(|item| serde_json::to_string(item)) |
| 168 | + .collect::<Result<Vec<_>, _>>() |
| 169 | + .context("Failed to serialize JSON items")? |
| 170 | + .join("\n") |
| 171 | + } |
| 172 | + _ => { |
| 173 | + // Single object - just one line |
| 174 | + serde_json::to_string(&json_value).context("Failed to serialize JSON object")? |
| 175 | + } |
| 176 | + }; |
| 177 | + |
| 178 | + // Write JSONL file |
| 179 | + fs::write(&jsonl_path, &jsonl_content) |
| 180 | + .with_context(|| format!("Failed to write JSONL file: {}", jsonl_path.display()))?; |
| 181 | + |
| 182 | + let line_count = jsonl_content.lines().count(); |
| 183 | + println!( |
| 184 | + "Converting \"{}\" to JSONL at \"{}\"", |
| 185 | + input_path.display(), |
| 186 | + jsonl_path.display() |
| 187 | + ); |
| 188 | + println!("✅ JSONL conversion complete: {} lines", line_count); |
| 189 | + |
| 190 | + Ok(jsonl_path) |
35 | 191 | } |
0 commit comments