Skip to content

Commit b9d9507

Browse files
committed
Enhance embedding processing: support multiple embeddings per file, update indexing logic, and improve debug output in ingestion pipeline
1 parent 2580af4 commit b9d9507

File tree

2 files changed

+26
-11
lines changed

2 files changed

+26
-11
lines changed

rust_ingest/src/ingest.rs

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -305,18 +305,24 @@ async fn process_directory_tree(
305305
stats.files_processed += processed_count.load(Ordering::SeqCst);
306306
stats.files_skipped += skipped_count.load(Ordering::SeqCst);
307307

308-
// Sort results by file_id and insert into the index
308+
// Sort results by file_id and insert all embeddings with metadata
309309
let mut results = file_paths.lock().unwrap();
310310
results.sort_by_key(|(id, _, _)| *id);
311311

312-
// Now populate the index and metadata
313-
for (_, path, embedding) in results.iter() {
314-
let file_id = file_metadata.len();
315-
index.insert((embedding.as_slice(), file_id));
316-
file_metadata.push(path.clone());
312+
let mut total_embeddings = 0;
313+
for (_, path, indexed_embeddings) in results.iter() {
314+
for (obj_idx, field, start, end, embedding) in indexed_embeddings {
315+
let embed_id = file_metadata.len();
316+
index.insert((embedding.as_slice(), embed_id));
317+
// Store metadata as a pipe-delimited string wrapped in a PathBuf: "<file>|obj:<obj_idx>|field:<field>|range:<start>-<end>"
318+
file_metadata.push(PathBuf::from(format!(
319+
"{}|obj:{}|field:{}|range:{}-{}",
320+
path.display(), obj_idx, field, start, end
321+
)));
322+
total_embeddings += 1;
323+
}
317324
}
318-
319-
println!("Successfully indexed {} files", file_metadata.len());
325+
println!("Successfully indexed {} embeddings", total_embeddings);
320326
Ok(())
321327
}
322328

@@ -413,12 +419,11 @@ async fn process_single_file_for_embedding(
413419
}
414420
println!("[DEBUG] Total sub-chunks embedded for file {}: {}", path.display(), total_sub_chunks);
415421
println!("[DEBUG] Total embeddings indexed: {}", indexed_embeddings.len());
416-
// Instead of returning a single embedding, return an error if none found
422+
// Fail if no embeddings were generated; otherwise return all indexed embeddings rather than a single one
417423
if indexed_embeddings.is_empty() {
418424
return Err(anyhow::anyhow!("No embeddings generated for file: {}", path.display()));
419425
}
420-
// For compatibility, return the first embedding (could be refactored to index all)
421-
Ok(indexed_embeddings[0].4.clone())
426+
Ok(indexed_embeddings.clone())
422427
}
423428

424429
/// Processes a single file and adds it to the index.

src/ingest/marvelai_ingest.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,9 @@ async fn main() {
213213
println!("\n--- Processing chunk {} ---", i + 1);
214214
if debug {
215215
println!(" [DEBUG] Validating chunk JSON...");
216+
let chunk_file_content = fs::read_to_string(&chunk_path).unwrap_or_default();
217+
println!(" [DEBUG] Chunk file path: {:?}", chunk_path);
218+
println!(" [DEBUG] Chunk file contents ({} chars):\n{}", chunk_file_content.chars().count(), chunk_file_content);
216219
}
217220
// Read chunk and validate
218221
let chunk_content = match fs::read_to_string(&chunk_path) {
@@ -398,6 +401,10 @@ async fn main() {
398401
failed_count += 1;
399402
continue;
400403
}
404+
if debug {
405+
println!(" [DEBUG] Running ingest binary: {:?}", ingest_bin);
406+
println!(" [DEBUG] Ingest command: {:?} ingest --root {:?}", ingest_bin, chunk_dir.path());
407+
}
401408
let output = Command::new(&ingest_bin)
402409
.arg("ingest")
403410
.arg("--root")
@@ -407,6 +414,9 @@ async fn main() {
407414
Ok(out) => {
408415
println!("Ingest stdout:\n{}", String::from_utf8_lossy(&out.stdout));
409416
println!("Ingest stderr:\n{}", String::from_utf8_lossy(&out.stderr));
417+
if debug {
418+
println!(" [DEBUG] Ingest exit status: {}", out.status);
419+
}
410420
if out.status.success() {
411421
println!("✅ Successfully processed chunk {}", i + 1);
412422
processed_count += 1;

0 commit comments

Comments
 (0)