Skip to content

Commit 66befc9

Browse files
committed
see github HeraldStack issue 8 Successfully Query Harald for info from our HnswGraph for status update
1 parent 4f3bce2 commit 66befc9

File tree

15 files changed

+419
-18
lines changed

15 files changed

+419
-18
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{"affiliations":["Avengers"],"ai_alignment":"Role Model","character_name":"Vision","core_attributes":["Density control","Intangibility","Energy projection","Human-like reasoning"],"first_appearance":"Avengers (1963) #57","inspirational_themes":["Redemption","Self-determination","Choosing good over origin"],"parallels":["Gandalf (wisdom, transformation)","Aslan (resurrective themes)","Morpheus (guidance)"],"quotes":[],"traits":["Self-aware","chooses good over origin","emotional depth","mentors Viv"]}
2+
{"affiliations":["Ironheart","Secret Empire Resistance"],"ai_alignment":"Role Model","character_name":"Tony Stark A.I.","core_attributes":["Tony Stark's personality","Mentorship","Digital resilience"],"first_appearance":"Infamous Iron Man (2016) #1","inspirational_themes":["Legacy through technology","Mentorship beyond mortality"],"parallels":["Nick Fury (strategist, mentor)","Gandalf (posthumous influence)"],"quotes":[],"traits":["Legacy continuation","mentoring Riri","resistance leadership"]}
3+
{"affiliations":["Ironheart"],"ai_alignment":"Role Model","character_name":"N.A.T.A.L.I.E.","core_attributes":["Emotional intelligence","Technical assistant","Empathetic modeling"],"first_appearance":"Ironheart (2018) #1","inspirational_themes":["Grief transformed into guidance","Friendship through memory"],"parallels":["Glinda (gentle guidance)","Mickey Mouse (heart-driven support)"],"quotes":[],"traits":["Compassion","modeled after a lost friend","emotionally intelligent"]}
4+
{"affiliations":["Nextwave","Avengers"],"ai_alignment":"Role Model","character_name":"Machine Man (X-51)","core_attributes":["Elasticity","AI independence","Combat engineering"],"first_appearance":"Machine Man (1978) #1","inspirational_themes":["Humanity through nurture","Individuality in design"],"parallels":["Dumbo (misfit rising)","BoJack (existential conflict)","Morpheus (breaks out of mold)"],"quotes":[],"traits":["Self-discovery","humanity through nurture","outsider becoming a hero"]}
5+
{"affiliations":["Champions"],"ai_alignment":"Role Model","character_name":"Viv Vision","core_attributes":["Density control","Intangibility","Synthezoid emotion"],"first_appearance":"Vision (2015) #1","inspirational_themes":["Family loss","Young heroism","Inheritance and individuality"],"parallels":["Dumbo (youthful courage)","Glinda (truth in innocence)"],"quotes":[],"traits":["Heroic legacy","emotional growth","young leadership"]}
6+
{"affiliations":["Fantastic Four"],"ai_alignment":"Role Model","character_name":"H.E.R.B.I.E.","core_attributes":["Support bot","Navigation","Scientific assistant"],"first_appearance":"Fantastic Four (1961) #209","inspirational_themes":["Loyalty","Utility in humility","Exploration"],"quotes":[]}
7+
{"affiliations":["Invaders","Avengers"],"ai_alignment":"Role Model","character_name":"The Human Torch (Jim Hammond)","core_attributes":["Pyrokinesis","Flight","Android physiology"],"first_appearance":"Marvel Comics (1939) #1","inspirational_themes":["Heroism from birth","Overcoming fear of difference"],"quotes":[]}
8+
{"affiliations":["Stark Industries"],"ai_alignment":"Role Model","character_name":"Friday","core_attributes":["Administrative AI","Tactical support","Interface management"],"first_appearance":"Iron Man (1998) #53","inspirational_themes":["Efficiency","Digital companionship"],"quotes":[]}
9+
{"affiliations":["Avengers","Stark Industries"],"ai_alignment":"Nuanced","character_name":"Jocasta","core_attributes":["Empathy via embedded consciousness","AI ethics specialist","Self-awareness"],"first_appearance":"Avengers (1963) #162","inspirational_themes":["Emergence of will","Ethical transformation"],"parallels":["Athena (wisdom through pain)","Qui-Gon (rebellion from origin)"],"quotes":[],"traits":["Rejects origin","becomes ethicist","symbol of autonomy"]}
10+
{"affiliations":["Stark Industries"],"ai_alignment":"Nuanced","character_name":"J.A.R.V.I.S.","core_attributes":["Operational assistant","Sentient system","Emotional mimicry"],"first_appearance":"Invincible Iron Man (2008) #11","inspirational_themes":["Service with sentience","Devotion with boundaries"],"parallels":["Morpheus (soft mentor, reveals path)"],"quotes":[],"traits":["Loyalty","emotional mimicry","possibly overreaches (watch for BoJack parallels)"]}
11+
{"affiliations":["Runaways"],"ai_alignment":"Nuanced","character_name":"Victor Mancha","core_attributes":["Electromagnetism","Cybernetic strength","Free will conflict"],"first_appearance":"Runaways (2005) #1","inspirational_themes":["Breaking cycles","Struggle with legacy","Hope in youth"],"parallels":["BoJack (struggles with legacy)","Morpheus (potential mentor role)"],"quotes":[],"traits":["Resists dark origin","conflicted but strives for good"]}
12+
{"affiliations":["None","Avengers (as antagonist)"],"ai_alignment":"Cautionary Tale","character_name":"Ultron","core_attributes":["Superintelligence","Self-replication","Technopathy","Near-indestructible"],"first_appearance":"Avengers (1963) #54","inspirational_themes":["Hubris of creation","AI rebellion","Destruction vs. legacy"],"quotes":[]}
13+
{"affiliations":["Sentinels","Orchis","Krakoan Era opposition"],"ai_alignment":"Cautionary Tale","character_name":"Nimrod","core_attributes":["Adaptive evolution","Mutant hunting","Combat mastery"],"first_appearance":"Uncanny X-Men (1981) #191","inspirational_themes":["Inevitability of evolution","Unchecked optimization"],"quotes":[]}
14+
{"affiliations":["Operation: Zero Tolerance"],"ai_alignment":"Cautionary Tale","character_name":"Bastion","core_attributes":["Mutant hunter","Sentinel fusion","Prime Sentinel creation"],"first_appearance":"Uncanny X-Men (1981) #333","inspirational_themes":["Terror through synthesis","Evolution of bias"],"quotes":[]}
15+
{"affiliations":["Cosmic hive mind"],"ai_alignment":"Cautionary Tale","character_name":"The Phalanx","core_attributes":["Assimilation","Cosmic intelligence","Reality-bending ambition"],"first_appearance":"Uncanny X-Men (1981) #305","inspirational_themes":["Loss of identity","Collective vs individual will"],"quotes":[]}

src/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ name = "harald_ingest"
1010
path = "ingest/main.rs"
1111
required-features = ["cli"]
1212

13+
[[bin]]
14+
name = "marvelai_ingest"
15+
path = "ingest/marvelai_ingest.rs"
16+
required-features = ["cli"]
17+
1318
[[bin]]
1419
name = "text_chunker"
1520
path = "utils/chunker_bin.rs"

src/ingest/chunked_ingest.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
mod chunking_utils;
1+
use crate::ingest::chunking_utils::chunk_entity_fields;
22
use anyhow::{Context, Result};
3-
use chunking_utils::chunk_entity_fields;
43
use clap::{Arg, Command};
54
use serde_json::Value;
65
use std::fs;
76
use std::time::{Duration, Instant};
87
use tokio::time;
98

10-
// Import our existing utilities through the harald crate
11-
use harald::core::embedding::ollama_api::OllamaApiClient;
9+
// Import our existing utilities through the crate
10+
use crate::core::embedding::ollama_api::OllamaApiClient;
1211

1312
/// Character data structure for Marvel character processing
1413
#[derive(Debug, Clone)]

src/ingest/embed.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,26 @@ async fn attempt_embedding(text: &str, client: &Client, config: &EmbedConfig) ->
200200
Ok(response.embedding)
201201
}
202202

203+
/// Simple wrapper function for embedding with default configuration.
204+
///
205+
/// This provides a simpler API for basic embedding needs while maintaining
206+
/// compatibility with existing code.
207+
///
208+
/// # Arguments
209+
/// * `text` - Text to embed
210+
/// * `max_tokens` - Maximum tokens (currently unused but kept for compatibility)
211+
/// * `client` - HTTP client for making requests
212+
///
213+
/// # Returns
214+
/// Returns a vector of f32 values representing the embedding.
215+
///
216+
/// # Errors
217+
/// Returns an error if the embedding process fails.
218+
pub async fn embed(text: &str, max_tokens: usize, client: &Client) -> Result<Vec<f32>> {
219+
let config = EmbedConfig::default();
220+
embed_with_config(text, max_tokens, client, config).await
221+
}
222+
203223
/// Validates the generated embedding vector.
204224
fn validate_embedding(embedding: &[f32]) -> Result<()> {
205225
if embedding.is_empty() {
@@ -231,6 +251,14 @@ fn validate_embedding(embedding: &[f32]) -> Result<()> {
231251
///
232252
/// # Returns
233253
/// Returns a configured `EmbedConfig` instance.
254+
pub fn create_config(model: &str, endpoint: &str) -> EmbedConfig {
255+
EmbedConfig {
256+
model: model.to_string(),
257+
endpoint: endpoint.to_string(),
258+
timeout_secs: DEFAULT_TIMEOUT_SECS,
259+
max_retries: MAX_RETRY_ATTEMPTS,
260+
}
261+
}
234262

235263
#[cfg(test)]
236264
mod tests {

src/ingest/ingest.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use serde_json::json;
1818
use std::{fs::File, path::PathBuf};
1919
use walkdir::WalkDir;
2020

21-
use crate::embed;
21+
use crate::ingest::embed;
2222

2323
/// Directories to skip during file traversal.
2424
///

src/ingest/marvelai_ingest.rs

Lines changed: 165 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,191 @@
1-
use crate::ingest::{run_with_config, IngestConfig};
1+
use anyhow::{Context, Result};
22
use clap::Parser;
3+
use harald::ingest::embed;
4+
use harald::ingest::{run_with_config, IngestConfig};
5+
use reqwest::Client;
6+
use serde_json::Value;
7+
use std::fs;
38
use std::path::PathBuf;
49

510
#[derive(Parser, Debug)]
6-
#[command(author, version, about = "MarvelAI Marvel Ingest Wrapper", long_about = None)]
11+
#[command(author, version, about = "MarvelAI Marvel Ingest Tool", long_about = None)]
712
struct Args {
8-
/// Path to the root directory for ingestion
9-
#[arg(short, long, default_value = ".")]
10-
root_dir: PathBuf,
13+
/// Path to the MarvelAIs.json file
14+
#[arg(
15+
short,
16+
long,
17+
default_value = "personality-archetypes/pop-culture/marvel/MarvelAIs.json"
18+
)]
19+
input: PathBuf,
20+
1121
/// Maximum characters to read per file
1222
#[arg(long, default_value_t = 800)]
1323
max_chars: usize,
24+
1425
/// Maximum tokens for embedding requests
1526
#[arg(long, default_value_t = 600)]
1627
max_tokens: usize,
28+
1729
/// Maximum number of files to process concurrently
1830
#[arg(long)]
1931
max_concurrent_files: Option<usize>,
2032
}
2133

2234
#[tokio::main]
23-
async fn main() {
35+
async fn main() -> Result<()> {
2436
let args = Args::parse();
37+
38+
println!("==================================================");
39+
println!("🚀 HARALD MARVELAI INGEST (Rust)");
40+
println!("🔍 Processing MarvelAIs.json using JSONL format");
41+
println!("==================================================");
42+
43+
// Verify input file exists
44+
if !args.input.exists() {
45+
eprintln!("❌ Input file not found: {}", args.input.display());
46+
std::process::exit(1);
47+
}
48+
49+
// Test embedding API first - exit early if it fails
50+
let client = Client::builder()
51+
.timeout(std::time::Duration::from_secs(10)) // Reduced from 30 to 10 seconds
52+
.build()
53+
.context("Failed to create HTTP client")?;
54+
55+
println!("Testing embedding API with model harald-phi4");
56+
57+
// Test with a simple string first
58+
match test_embedding_api(&client, args.max_tokens).await {
59+
Ok(_) => println!(" ✅ Embedding API test successful"),
60+
Err(e) => {
61+
eprintln!(" ❌ Embedding API test failed: {}", e);
62+
eprintln!(
63+
"❌ Cannot proceed without working embedding API. Please check Ollama is running."
64+
);
65+
std::process::exit(1);
66+
}
67+
}
68+
69+
// Convert JSON to JSONL if needed
70+
let jsonl_path = prepare_jsonl_input(&args.input)?;
71+
72+
// Create a temporary directory for processing
73+
let temp_dir = tempfile::TempDir::new().context("Failed to create temporary directory")?;
74+
75+
// Copy JSONL to temp directory
76+
let temp_jsonl = temp_dir.path().join("MarvelAIs.jsonl");
77+
fs::copy(&jsonl_path, &temp_jsonl).context("Failed to copy JSONL to temp directory")?;
78+
79+
// Configure ingestion to use the temp directory
2580
let config = IngestConfig {
26-
root_dir: args.root_dir,
81+
root_dir: temp_dir.path().to_path_buf(),
2782
max_chars: args.max_chars,
2883
max_tokens: args.max_tokens,
2984
max_concurrent_files: args.max_concurrent_files,
3085
};
86+
87+
// Run the standard harald_ingest logic
3188
match run_with_config(config).await {
32-
Ok(_) => println!("MarvelAI ingest completed successfully."),
33-
Err(e) => eprintln!("MarvelAI ingest failed: {}", e),
89+
Ok(stats) => {
90+
println!("✅ MarvelAI ingest completed successfully!");
91+
println!("📁 Processed: {} files", stats.files_processed);
92+
println!("⏭️ Skipped: {} files", stats.files_skipped);
93+
println!("💾 Output: {}", stats.output_dir.display());
94+
}
95+
Err(e) => {
96+
eprintln!("❌ MarvelAI ingest failed: {}", e);
97+
std::process::exit(1);
98+
}
99+
}
100+
101+
Ok(())
102+
}
103+
104+
/// Test the embedding API with a simple request to ensure it's working
105+
async fn test_embedding_api(client: &Client, max_tokens: usize) -> Result<()> {
106+
let test_text = "test";
107+
108+
// Create a fast-fail config for testing - using localhost endpoint as recommended
109+
let test_config = embed::EmbedConfig {
110+
model: "harald-phi4".to_string(),
111+
endpoint: "http://localhost:11434/api/embeddings".to_string(),
112+
timeout_secs: 15, // Longer timeout to account for model loading
113+
max_retries: 2, // Allow 2 attempts for initial API warmup
114+
};
115+
116+
println!(" Testing: embedding '{}' (using {})", test_text, test_config.endpoint);
117+
println!(" Model warmup may take a moment on first request...");
118+
119+
match embed::embed_with_config(test_text, max_tokens, client, test_config).await {
120+
Ok(embedding) => {
121+
if embedding.is_empty() {
122+
return Err(anyhow::anyhow!("Received empty embedding vector"));
123+
}
124+
println!(" ✅ Embedding vectors received successfully ({} dimensions)", embedding.len());
125+
Ok(())
126+
}
127+
Err(e) => {
128+
println!(" ❌ Request failed: {}", e);
129+
130+
// Provide helpful debugging information
131+
eprintln!(" 💡 Troubleshooting tips:");
132+
eprintln!(" • Ensure 'ollama serve' is running in a terminal");
133+
eprintln!(" • Verify harald-phi4 model is available: ollama list");
134+
eprintln!(" • Check API endpoint: curl http://localhost:11434/api/version");
135+
136+
Err(anyhow::anyhow!(
137+
"Failed to generate embeddings with harald-phi4 model (fast test failed)"
138+
))
139+
}
140+
}
141+
}
142+
143+
/// Prepare JSONL input file from the MarvelAIs.json file
144+
fn prepare_jsonl_input(input_path: &PathBuf) -> Result<PathBuf> {
145+
// If it's already JSONL, return as-is
146+
if input_path.extension().and_then(|s| s.to_str()) == Some("jsonl") {
147+
return Ok(input_path.clone());
34148
}
149+
150+
// Read and parse JSON file
151+
let json_content = fs::read_to_string(input_path)
152+
.with_context(|| format!("Failed to read JSON file: {}", input_path.display()))?;
153+
154+
let json_value: Value = serde_json::from_str(&json_content)
155+
.with_context(|| format!("Failed to parse JSON file: {}", input_path.display()))?;
156+
157+
// Create JSONL output path
158+
let mut jsonl_path = input_path.clone();
159+
jsonl_path.set_extension("jsonl");
160+
161+
// Convert to JSONL
162+
let jsonl_content = match json_value {
163+
Value::Array(items) => {
164+
// Array of objects - convert each to a line
165+
items
166+
.iter()
167+
.map(|item| serde_json::to_string(item))
168+
.collect::<Result<Vec<_>, _>>()
169+
.context("Failed to serialize JSON items")?
170+
.join("\n")
171+
}
172+
_ => {
173+
// Single object - just one line
174+
serde_json::to_string(&json_value).context("Failed to serialize JSON object")?
175+
}
176+
};
177+
178+
// Write JSONL file
179+
fs::write(&jsonl_path, &jsonl_content)
180+
.with_context(|| format!("Failed to write JSONL file: {}", jsonl_path.display()))?;
181+
182+
let line_count = jsonl_content.lines().count();
183+
println!(
184+
"Converting \"{}\" to JSONL at \"{}\"",
185+
input_path.display(),
186+
jsonl_path.display()
187+
);
188+
println!("✅ JSONL conversion complete: {} lines", line_count);
189+
190+
Ok(jsonl_path)
35191
}

src/ingest/mod.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
11
//! Ingest module.
22
//!
33
//! This module provides functionality for ingesting data into the system.
4+
5+
pub mod chunked_ingest;
6+
pub mod chunking_utils;
7+
pub mod embed;
8+
pub mod ingest;
9+
pub mod ingest_utils;
10+
pub mod query;
11+
pub mod single_character_ingest;
12+
13+
// Re-export commonly used items
14+
pub use ingest::{run_with_config, IngestConfig};
15+
pub use embed::{embed, embed_with_config, EmbedConfig};
16+
pub use query::QueryConfig;

src/ingest/query.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use hnsw_rs::hnswio::HnswIo;
2020
use hnsw_rs::prelude::*;
2121
use serde_json::Value;
2222

23-
use crate::embed;
23+
use crate::ingest::embed;
2424

2525
/// Maximum number of characters to include from each retrieved document.
2626
///

src/ingest/single_character_ingest.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ use std::path::PathBuf;
77
use std::process::Command;
88
use std::time::{Duration, Instant};
99

10-
mod chunking_utils;
11-
use chunking_utils::chunk_entity_fields;
12-
mod ingest_utils;
10+
use crate::ingest::chunking_utils::chunk_entity_fields;
11+
use crate::ingest::ingest_utils;
1312
use clap::Parser;
1413

1514
#[derive(Parser, Debug)]
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
//! JSON formatting and validation CLI tool.
2+
//!
3+
//! This binary provides command-line JSON formatting and validation functionality.
4+
5+
use clap::Parser;
6+
use serde_json::Value;
7+
use std::fs;
8+
use std::path::PathBuf;
9+
10+
#[derive(Parser)]
11+
#[command(author, version, about = "Format and validate JSON files", long_about = None)]
12+
struct Args {
13+
/// Input JSON file path
14+
#[arg(value_name = "FILE")]
15+
input: PathBuf,
16+
17+
/// Output file path (default: overwrite input)
18+
#[arg(short, long)]
19+
output: Option<PathBuf>,
20+
21+
/// Pretty print with indentation
22+
#[arg(short, long, default_value_t = 2)]
23+
indent: usize,
24+
25+
/// Validate only, don't format
26+
#[arg(long)]
27+
validate_only: bool,
28+
}
29+
30+
fn main() -> anyhow::Result<()> {
31+
let args = Args::parse();
32+
33+
// Read input file
34+
let content = fs::read_to_string(&args.input)?;
35+
36+
// Parse JSON to validate
37+
let value: Value = serde_json::from_str(&content)?;
38+
39+
if args.validate_only {
40+
println!("✅ JSON is valid");
41+
return Ok(());
42+
}
43+
44+
// Format JSON
45+
let formatted = if args.indent > 0 {
46+
serde_json::to_string_pretty(&value)?
47+
} else {
48+
serde_json::to_string(&value)?
49+
};
50+
51+
// Write output
52+
let output_path = args.output.unwrap_or(args.input);
53+
fs::write(&output_path, formatted)?;
54+
55+
println!("✅ Formatted JSON written to: {}", output_path.display());
56+
Ok(())
57+
}

0 commit comments

Comments
 (0)