docs: enhance README and add GitHub workflow documentation

BryanChasko · BryanChasko · commit a738407b0465 · 2025-07-19T21:16:49.000-06:00
diff --git a/GITHUB.md b/GITHUB.md
diff --git a/docs/GITHUB.md b/docs/GITHUB.md
@@ -0,0 +1,138 @@
+# GitHub Setup & Workflow
+
+This document outlines how the HeraldStack project uses GitHub for development,
+issue tracking, and project management.
+
+<!-- filepath: docs/GITHUB.md -->
+
+## Labels System
+
+We use a color-blind friendly labeling system to categorize
+issues and pull requests:
+
+### Core Technical Areas
+
+| Label | Description | Color |
+|-------|-------------|-------|
+| `rust` | Rust codebase implementation | #0052CC |
+| `ingest` | File ingestion and indexing pipeline | #006644 |
+| `query` | Search and retrieval functionality | #5319E7 |
+| `embed` | Vector embedding generation and processing | #E05D44 |
+| `memory` | Memory storage and retrieval systems | #F9A03F |
+
+### Issue Types
+
+| Label | Description | Color |
+|-------|-------------|-------|
+| `bug` | Functionality issues requiring fixes | #D93F0B |
+| `enhancement` | New features and improvements | #0E8A16 |
+| `refactor` | Code restructuring without behavior change | #1D76DB |
+| `documentation` | Documentation updates and improvements | #FFC01F |
+| `testing` | Test coverage and infrastructure | #8250DF |
+
+### Priority & Status
+
+| Label | Description | Color |
+|-------|-------------|-------|
+| `critical` | Requires immediate attention | #B60205 |
+| `high` | High priority for current sprint | #D93F0B |
+| `medium` | Standard priority task | #FBCA04 |
+| `low` | Nice to have, not time-sensitive | #C5DEF5 |
+| `in-progress` | Actively being worked on | #0E8A16 |
+| `blocked` | Waiting on dependencies or decisions | #D876E3 |
+
+### Architecture Components
+
+| Label | Description | Color |
+|-------|-------------|-------|
+| `entity-system` | Entity framework and routing | #6F42C1 (Purple) |
+| `vector-store` | Pinecone and vector storage integration | #1A73E8 (Google Blue) |
+| `infrastructure` | AWS and deployment infrastructure | #FF6D00 (Dark Orange) |
+| `cli` | Command-line interface | #795548 (Brown) |
+| `security` | Security and authentication concerns | #EE0701 (Bright Red) |
+
+### Housekeeping
+
+| Label | Description | Color |
+|-------|-------------|-------|
+| `duplicate` | Issue already exists elsewhere | #CCCCCC (Light Grey) |
+| `invalid` | Issue doesn't apply or is incorrect | #E4E669 (Pale Yellow) |
+| `question` | Requires clarification or discussion | #D876E3 (Pink) |
+| `wontfix` | Decision made not to fix or implement | #FBBF24 (Gold) |
+
+## Branch Strategy
+
+We follow a simplified GitHub flow:
+
+1. `main` branch is always deployable
+2. Feature branches named `feature/<description>` branch off from `main`
+3. Bug fix branches named `fix/<issue-number>-<description>` branch off from `main`
+4. Pull requests merge back to `main` after review
+
+## Pull Request Process
+
+1. Create a branch for your changes
+2. Make your changes with descriptive commits
+3. Open a pull request with:
+
+   - Clear description of changes
+   - Reference to related issues
+   - Screenshots if UI changes are involved
+
+4. Request review from appropriate team members
+5. Address any review comments
+6. Merge when approved (squash commits)
+
+## Issue Templates
+
+We use issue templates for common types:
+
+- Bug reports
+- Feature requests
+- Documentation updates
+
+## Project Boards
+
+Our development is organized into project boards:
+
+- **Rust Migration MVP**: Core functionality migration from Python to Rust
+- **Entity Framework**: Development of the entity system
+- **Infrastructure**: AWS and deployment configuration
+
+## Automation
+
+We use GitHub Actions for:
+
+- Continuous Integration testing
+- Documentation generation
+- Weekly status reports
+
+## Using GitHub CLI
+
+Common commands for working with our repository:
+
+```bash
+# Create a new issue
+gh issue create --title "Issue title" --body "Description" --label "rust,bug"
+
+# Check out a PR
+gh pr checkout 123
+
+# Create a PR
+gh pr create --title "PR title" --body "Description" --label "enhancement"
+
+# Apply labels
+gh issue edit 123 --add-label "high,in-progress"
+
+# View project status (pass the project number, not its title)
+gh project view 1 --owner "@me"
+```
+
+## Weekly Reviews
+
+Every Monday, we conduct a GitHub review:
+
+1. Triage new issues
+2. Update priority labels
+3. Close completed items
+4. Prioritize, define, and plan upcoming work
diff --git a/rust_ingest/rustREADME.md b/rust_ingest/rustREADME.md
@@ -21,31 +21,24 @@ cargo run --release -- query "hello world"   # ask
 
 ## 💡 History
 
-2025-07-15 – Forked from Python FAISS script → Rust for speed & single-binary
-deploy.
-
-2025-07-17 – Switched to hnsw_rs – smaller binary, no native BLAS.
-
-2025-07-18 – Async embedding pipeline, 5× throughput on M3 Max.
+2025-07-15 – Started by taking an existing Python script that used FAISS for
+ vector search, and rewrote it in Rust. The goal was to make it faster and
+ easier to deploy as a single, self-contained binary, without needing Python
+ or extra dependencies.
+
+2025-07-17 – Switched to hnsw_rs, a Rust library for fast vector search
+ using Hierarchical Navigable Small World (HNSW) graphs. This change made
+ the compiled program ("binary") smaller and removed the need for BLAS 
+ (Basic Linear Algebra Subprograms) libraries, which are external 
+ dependencies often used for mathematical operations in other 
+ vector search tools.
+
+2025-07-18 – Changed the embedding process to run asynchronously (so it 
+doesn't wait for each file to finish before starting the next). This made
+the process about five times faster when tested on a MacBook with an M3 Max
+chip (Apple silicon).
 ```
 
 ```text
 
 ```
-
-```text
-2025-07-15 – Forked from Python FAISS script → Rust for speed & single-binary
-deploy.
-
-2025-07-17 – Switched to hnsw_rs – smaller binary, no native BLAS.
-
-2025-07-18 – Async embedding pipeline, 5× throughput on M3 Max.
-```
-
-2025-07-17 – Switched to hnsw_rs – smaller binary, no native BLAS.
-
-2025-07-18 – Async embedding pipeline, 5× throughput on M3 Max.
-
-```text
-
-```
diff --git a/rust_ingest/src/ingest.rs b/rust_ingest/src/ingest.rs
@@ -1,4 +1,3 @@
-use hnsw_rs::prelude::AnnT;
 //! File ingestion module for semantic search indexing.
 //!
 //! This module handles the ingestion of files into a searchable vector index.
@@ -13,11 +12,10 @@ use hnsw_rs::prelude::AnnT;
 //! - This is a "module source file" - a unit of compilation within our crate
 //! - Part of the flat module style (modern) vs ingest/mod.rs (legacy)
 
-use std::{fs::File, path::PathBuf};
-
 use anyhow::{Context, Result};
-use hnsw_rs::{dist::DistCosine, prelude::*};
+use hnsw_rs::prelude::*;
 use serde_json::json;
+use std::{fs::File, path::PathBuf};
 use walkdir::WalkDir;
 
 use crate::embed;
@@ -42,6 +40,11 @@ const MAX_FILE_CHARS: usize = 800;
 const MAX_EMBEDDING_TOKENS: usize = 600;
 
 /// HNSW index construction parameters optimized for semantic search.
+///
+/// - `HNSW_MAX_CONNECTIONS`: Maximum connections per node, controls index quality and memory usage
+/// - `HNSW_EF_CONSTRUCTION`: Size of dynamic candidate list during construction, higher = better quality but slower build
+/// - `HNSW_MAX_LAYER`: Maximum layer in the hierarchical structure, influences search performance
+/// - `HNSW_EF_SEARCH`: Size of dynamic candidate list during search, higher = more accurate but slower search
 const HNSW_MAX_CONNECTIONS: usize = 16;
 const HNSW_EF_CONSTRUCTION: usize = 200;
 const HNSW_MAX_LAYER: usize = 16;
@@ -152,21 +155,26 @@ fn create_http_client() -> Result<reqwest::Client> {
 }
 
 /// Creates and configures an HNSW index for vector similarity search.
-fn create_hnsw_index() -> Hnsw<f32, DistCosine> {
-    Hnsw::<f32, DistCosine>::new(
+///
+/// # Returns
+/// A new HNSW index configured with optimal parameters for semantic search.
+// Fix: Add 'static lifetime to the HNSW index
+fn create_hnsw_index() -> Hnsw<'static, f32, DistanceType> {
+    Hnsw::<'static, f32, DistanceType>::new(
         HNSW_MAX_CONNECTIONS,
         HNSW_EF_CONSTRUCTION,
         HNSW_MAX_LAYER,
         HNSW_EF_SEARCH,
-        DistCosine::default(),
+        DistanceType::Cosine,
     )
 }
 
 /// Processes all files in the directory tree.
 async fn process_directory_tree(
     config: &IngestConfig,
     client: &reqwest::Client,
-    index: &Hnsw<f32, DistCosine>,
+    // Fix: Add explicit lifetime to the HNSW index reference
+    index: &Hnsw<'_, f32, DistanceType>,
     file_metadata: &mut Vec<PathBuf>,
     stats: &mut IngestStats,
 ) -> Result<()> {
@@ -216,11 +224,26 @@ fn is_supported_file(path: &std::path::Path) -> bool {
 }
 
 /// Processes a single file and adds it to the index.
+///
+/// # Arguments
+/// * `path` - Path to the file being processed
+/// * `config` - Configuration settings for ingestion
+/// * `client` - HTTP client for embedding API requests
+/// * `index` - HNSW index to insert embeddings into
+/// * `file_metadata` - Collection of file paths to track processed files
+/// * `file_id` - Unique identifier for this file in the index
+///
+/// # Returns
+/// Success if the file was processed and added to the index.
+///
+/// # Errors
+/// Returns error if file reading or embedding generation fails.
 async fn process_single_file(
     path: &std::path::Path,
     config: &IngestConfig,
     client: &reqwest::Client,
-    index: &Hnsw<f32, DistCosine>,
+    // Fix: Add explicit lifetime to the HNSW index reference
+    index: &Hnsw<'_, f32, DistanceType>,
     file_metadata: &mut Vec<PathBuf>,
     file_id: usize,
 ) -> Result<()> {
@@ -255,18 +278,19 @@ fn truncate_content(content: &str, max_chars: usize) -> &str {
 
 /// Persists the HNSW index and file metadata to disk.
 fn persist_index_data(
-    index: &Hnsw<f32, DistCosine>,
+    index: &Hnsw<'_, f32, DistanceType>,
     file_metadata: &[PathBuf],
     output_dir: &std::path::Path,
 ) -> Result<()> {
     // Create output directory
     std::fs::create_dir_all(output_dir)
         .with_context(|| format!("Failed to create output directory: {}", output_dir.display()))?;
 
-    // Save HNSW index
+    // Save HNSW index - use save() instead of dump()
     let index_path = output_dir.join("index");
     index
-        .dump(&index_path)
+        .save(index_path.to_str().unwrap()) // Use save() instead of dump()
+        .map_err(|e| anyhow::anyhow!("Failed to save HNSW index: {}", e))
         .with_context(|| format!("Failed to save HNSW index to: {}", index_path.display()))?;
 
     // Save metadata as JSON
diff --git a/rust_ingest/src/query.rs b/rust_ingest/src/query.rs
@@ -155,13 +155,13 @@ pub async fn run_with_config(query: &str, config: QueryConfig) -> Result<QueryRe
 }
 
 /// Loads the HNSW index and file metadata from disk.
-fn load_index_and_metadata(config: &QueryConfig) -> Result<(Hnsw<f32, DistCosine>, Vec<PathBuf>)> {
+fn load_index_and_metadata(config: &QueryConfig) -> Result<(Hnsw<'_, f32, DistCosine>, Vec<PathBuf>)> {
     let data_dir = config.root_dir.join("data");
     
     // Load the HNSW index using the correct API
-    let index: Hnsw<f32, DistCosine> = Hnsw::file_load(&data_dir, "index")
-        .context("Failed to load HNSW index - ensure ingestion has been run")?
-        .0; // Extract the index from the tuple
+    // The file_load function doesn't exist, use the proper loading function
+    let index: Hnsw<'_, f32, DistCosine> = hnsw_rs::Hnsw::load_hnsw(&data_dir.join("index"))
+        .context("Failed to load HNSW index - ensure ingestion has been run")?;
     
     // Load file metadata
     let metadata_file = fs::File::open(data_dir.join("meta.json"))
@@ -185,7 +185,7 @@ async fn perform_semantic_search(
     query: &str,
     config: &QueryConfig,
     client: &reqwest::Client,
-    index: &Hnsw<f32, DistCosine>,
+    index: &Hnsw<'_, f32, DistCosine>,
 ) -> Result<Vec<Neighbour>> {
     // Convert query to embedding vector
     let query_embedding = embed::embed(query, config.max_query_tokens, client)