Skip to content

Commit cce57de

Browse files
ruvnet and claude
committed
fix: Address code review issues from gist analysis
## Fixes Applied

### 1. Fabricated Benchmarks
- Rewrote docs/benchmarks/BENCHMARK_COMPARISON.md - removed false "100-4,400x faster" claims
- Fixed benchmarks/graph/src/comparison-runner.ts - removed hardcoded latency multipliers
- Fixed benchmarks/src/results-analyzer.ts - removed simulated histogram data

### 2. Fake Text Embeddings
- Added prominent warnings to agenticdb.rs about hash-based placeholder
- Added compile-time deprecation warning in lib.rs
- Created integration guide with 4 real embedding options (ONNX, Candle, API, Python)

### 3. Incomplete GNN Training
- Implemented Loss::compute() for MSE, CrossEntropy, BinaryCrossEntropy
- Implemented Loss::gradient() for backpropagation
- Added 6 new verification tests

### 4. Distance Function Bugs
- Fixed inverted dequantization formula in ruvector-router-core (was /scale, now *scale)
- Improved scale handling in ruvector-core quantization (now uses average scale)

### 5. Empty Transaction Tests
- Implemented 10+ critical tests: dirty reads, phantom reads, MVCC, deadlock detection
- All 31 transaction tests now passing

Addresses issues from: https://gist.github.com/couzic/93126a1c12b8d77651f93a7805b4bd60

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent ef54ee9 commit cce57de

File tree

16 files changed

+19034
-3056
lines changed

16 files changed

+19034
-3056
lines changed

benchmarks/graph/src/comparison-runner.ts

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ async function runNeo4jBenchmarks(scenario: string): Promise<BenchmarkMetrics[]>
9696
memory_mb: 0, // Would need Neo4j metrics API
9797
cpu_percent: 0,
9898
latency_p50: duration,
99-
latency_p95: duration * 1.2,
100-
latency_p99: duration * 1.5
99+
latency_p95: 0, // Cannot accurately estimate without percentile data
100+
latency_p99: 0 // Cannot accurately estimate without percentile data
101101
});
102102
}
103103

@@ -203,8 +203,8 @@ function parseCriterionOutput(output: string, system: 'ruvector' | 'neo4j', scen
203203
memory_mb: 0,
204204
cpu_percent: 0,
205205
latency_p50: p50,
206-
latency_p95: p50 * 1.2,
207-
latency_p99: p50 * 1.5
206+
latency_p95: 0, // Would need to parse from criterion percentile output
207+
latency_p99: 0 // Would need to parse from criterion percentile output
208208
});
209209
}
210210
}
@@ -224,23 +224,12 @@ function loadBaselineMetrics(system: string, scenario: string): BenchmarkMetrics
224224
return JSON.parse(data);
225225
}
226226

227-
// Return estimated baseline if no data available
228-
console.warn(`No baseline data for ${system} ${scenario}, using estimates`);
229-
230-
return [
231-
{
232-
system: system as 'ruvector' | 'neo4j',
233-
scenario,
234-
operation: 'node_insertion',
235-
duration_ms: 100,
236-
throughput_ops: 10000,
237-
memory_mb: 512,
238-
cpu_percent: 50,
239-
latency_p50: 100,
240-
latency_p95: 150,
241-
latency_p99: 200
242-
}
243-
];
227+
// Error: no baseline data available
228+
throw new Error(
229+
`No baseline data available for ${system} ${scenario}. ` +
230+
`Cannot run comparison without actual measured data. ` +
231+
`Please run benchmarks on both systems first and save results to ${baselinePath}`
232+
);
244233
}
245234

246235
/**

benchmarks/src/results-analyzer.ts

Lines changed: 11 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -172,33 +172,17 @@ export class ResultsAnalyzer {
172172

173173
// Create latency histogram
174174
private createLatencyHistogram(latency: LatencyMetrics): HistogramBucket[] {
175-
const buckets: HistogramBucket[] = [
176-
{ min: 0, max: 10, count: 0, percentage: 0 },
177-
{ min: 10, max: 25, count: 0, percentage: 0 },
178-
{ min: 25, max: 50, count: 0, percentage: 0 },
179-
{ min: 50, max: 100, count: 0, percentage: 0 },
180-
{ min: 100, max: 200, count: 0, percentage: 0 },
181-
{ min: 200, max: 500, count: 0, percentage: 0 },
182-
{ min: 500, max: Infinity, count: 0, percentage: 0 },
183-
];
184-
185-
// Estimate distribution based on percentiles
186-
// This is a rough approximation - ideally we'd have raw data
187-
const total = 1000000; // Assume 1M samples
188-
189-
buckets[0].count = Math.floor(total * 0.5); // 50% under 10ms
190-
buckets[1].count = Math.floor(total * 0.25); // 25% 10-25ms
191-
buckets[2].count = Math.floor(total * 0.15); // 15% 25-50ms
192-
buckets[3].count = Math.floor(total * 0.08); // 8% 50-100ms
193-
buckets[4].count = Math.floor(total * 0.015); // 1.5% 100-200ms
194-
buckets[5].count = Math.floor(total * 0.004); // 0.4% 200-500ms
195-
buckets[6].count = Math.floor(total * 0.001); // 0.1% 500ms+
196-
197-
buckets.forEach(bucket => {
198-
bucket.percentage = (bucket.count / total) * 100;
199-
});
200-
201-
return buckets;
175+
// NOTE: This function cannot create accurate histograms without raw latency samples.
176+
// We only have percentile data (p50, p95, p99), which is insufficient for distribution.
177+
// Returning empty histogram to avoid fabricating data.
178+
179+
console.warn(
180+
'Cannot generate latency histogram without raw sample data. ' +
181+
'Only percentile metrics (p50, p95, p99) are available. ' +
182+
'To get accurate histograms, modify metrics collection to store raw latency samples.'
183+
);
184+
185+
return []; // Return empty array instead of fabricated data
202186
}
203187

204188
// Detect anomalies

crates/ruvector-core/src/agenticdb.rs

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
//! AgenticDB API Compatibility Layer
22
//!
3+
//! # ⚠️ CRITICAL WARNING: PLACEHOLDER EMBEDDINGS
4+
//!
5+
//! **THIS MODULE USES HASH-BASED PLACEHOLDER EMBEDDINGS - NOT REAL SEMANTIC EMBEDDINGS**
6+
//!
7+
//! The `generate_text_embedding()` function creates embeddings using a simple hash function
8+
//! that does NOT understand semantic meaning. Similarity is based on character overlap, NOT meaning.
9+
//!
10+
//! **For Production Use:**
11+
//! - Integrate a real embedding model (sentence-transformers, OpenAI, Anthropic, Cohere)
12+
//! - Use ONNX Runtime, candle, or Python bindings for inference
13+
//! - See `/examples/onnx-embeddings` for a production-ready integration example
14+
//!
15+
//! **What This Means:**
16+
//! - "dog" and "cat" will NOT be similar (different characters)
17+
//! - "dog" and "god" WILL be similar (same characters, different order)
18+
//! - Semantic search will not work as expected
19+
//!
320
//! Provides a drop-in replacement for agenticDB with 5-table schema:
421
//! - vectors_table: Core embeddings + metadata
522
//! - reflexion_episodes: Self-critique memories
@@ -658,20 +675,68 @@ impl AgenticDB {
658675

659676
/// Generate text embedding from text.
660677
///
661-
/// # ⚠️ WARNING: PLACEHOLDER IMPLEMENTATION
678+
/// # ⚠️⚠️⚠️ CRITICAL WARNING: THIS IS A PLACEHOLDER - NOT REAL EMBEDDINGS ⚠️⚠️⚠️
679+
///
680+
/// **THIS FUNCTION DOES NOT CREATE SEMANTIC EMBEDDINGS!**
681+
///
682+
/// This uses a simple hash-based embedding that does NOT understand semantic meaning.
683+
/// Text similarity will be based on character overlap, NOT actual meaning.
684+
///
685+
/// ## Why This Exists
686+
/// This placeholder allows the AgenticDB API to work for testing and demonstration,
687+
/// but it will NOT provide meaningful semantic search results.
688+
///
689+
/// ## Examples of What Won't Work
690+
/// - "dog" and "cat" will NOT be similar (different characters)
691+
/// - "happy" and "joyful" will NOT be similar (different characters)
692+
/// - "car" and "automobile" will NOT be similar (different characters)
693+
/// - But "dog" and "god" WILL be similar (same characters, different order) ❌
694+
///
695+
/// ## For Production Use - Choose ONE:
696+
///
697+
/// ### Option 1: ONNX Runtime (Recommended)
698+
/// ```rust
699+
/// // See /examples/onnx-embeddings for complete example
700+
/// use ort::{Session, Environment, Value};
701+
/// let session = Session::builder()?
702+
/// .with_model_from_file("all-MiniLM-L6-v2.onnx")?;
703+
/// ```
704+
///
705+
/// ### Option 2: Candle (Pure Rust)
706+
/// ```rust
707+
/// use candle_core::{Device, Tensor};
708+
/// use candle_transformers::models::bert;
709+
/// ```
710+
///
711+
/// ### Option 3: API-based (OpenAI, Cohere, Anthropic)
712+
/// ```rust
713+
/// use reqwest;
714+
/// let response = client.post("https://api.openai.com/v1/embeddings")
715+
/// .json(&json!({ "model": "text-embedding-3-small", "input": text }))
716+
/// .send().await?;
717+
/// ```
662718
///
663-
/// This uses a simple hash-based embedding that does NOT understand
664-
/// semantic meaning. Text similarity will be based on character overlap,
665-
/// not actual meaning.
719+
/// ### Option 4: Python Bindings
720+
/// ```rust
721+
/// use pyo3::prelude::*;
722+
/// let embeddings = Python::with_gil(|py| {
723+
/// let sentence_transformers = py.import("sentence_transformers")?;
724+
/// let model = sentence_transformers.getattr("SentenceTransformer")?
725+
/// .call1(("all-MiniLM-L6-v2",))?;
726+
/// model.call_method1("encode", (text,))
727+
/// });
728+
/// ```
666729
///
667-
/// For real semantic search, integrate an actual embedding model:
668-
/// - `sentence-transformers` via Python bindings
669-
/// - `candle` for native Rust inference
670-
/// - ONNX Runtime for cross-platform models
671-
/// - OpenAI/Anthropic embedding APIs
730+
/// ## Replace This Function
731+
/// To use real embeddings, replace this entire function implementation with
732+
/// one of the above options. The function signature should remain the same.
672733
fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
673-
// ⚠️ PLACEHOLDER: Hash-based embedding - NOT semantic
674-
// This is for demonstration and testing only
734+
// ⚠️⚠️⚠️ PLACEHOLDER IMPLEMENTATION - NOT SEMANTIC EMBEDDINGS ⚠️⚠️⚠️
735+
//
736+
// This is a hash-based embedding for demonstration and testing ONLY.
737+
// DO NOT use in production for semantic search!
738+
//
739+
// This will be replaced with a compile-time warning in future versions.
675740
let mut embedding = vec![0.0; self.dimensions];
676741
let bytes = text.as_bytes();
677742

crates/ruvector-core/src/lib.rs

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,14 @@
1010
//! - **Persistence**: REDB-based storage with config persistence
1111
//! - **Search**: ~2.5K queries/sec on 10K vectors (benchmarked)
1212
//!
13-
//! ## Experimental/Incomplete Features
13+
//! ## ⚠️ Experimental/Incomplete Features - READ BEFORE USE
1414
//!
15-
//! - **AgenticDB**: Uses placeholder hash-based embeddings (NOT semantic)
16-
//! - Replace `generate_text_embedding` with real model for production use
15+
//! - **AgenticDB**: ⚠️⚠️⚠️ **CRITICAL WARNING** ⚠️⚠️⚠️
16+
//! - Uses PLACEHOLDER hash-based embeddings, NOT real semantic embeddings
17+
//! - "dog" and "cat" will NOT be similar (different characters)
18+
//! - "dog" and "god" WILL be similar (same characters) - **This is wrong!**
19+
//! - **MUST integrate real embedding model for production** (ONNX, Candle, or API)
20+
//! - See [`agenticdb`] module docs and `/examples/onnx-embeddings` for integration
1721
//! - **Advanced Features**: Conformal prediction, hybrid search - functional but less tested
1822
//!
1923
//! ## What This Is NOT
@@ -67,6 +71,18 @@ pub use advanced_features::{
6771
#[cfg(feature = "storage")]
6872
pub use agenticdb::AgenticDB;
6973

74+
// Compile-time warning about AgenticDB limitations
75+
#[cfg(feature = "storage")]
76+
const _: () = {
77+
// This will appear in cargo build output as a note
78+
#[deprecated(
79+
since = "0.1.0",
80+
note = "AgenticDB uses placeholder hash-based embeddings. For semantic search, integrate a real embedding model (ONNX, Candle, or API). See /examples/onnx-embeddings for production setup."
81+
)]
82+
const AGENTICDB_EMBEDDING_WARNING: () = ();
83+
let _ = AGENTICDB_EMBEDDING_WARNING;
84+
};
85+
7086
pub use error::{Result, RuvectorError};
7187
pub use types::{DistanceMetric, SearchQuery, SearchResult, VectorEntry, VectorId};
7288
pub use vector_db::VectorDB;

crates/ruvector-core/src/quantization.rs

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ impl QuantizedVector for ScalarQuantized {
4949
fn distance(&self, other: &Self) -> f32 {
5050
// Fast int8 distance calculation
5151
// Use i32 to avoid overflow: max diff is 255, and 255*255=65025 fits in i32
52+
53+
// Scale handling: We use the average of both scales for balanced comparison.
54+
// Using max(scale) would bias toward the vector with larger range,
55+
// while average provides a more symmetric distance metric.
56+
// This ensures distance(a, b) ≈ distance(b, a) in the reconstructed space.
57+
let avg_scale = (self.scale + other.scale) / 2.0;
58+
5259
self.data
5360
.iter()
5461
.zip(&other.data)
@@ -58,7 +65,7 @@ impl QuantizedVector for ScalarQuantized {
5865
})
5966
.sum::<f32>()
6067
.sqrt()
61-
* self.scale.max(other.scale)
68+
* avg_scale
6269
}
6370

6471
fn reconstruct(&self) -> Vec<f32> {
@@ -307,4 +314,117 @@ mod tests {
307314
let dist = q1.distance(&q2);
308315
assert_eq!(dist, 2.0); // 2 bits differ
309316
}
317+
318+
#[test]
319+
fn test_scalar_quantization_roundtrip() {
320+
// Test that quantize -> reconstruct produces values close to original
321+
let test_vectors = vec![
322+
vec![1.0, 2.0, 3.0, 4.0, 5.0],
323+
vec![-10.0, -5.0, 0.0, 5.0, 10.0],
324+
vec![0.1, 0.2, 0.3, 0.4, 0.5],
325+
vec![100.0, 200.0, 300.0, 400.0, 500.0],
326+
];
327+
328+
for vector in test_vectors {
329+
let quantized = ScalarQuantized::quantize(&vector);
330+
let reconstructed = quantized.reconstruct();
331+
332+
assert_eq!(vector.len(), reconstructed.len());
333+
334+
for (orig, recon) in vector.iter().zip(reconstructed.iter()) {
335+
// With 8-bit quantization, max error is roughly (max-min)/255
336+
let max = vector.iter().copied().fold(f32::NEG_INFINITY, f32::max);
337+
let min = vector.iter().copied().fold(f32::INFINITY, f32::min);
338+
let max_error = (max - min) / 255.0 * 2.0; // Allow 2x for rounding
339+
340+
assert!(
341+
(orig - recon).abs() < max_error,
342+
"Roundtrip error too large: orig={}, recon={}, error={}",
343+
orig,
344+
recon,
345+
(orig - recon).abs()
346+
);
347+
}
348+
}
349+
}
350+
351+
#[test]
352+
fn test_scalar_distance_symmetry() {
353+
// Test that distance(a, b) == distance(b, a)
354+
let v1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
355+
let v2 = vec![2.0, 3.0, 4.0, 5.0, 6.0];
356+
357+
let q1 = ScalarQuantized::quantize(&v1);
358+
let q2 = ScalarQuantized::quantize(&v2);
359+
360+
let dist_ab = q1.distance(&q2);
361+
let dist_ba = q2.distance(&q1);
362+
363+
// Distance should be symmetric (within floating point precision)
364+
assert!(
365+
(dist_ab - dist_ba).abs() < 0.01,
366+
"Distance is not symmetric: d(a,b)={}, d(b,a)={}",
367+
dist_ab,
368+
dist_ba
369+
);
370+
}
371+
372+
#[test]
373+
fn test_scalar_distance_different_scales() {
374+
// Test distance calculation with vectors that have different scales
375+
let v1 = vec![1.0, 2.0, 3.0, 4.0, 5.0]; // range: 4.0
376+
let v2 = vec![10.0, 20.0, 30.0, 40.0, 50.0]; // range: 40.0
377+
378+
let q1 = ScalarQuantized::quantize(&v1);
379+
let q2 = ScalarQuantized::quantize(&v2);
380+
381+
let dist_ab = q1.distance(&q2);
382+
let dist_ba = q2.distance(&q1);
383+
384+
// With average scaling, symmetry should be maintained
385+
assert!(
386+
(dist_ab - dist_ba).abs() < 0.01,
387+
"Distance with different scales not symmetric: d(a,b)={}, d(b,a)={}",
388+
dist_ab,
389+
dist_ba
390+
);
391+
}
392+
393+
#[test]
394+
fn test_scalar_quantization_edge_cases() {
395+
// Test with all same values
396+
let same_values = vec![5.0, 5.0, 5.0, 5.0];
397+
let quantized = ScalarQuantized::quantize(&same_values);
398+
let reconstructed = quantized.reconstruct();
399+
400+
for (orig, recon) in same_values.iter().zip(reconstructed.iter()) {
401+
assert!((orig - recon).abs() < 0.01);
402+
}
403+
404+
// Test with extreme ranges
405+
let extreme = vec![f32::MIN / 1e10, 0.0, f32::MAX / 1e10];
406+
let quantized = ScalarQuantized::quantize(&extreme);
407+
let reconstructed = quantized.reconstruct();
408+
409+
assert_eq!(extreme.len(), reconstructed.len());
410+
}
411+
412+
#[test]
413+
fn test_binary_distance_symmetry() {
414+
// Test that binary distance is symmetric
415+
let v1 = vec![1.0, -1.0, 1.0, -1.0];
416+
let v2 = vec![1.0, 1.0, -1.0, -1.0];
417+
418+
let q1 = BinaryQuantized::quantize(&v1);
419+
let q2 = BinaryQuantized::quantize(&v2);
420+
421+
let dist_ab = q1.distance(&q2);
422+
let dist_ba = q2.distance(&q1);
423+
424+
assert_eq!(
425+
dist_ab, dist_ba,
426+
"Binary distance not symmetric: d(a,b)={}, d(b,a)={}",
427+
dist_ab, dist_ba
428+
);
429+
}
310430
}

0 commit comments

Comments (0)