diff --git a/rust/types/src/collection_configuration.rs b/rust/types/src/collection_configuration.rs index 80a24a19003..9092ff52534 100644 --- a/rust/types/src/collection_configuration.rs +++ b/rust/types/src/collection_configuration.rs @@ -1079,4 +1079,365 @@ mod tests { )) ); } + + #[cfg(feature = "testing")] + mod proptests { + use super::*; + use crate::hnsw_configuration::Space; + use crate::proptest_utils::strategies::{ + embedding_function_strategy, internal_collection_configuration_strategy, + internal_hnsw_configuration_strategy, internal_spann_configuration_strategy, + knn_index_strategy, + }; + use crate::{HnswConfiguration, MetadataValue, SpannConfiguration}; + use proptest::prelude::*; + use proptest::test_runner::TestCaseError; + + fn space_to_metadata_str(space: &Space) -> &'static str { + match space { + Space::L2 => "l2", + Space::Cosine => "cosine", + Space::Ip => "ip", + } + } + + fn metadata_from_hnsw_config(config: &InternalHnswConfiguration) -> Metadata { + let mut metadata = Metadata::new(); + metadata.insert( + "hnsw:space".to_string(), + MetadataValue::Str(space_to_metadata_str(&config.space).to_string()), + ); + metadata.insert( + "hnsw:construction_ef".to_string(), + MetadataValue::Int(config.ef_construction as i64), + ); + metadata.insert( + "hnsw:search_ef".to_string(), + MetadataValue::Int(config.ef_search as i64), + ); + metadata.insert( + "hnsw:M".to_string(), + MetadataValue::Int(config.max_neighbors as i64), + ); + metadata.insert( + "hnsw:num_threads".to_string(), + MetadataValue::Int(config.num_threads as i64), + ); + metadata.insert( + "hnsw:resize_factor".to_string(), + MetadataValue::Float(config.resize_factor), + ); + metadata.insert( + "hnsw:sync_threshold".to_string(), + MetadataValue::Int(config.sync_threshold as i64), + ); + metadata.insert( + "hnsw:batch_size".to_string(), + MetadataValue::Int(config.batch_size as i64), + ); + metadata + } + + fn metadata_hnsw_strategy() -> impl Strategy + { + 
internal_hnsw_configuration_strategy().prop_map(|config| { + let metadata = metadata_from_hnsw_config(&config); + (config, metadata) + }) + } + + fn assert_spann_is_default_with_space( + config: &InternalSpannConfiguration, + expected_space: Space, + ) -> Result<(), TestCaseError> { + let default_config = InternalSpannConfiguration { + space: expected_space, + ..InternalSpannConfiguration::default() + }; + prop_assert_eq!(config, &default_config); + Ok(()) + } + + fn assert_hnsw_is_default_with_space( + config: &InternalHnswConfiguration, + expected_space: Space, + ) -> Result<(), TestCaseError> { + let default_config = InternalHnswConfiguration { + space: expected_space, + ..InternalHnswConfiguration::default() + }; + prop_assert_eq!(config, &default_config); + Ok(()) + } + + proptest! { + #[test] + fn try_from_config_roundtrip_internal( + internal_config in internal_collection_configuration_strategy() + ) { + let collection_config: CollectionConfiguration = internal_config.clone().into(); + let default_knn = match &internal_config.vector_index { + VectorIndexConfiguration::Hnsw(_) => KnnIndex::Hnsw, + VectorIndexConfiguration::Spann(_) => KnnIndex::Spann, + }; + + let result = InternalCollectionConfiguration::try_from_config( + collection_config.clone(), + default_knn, + None, + ) + .expect("conversion should succeed"); + + let embedding_function = internal_config.embedding_function.clone(); + let expected_vector_index = match &internal_config.vector_index { + VectorIndexConfiguration::Hnsw(original) => { + let external: HnswConfiguration = original.clone().into(); + let expected_internal: InternalHnswConfiguration = external.clone().into(); + match &result.vector_index { + VectorIndexConfiguration::Hnsw(converted) => { + prop_assert_eq!(converted, &expected_internal); + } + _ => prop_assert!(false, "expected HNSW configuration"), + } + VectorIndexConfiguration::Hnsw(expected_internal) + } + VectorIndexConfiguration::Spann(original) => { + let external: 
SpannConfiguration = original.clone().into(); + let expected_internal: InternalSpannConfiguration = external.clone().into(); + match &result.vector_index { + VectorIndexConfiguration::Spann(converted) => { + prop_assert_eq!(converted, &expected_internal); + } + _ => prop_assert!(false, "expected SPANN configuration"), + } + VectorIndexConfiguration::Spann(expected_internal) + } + }; + + prop_assert_eq!( + result.embedding_function.clone(), + embedding_function.clone() + ); + let expected = InternalCollectionConfiguration { + vector_index: expected_vector_index, + embedding_function: embedding_function.clone(), + }; + prop_assert_eq!(result, expected); + + let opposite_knn = match &internal_config.vector_index { + VectorIndexConfiguration::Hnsw(_) => KnnIndex::Spann, + VectorIndexConfiguration::Spann(_) => KnnIndex::Hnsw, + }; + let opposite_result = InternalCollectionConfiguration::try_from_config( + collection_config, + opposite_knn, + None, + ) + .expect("conversion for opposite default should succeed"); + + prop_assert_eq!( + opposite_result.embedding_function.clone(), + internal_config.embedding_function.clone() + ); + + match (&internal_config.vector_index, &opposite_result.vector_index) { + (VectorIndexConfiguration::Hnsw(original), VectorIndexConfiguration::Spann(spann)) => { + let expected_space = original.space.clone(); + assert_spann_is_default_with_space(spann, expected_space)?; + } + (VectorIndexConfiguration::Spann(original), VectorIndexConfiguration::Hnsw(hnsw)) => { + let expected_space = original.space.clone(); + assert_hnsw_is_default_with_space(hnsw, expected_space)?; + } + _ => prop_assert!(false, "unexpected opposite conversion result"), + } + } + } + + proptest! 
{ + #[test] + fn try_from_config_uses_metadata_when_configs_absent( + (expected_hnsw, metadata) in metadata_hnsw_strategy(), + embedding in embedding_function_strategy(), + knn in knn_index_strategy(), + ) { + let collection_config = CollectionConfiguration { + hnsw: None, + spann: None, + embedding_function: embedding.clone(), + }; + + let result = InternalCollectionConfiguration::try_from_config( + collection_config, + knn, + Some(metadata.clone()), + ) + .expect("conversion should succeed"); + + match (knn, &result.vector_index) { + (KnnIndex::Hnsw, VectorIndexConfiguration::Hnsw(hnsw)) => { + prop_assert_eq!(hnsw, &expected_hnsw); + } + (KnnIndex::Spann, VectorIndexConfiguration::Spann(spann)) => { + prop_assert_eq!(spann.space.clone(), expected_hnsw.space.clone()); + assert_spann_is_default_with_space(spann, expected_hnsw.space.clone())?; + } + _ => prop_assert!(false, "unexpected vector index variant"), + } + prop_assert_eq!(result.embedding_function.clone(), embedding); + } + } + + proptest! { + #[test] + fn try_from_config_uses_metadata_when_hnsw_config_is_default_values( + (expected_hnsw, metadata) in metadata_hnsw_strategy(), + embedding in embedding_function_strategy(), + ) { + let collection_config = CollectionConfiguration { + hnsw: Some(HnswConfiguration::default()), + spann: None, + embedding_function: embedding.clone(), + }; + + let result = InternalCollectionConfiguration::try_from_config( + collection_config, + KnnIndex::Hnsw, + Some(metadata.clone()), + ) + .expect("conversion should succeed"); + + match &result.vector_index { + VectorIndexConfiguration::Hnsw(hnsw) => { + prop_assert_eq!(hnsw, &expected_hnsw); + } + _ => prop_assert!(false, "expected hnsw configuration"), + } + prop_assert_eq!(result.embedding_function.clone(), embedding); + } + } + + proptest! 
{ + #[test] + fn try_from_config_prefers_spann_when_default_is_spann( + hnsw_config in internal_hnsw_configuration_strategy(), + embedding in embedding_function_strategy(), + ) { + let embedding_clone = embedding.clone(); + let collection_config = CollectionConfiguration { + hnsw: Some(hnsw_config.clone().into()), + spann: None, + embedding_function: embedding, + }; + + let result = InternalCollectionConfiguration::try_from_config( + collection_config, + KnnIndex::Spann, + None, + ) + .expect("conversion should succeed"); + + let expected_space = hnsw_config.space.clone(); + match &result.vector_index { + VectorIndexConfiguration::Spann(spann) => { + prop_assert_eq!(spann.space.clone(), expected_space.clone()); + assert_spann_is_default_with_space(spann, expected_space)?; + } + _ => prop_assert!(false, "expected spann configuration"), + } + prop_assert_eq!(result.embedding_function.clone(), embedding_clone); + } + } + + proptest! { + #[test] + fn try_from_config_prefers_hnsw_when_default_is_hnsw( + spann_config in internal_spann_configuration_strategy(), + embedding in embedding_function_strategy(), + ) { + let embedding_clone = embedding.clone(); + let collection_config = CollectionConfiguration { + hnsw: None, + spann: Some(spann_config.clone().into()), + embedding_function: embedding, + }; + + let result = InternalCollectionConfiguration::try_from_config( + collection_config, + KnnIndex::Hnsw, + None, + ) + .expect("conversion should succeed"); + + let expected_space = spann_config.space.clone(); + match &result.vector_index { + VectorIndexConfiguration::Hnsw(hnsw) => { + prop_assert_eq!(hnsw.space.clone(), expected_space.clone()); + assert_hnsw_is_default_with_space(hnsw, expected_space)?; + } + _ => prop_assert!(false, "expected hnsw configuration"), + } + prop_assert_eq!(result.embedding_function.clone(), embedding_clone); + } + } + + proptest! 
{ + #[test] + fn try_from_config_defaults_when_configs_absent( + embedding in embedding_function_strategy(), + knn in knn_index_strategy(), + ) { + let collection_config = CollectionConfiguration { + hnsw: None, + spann: None, + embedding_function: embedding.clone(), + }; + + let result = InternalCollectionConfiguration::try_from_config( + collection_config, + knn, + None, + ) + .expect("conversion should succeed"); + + match (knn, &result.vector_index) { + (KnnIndex::Hnsw, VectorIndexConfiguration::Hnsw(hnsw)) => { + prop_assert_eq!(hnsw, &InternalHnswConfiguration::default()); + } + (KnnIndex::Spann, VectorIndexConfiguration::Spann(spann)) => { + prop_assert_eq!(spann, &InternalSpannConfiguration::default()); + } + _ => prop_assert!(false, "unexpected vector index variant"), + } + prop_assert_eq!(result.embedding_function.clone(), embedding); + } + } + + proptest! { + #[test] + fn try_from_config_errors_on_multiple_configs( + hnsw_config in internal_hnsw_configuration_strategy(), + spann_config in internal_spann_configuration_strategy(), + embedding in embedding_function_strategy(), + knn in knn_index_strategy(), + ) { + let collection_config = CollectionConfiguration { + hnsw: Some(hnsw_config.into()), + spann: Some(spann_config.into()), + embedding_function: embedding, + }; + + let result = InternalCollectionConfiguration::try_from_config( + collection_config, + knn, + None, + ); + + prop_assert!(matches!( + result, + Err(CollectionConfigurationToInternalConfigurationError::MultipleVectorIndexConfigurations) + )); + } + } + } } diff --git a/rust/types/src/collection_schema.rs b/rust/types/src/collection_schema.rs index 50df6fc634f..f168c96fd96 100644 --- a/rust/types/src/collection_schema.rs +++ b/rust/types/src/collection_schema.rs @@ -5583,4 +5583,767 @@ mod tests { assert!(schema.keys.contains_key(EMBEDDING_KEY)); assert_eq!(schema.keys.len(), 3); } + + #[cfg(feature = "testing")] + mod proptests { + use super::*; + use crate::proptest_utils::strategies::{ + 
embedding_function_strategy, internal_collection_configuration_strategy, + internal_hnsw_configuration_strategy, internal_spann_configuration_strategy, + knn_index_strategy, space_strategy, + }; + use crate::{ + strategies::TEST_NAME_PATTERN, HnswIndexConfig, SpannIndexConfig, VectorIndexConfig, + DOCUMENT_KEY, EMBEDDING_KEY, + }; + use proptest::prelude::*; + use proptest::strategy::BoxedStrategy; + use proptest::string::string_regex; + use serde_json::json; + + fn default_embedding_function_strategy( + ) -> impl Strategy> { + proptest::option::of(prop_oneof![ + Just(EmbeddingFunctionConfiguration::Unknown), + Just(EmbeddingFunctionConfiguration::Known( + EmbeddingFunctionNewConfiguration { + name: "default".to_string(), + config: json!({ "alpha": 1 }), + } + )), + ]) + } + + fn sparse_embedding_function_strategy( + ) -> impl Strategy> { + let known_strategy = string_regex(TEST_NAME_PATTERN).unwrap().prop_map(|name| { + EmbeddingFunctionConfiguration::Known(EmbeddingFunctionNewConfiguration { + name, + config: json!({ "alpha": 1 }), + }) + }); + + proptest::option::of(prop_oneof![ + Just(EmbeddingFunctionConfiguration::Unknown), + known_strategy, + ]) + } + + fn non_default_internal_collection_configuration_strategy( + ) -> impl Strategy { + internal_collection_configuration_strategy() + .prop_filter("non-default configuration", |config| !config.is_default()) + } + + fn partial_hnsw_index_config_strategy() -> impl Strategy { + ( + proptest::option::of(1usize..=512), + proptest::option::of(1usize..=128), + proptest::option::of(1usize..=512), + proptest::option::of(1usize..=64), + proptest::option::of(2usize..=4096), + proptest::option::of(2usize..=4096), + proptest::option::of(prop_oneof![ + Just(0.5f64), + Just(1.0f64), + Just(1.5f64), + Just(2.0f64) + ]), + ) + .prop_map( + |( + ef_construction, + max_neighbors, + ef_search, + num_threads, + batch_size, + sync_threshold, + resize_factor, + )| HnswIndexConfig { + ef_construction, + max_neighbors, + ef_search, + 
num_threads, + batch_size, + sync_threshold, + resize_factor, + }, + ) + } + + fn partial_spann_index_config_strategy() -> BoxedStrategy { + // Use the internal strategy and convert it, wrapping every field in Some (no field is left as None) + internal_spann_configuration_strategy() + .prop_map(|config| { + // For property testing, we'll test with full configs - the merge logic handles None correctly + SpannIndexConfig { + search_nprobe: Some(config.search_nprobe), + search_rng_factor: Some(config.search_rng_factor), + search_rng_epsilon: Some(config.search_rng_epsilon), + nreplica_count: Some(config.nreplica_count), + write_rng_factor: Some(config.write_rng_factor), + write_rng_epsilon: Some(config.write_rng_epsilon), + split_threshold: Some(config.split_threshold), + num_samples_kmeans: Some(config.num_samples_kmeans), + initial_lambda: Some(config.initial_lambda), + reassign_neighbor_count: Some(config.reassign_neighbor_count), + merge_threshold: Some(config.merge_threshold), + num_centers_to_merge_to: Some(config.num_centers_to_merge_to), + write_nprobe: Some(config.write_nprobe), + ef_construction: Some(config.ef_construction), + ef_search: Some(config.ef_search), + max_neighbors: Some(config.max_neighbors), + } + }) + .boxed() + } + + proptest! 
{ + #[test] + fn merge_hnsw_configs_preserves_user_overrides( + base in partial_hnsw_index_config_strategy(), + user in partial_hnsw_index_config_strategy(), + ) { + let merged = Schema::merge_hnsw_configs(Some(&base), Some(&user)) + .expect("merge should return Some when both are Some"); + + // Property: user values always take precedence when Some + if user.ef_construction.is_some() { + prop_assert_eq!(merged.ef_construction, user.ef_construction); + } + if user.max_neighbors.is_some() { + prop_assert_eq!(merged.max_neighbors, user.max_neighbors); + } + if user.ef_search.is_some() { + prop_assert_eq!(merged.ef_search, user.ef_search); + } + if user.num_threads.is_some() { + prop_assert_eq!(merged.num_threads, user.num_threads); + } + if user.batch_size.is_some() { + prop_assert_eq!(merged.batch_size, user.batch_size); + } + if user.sync_threshold.is_some() { + prop_assert_eq!(merged.sync_threshold, user.sync_threshold); + } + if user.resize_factor.is_some() { + prop_assert_eq!(merged.resize_factor, user.resize_factor); + } + } + + #[test] + fn merge_hnsw_configs_falls_back_to_base_when_user_is_none( + base in partial_hnsw_index_config_strategy(), + ) { + let merged = Schema::merge_hnsw_configs(Some(&base), None) + .expect("merge should return Some when base is Some"); + + // Property: when user is None, base values are preserved + prop_assert_eq!(merged, base); + } + + #[test] + fn merge_hnsw_configs_returns_user_when_base_is_none( + user in partial_hnsw_index_config_strategy(), + ) { + let merged = Schema::merge_hnsw_configs(None, Some(&user)) + .expect("merge should return Some when user is Some"); + + // Property: when base is None, user values are preserved + prop_assert_eq!(merged, user); + } + + #[test] + fn merge_spann_configs_preserves_user_overrides( + base in partial_spann_index_config_strategy(), + user in partial_spann_index_config_strategy(), + ) { + let merged = Schema::merge_spann_configs(Some(&base), Some(&user)) + .expect("merge should return 
Some when both are Some"); + + // Property: user values always take precedence when Some + if user.search_nprobe.is_some() { + prop_assert_eq!(merged.search_nprobe, user.search_nprobe); + } + if user.search_rng_epsilon.is_some() { + prop_assert_eq!(merged.search_rng_epsilon, user.search_rng_epsilon); + } + if user.split_threshold.is_some() { + prop_assert_eq!(merged.split_threshold, user.split_threshold); + } + if user.ef_construction.is_some() { + prop_assert_eq!(merged.ef_construction, user.ef_construction); + } + if user.ef_search.is_some() { + prop_assert_eq!(merged.ef_search, user.ef_search); + } + if user.max_neighbors.is_some() { + prop_assert_eq!(merged.max_neighbors, user.max_neighbors); + } + } + + #[test] + fn merge_spann_configs_falls_back_to_base_when_user_is_none( + base in partial_spann_index_config_strategy(), + ) { + let merged = Schema::merge_spann_configs(Some(&base), None) + .expect("merge should return Some when base is Some"); + + // Property: when user is None, base values are preserved + prop_assert_eq!(merged, base); + } + + #[test] + fn merge_vector_index_config_preserves_user_overrides( + base in vector_index_config_strategy(), + user in vector_index_config_strategy(), + knn in knn_index_strategy(), + ) { + let merged = Schema::merge_vector_index_config(&base, &user, knn); + + // Property: user values take precedence for top-level fields + if user.space.is_some() { + prop_assert_eq!(merged.space, user.space); + } + if user.embedding_function.is_some() { + prop_assert_eq!(merged.embedding_function, user.embedding_function); + } + if user.source_key.is_some() { + prop_assert_eq!(merged.source_key, user.source_key); + } + + // Property: nested configs are merged according to merge rules + match knn { + KnnIndex::Hnsw => { + if let (Some(_base_hnsw), Some(user_hnsw)) = (&base.hnsw, &user.hnsw) { + let merged_hnsw = merged.hnsw.as_ref().expect("hnsw should be Some"); + if user_hnsw.ef_construction.is_some() { + 
prop_assert_eq!(merged_hnsw.ef_construction, user_hnsw.ef_construction); + } + } + } + KnnIndex::Spann => { + if let (Some(_base_spann), Some(user_spann)) = (&base.spann, &user.spann) { + let merged_spann = merged.spann.as_ref().expect("spann should be Some"); + if user_spann.search_nprobe.is_some() { + prop_assert_eq!(merged_spann.search_nprobe, user_spann.search_nprobe); + } + } + } + } + } + } + + fn expected_vector_index_config( + config: &InternalCollectionConfiguration, + ) -> VectorIndexConfig { + match &config.vector_index { + VectorIndexConfiguration::Hnsw(hnsw_config) => VectorIndexConfig { + space: Some(hnsw_config.space.clone()), + embedding_function: config.embedding_function.clone(), + source_key: None, + hnsw: Some(HnswIndexConfig { + ef_construction: Some(hnsw_config.ef_construction), + max_neighbors: Some(hnsw_config.max_neighbors), + ef_search: Some(hnsw_config.ef_search), + num_threads: Some(hnsw_config.num_threads), + batch_size: Some(hnsw_config.batch_size), + sync_threshold: Some(hnsw_config.sync_threshold), + resize_factor: Some(hnsw_config.resize_factor), + }), + spann: None, + }, + VectorIndexConfiguration::Spann(spann_config) => VectorIndexConfig { + space: Some(spann_config.space.clone()), + embedding_function: config.embedding_function.clone(), + source_key: None, + hnsw: None, + spann: Some(SpannIndexConfig { + search_nprobe: Some(spann_config.search_nprobe), + search_rng_factor: Some(spann_config.search_rng_factor), + search_rng_epsilon: Some(spann_config.search_rng_epsilon), + nreplica_count: Some(spann_config.nreplica_count), + write_rng_factor: Some(spann_config.write_rng_factor), + write_rng_epsilon: Some(spann_config.write_rng_epsilon), + split_threshold: Some(spann_config.split_threshold), + num_samples_kmeans: Some(spann_config.num_samples_kmeans), + initial_lambda: Some(spann_config.initial_lambda), + reassign_neighbor_count: Some(spann_config.reassign_neighbor_count), + merge_threshold: Some(spann_config.merge_threshold), + 
num_centers_to_merge_to: Some(spann_config.num_centers_to_merge_to), + write_nprobe: Some(spann_config.write_nprobe), + ef_construction: Some(spann_config.ef_construction), + ef_search: Some(spann_config.ef_search), + max_neighbors: Some(spann_config.max_neighbors), + }), + }, + } + } + + fn non_special_key_strategy() -> BoxedStrategy { + string_regex(TEST_NAME_PATTERN) + .unwrap() + .prop_filter("exclude special keys", |key| { + key != DOCUMENT_KEY && key != EMBEDDING_KEY + }) + .boxed() + } + + fn source_key_strategy() -> BoxedStrategy> { + proptest::option::of(prop_oneof![ + Just(DOCUMENT_KEY.to_string()), + string_regex(TEST_NAME_PATTERN).unwrap(), + ]) + .boxed() + } + + fn fts_index_type_strategy() -> impl Strategy { + any::().prop_map(|enabled| FtsIndexType { + enabled, + config: FtsIndexConfig {}, + }) + } + + fn string_inverted_index_type_strategy() -> impl Strategy { + any::().prop_map(|enabled| StringInvertedIndexType { + enabled, + config: StringInvertedIndexConfig {}, + }) + } + + fn string_value_type_strategy() -> BoxedStrategy> { + proptest::option::of( + ( + proptest::option::of(string_inverted_index_type_strategy()), + proptest::option::of(fts_index_type_strategy()), + ) + .prop_map(|(string_inverted_index, fts_index)| { + StringValueType { + string_inverted_index, + fts_index, + } + }), + ) + .boxed() + } + + fn float_inverted_index_type_strategy() -> impl Strategy { + any::().prop_map(|enabled| FloatInvertedIndexType { + enabled, + config: FloatInvertedIndexConfig {}, + }) + } + + fn float_value_type_strategy() -> BoxedStrategy> { + proptest::option::of( + proptest::option::of(float_inverted_index_type_strategy()).prop_map( + |float_inverted_index| FloatValueType { + float_inverted_index, + }, + ), + ) + .boxed() + } + + fn int_inverted_index_type_strategy() -> impl Strategy { + any::().prop_map(|enabled| IntInvertedIndexType { + enabled, + config: IntInvertedIndexConfig {}, + }) + } + + fn int_value_type_strategy() -> BoxedStrategy> { + 
proptest::option::of( + proptest::option::of(int_inverted_index_type_strategy()) + .prop_map(|int_inverted_index| IntValueType { int_inverted_index }), + ) + .boxed() + } + + fn bool_inverted_index_type_strategy() -> impl Strategy { + any::().prop_map(|enabled| BoolInvertedIndexType { + enabled, + config: BoolInvertedIndexConfig {}, + }) + } + + fn bool_value_type_strategy() -> BoxedStrategy> { + proptest::option::of( + proptest::option::of(bool_inverted_index_type_strategy()).prop_map( + |bool_inverted_index| BoolValueType { + bool_inverted_index, + }, + ), + ) + .boxed() + } + + fn sparse_vector_index_config_strategy() -> impl Strategy { + ( + sparse_embedding_function_strategy(), + source_key_strategy(), + proptest::option::of(any::()), + ) + .prop_map(|(embedding_function, source_key, bm25)| { + SparseVectorIndexConfig { + embedding_function, + source_key, + bm25, + } + }) + } + + fn sparse_vector_value_type_strategy() -> BoxedStrategy> { + proptest::option::of( + ( + any::(), + proptest::option::of(sparse_vector_index_config_strategy()), + ) + .prop_map(|(enabled, config)| SparseVectorValueType { + sparse_vector_index: config.map(|cfg| SparseVectorIndexType { + enabled, + config: cfg, + }), + }), + ) + .boxed() + } + + fn hnsw_index_config_strategy() -> impl Strategy { + internal_hnsw_configuration_strategy().prop_map(|config| HnswIndexConfig { + ef_construction: Some(config.ef_construction), + max_neighbors: Some(config.max_neighbors), + ef_search: Some(config.ef_search), + num_threads: Some(config.num_threads), + batch_size: Some(config.batch_size), + sync_threshold: Some(config.sync_threshold), + resize_factor: Some(config.resize_factor), + }) + } + + fn spann_index_config_strategy() -> impl Strategy { + internal_spann_configuration_strategy().prop_map(|config| SpannIndexConfig { + search_nprobe: Some(config.search_nprobe), + search_rng_factor: Some(config.search_rng_factor), + search_rng_epsilon: Some(config.search_rng_epsilon), + nreplica_count: 
Some(config.nreplica_count), + write_rng_factor: Some(config.write_rng_factor), + write_rng_epsilon: Some(config.write_rng_epsilon), + split_threshold: Some(config.split_threshold), + num_samples_kmeans: Some(config.num_samples_kmeans), + initial_lambda: Some(config.initial_lambda), + reassign_neighbor_count: Some(config.reassign_neighbor_count), + merge_threshold: Some(config.merge_threshold), + num_centers_to_merge_to: Some(config.num_centers_to_merge_to), + write_nprobe: Some(config.write_nprobe), + ef_construction: Some(config.ef_construction), + ef_search: Some(config.ef_search), + max_neighbors: Some(config.max_neighbors), + }) + } + + fn vector_index_config_strategy() -> impl Strategy { + ( + proptest::option::of(space_strategy()), + embedding_function_strategy(), + source_key_strategy(), + proptest::option::of(hnsw_index_config_strategy()), + proptest::option::of(spann_index_config_strategy()), + ) + .prop_map(|(space, embedding_function, source_key, hnsw, spann)| { + VectorIndexConfig { + space, + embedding_function, + source_key, + hnsw, + spann, + } + }) + } + + fn vector_index_type_strategy() -> impl Strategy { + (any::(), vector_index_config_strategy()) + .prop_map(|(enabled, config)| VectorIndexType { enabled, config }) + } + + fn float_list_value_type_strategy() -> BoxedStrategy> { + proptest::option::of( + proptest::option::of(vector_index_type_strategy()) + .prop_map(|vector_index| FloatListValueType { vector_index }), + ) + .boxed() + } + + fn value_types_strategy() -> BoxedStrategy { + ( + string_value_type_strategy(), + float_list_value_type_strategy(), + sparse_vector_value_type_strategy(), + int_value_type_strategy(), + float_value_type_strategy(), + bool_value_type_strategy(), + ) + .prop_map( + |(string, float_list, sparse_vector, int, float, boolean)| ValueTypes { + string, + float_list, + sparse_vector, + int, + float, + boolean, + }, + ) + .boxed() + } + + fn schema_strategy() -> BoxedStrategy { + ( + value_types_strategy(), + 
proptest::collection::hash_map( + non_special_key_strategy(), + value_types_strategy(), + 0..=3, + ), + proptest::option::of(value_types_strategy()), + proptest::option::of(value_types_strategy()), + ) + .prop_map( + |(defaults, mut extra_keys, document_override, embedding_override)| { + if let Some(doc) = document_override { + extra_keys.insert(DOCUMENT_KEY.to_string(), doc); + } + if let Some(embed) = embedding_override { + extra_keys.insert(EMBEDDING_KEY.to_string(), embed); + } + Schema { + defaults, + keys: extra_keys, + } + }, + ) + .boxed() + } + + fn force_non_default_schema(mut schema: Schema) -> Schema { + if schema.is_default() { + if let Some(string_value) = schema + .defaults + .string + .as_mut() + .and_then(|string_value| string_value.string_inverted_index.as_mut()) + { + string_value.enabled = !string_value.enabled; + } else { + schema.defaults.string = Some(StringValueType { + string_inverted_index: Some(StringInvertedIndexType { + enabled: false, + config: StringInvertedIndexConfig {}, + }), + fts_index: None, + }); + } + } + schema + } + + fn non_default_schema_strategy() -> BoxedStrategy { + schema_strategy().prop_map(force_non_default_schema).boxed() + } + + fn extract_vector_configs(schema: &Schema) -> (VectorIndexConfig, VectorIndexConfig) { + let defaults = schema + .defaults + .float_list + .as_ref() + .and_then(|fl| fl.vector_index.as_ref()) + .map(|vi| vi.config.clone()) + .expect("defaults vector index missing"); + + let embedding = schema + .keys + .get(EMBEDDING_KEY) + .and_then(|value_types| value_types.float_list.as_ref()) + .and_then(|fl| fl.vector_index.as_ref()) + .map(|vi| vi.config.clone()) + .expect("#embedding vector index missing"); + + (defaults, embedding) + } + + proptest! 
{ + #[test] + fn reconcile_schema_and_config_matches_convert_for_config_only( + config in internal_collection_configuration_strategy(), + knn in knn_index_strategy(), + ) { + let result = Schema::reconcile_schema_and_config(None, Some(&config), knn) + .expect("reconciliation should succeed"); + + let (defaults_vi, embedding_vi) = extract_vector_configs(&result); + let expected_config = expected_vector_index_config(&config); + + prop_assert_eq!(defaults_vi, expected_config.clone()); + + let mut expected_embedding_config = expected_config; + expected_embedding_config.source_key = Some(DOCUMENT_KEY.to_string()); + prop_assert_eq!(embedding_vi, expected_embedding_config); + + prop_assert_eq!(result.keys.len(), 2); + } + } + + proptest! { + #[test] + fn reconcile_schema_and_config_errors_when_both_non_default( + config in non_default_internal_collection_configuration_strategy(), + knn in knn_index_strategy(), + ) { + let schema = Schema::try_from(&config) + .expect("conversion should succeed"); + prop_assume!(!schema.is_default()); + + let result = Schema::reconcile_schema_and_config(Some(&schema), Some(&config), knn); + + prop_assert!(matches!(result, Err(SchemaError::ConfigAndSchemaConflict))); + } + } + + proptest! 
{
+            #[test]
+            fn reconcile_schema_and_config_matches_schema_only_path(
+                schema in schema_strategy(),
+                knn in knn_index_strategy(),
+            ) {
+                // Reconcile the schema on its own: no collection configuration is supplied.
+                let reconciled = Schema::reconcile_schema_and_config(Some(&schema), None, knn)
+                    .expect("reconciliation should succeed");
+                let (defaults_vi, embedding_vi) = extract_vector_configs(&reconciled);
+
+                // Property: whatever the schema sets on its defaults.float_list vector index
+                // takes precedence and must show up in the reconciled defaults.
+                let schema_defaults_vi = schema
+                    .defaults
+                    .float_list
+                    .as_ref()
+                    .and_then(|float_list| float_list.vector_index.as_ref());
+                if let Some(schema_vi) = schema_defaults_vi {
+                    if let Some(space) = schema_vi.config.space.as_ref() {
+                        prop_assert_eq!(defaults_vi.space, Some(space.clone()));
+                    }
+                    if let Some(ef) = schema_vi.config.embedding_function.as_ref() {
+                        prop_assert_eq!(defaults_vi.embedding_function, Some(ef.clone()));
+                    }
+                    // Nested index parameters are compared only for the selected index kind, and
+                    // only when both the schema and the reconciled defaults carry that section.
+                    match knn {
+                        KnnIndex::Hnsw => {
+                            let schema_hnsw = schema_vi.config.hnsw.as_ref();
+                            let merged_hnsw = defaults_vi.hnsw.as_ref();
+                            if let (Some(wanted), Some(merged)) = (schema_hnsw, merged_hnsw) {
+                                if let Some(ef_construction) = wanted.ef_construction {
+                                    prop_assert_eq!(merged.ef_construction, Some(ef_construction));
+                                }
+                            }
+                        }
+                        KnnIndex::Spann => {
+                            let schema_spann = schema_vi.config.spann.as_ref();
+                            let merged_spann = defaults_vi.spann.as_ref();
+                            if let (Some(wanted), Some(merged)) = (schema_spann, merged_spann) {
+                                if let Some(search_nprobe) = wanted.search_nprobe {
+                                    prop_assert_eq!(merged.search_nprobe, Some(search_nprobe));
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // Property: a space set under the schema's #embedding key reaches embedding_vi.
+                let schema_embedding_space = schema
+                    .keys
+                    .get(EMBEDDING_KEY)
+                    .and_then(|value_types| value_types.float_list.as_ref())
+                    .and_then(|float_list| float_list.vector_index.as_ref())
+                    .and_then(|vector_index| vector_index.config.space.as_ref());
+                if let Some(space) = schema_embedding_space {
+                    prop_assert_eq!(embedding_vi.space, Some(space.clone()));
+                }
+            }
+        }
+
+        proptest! {
+            #[test]
+            fn reconcile_schema_and_config_with_default_schema_and_default_config_applies_embedding_function(
+                embedding_function in default_embedding_function_strategy(),
+                knn in knn_index_strategy(),
+            ) {
+                // Default schema plus default configuration; only the embedding function varies.
+                let schema = Schema::new_default(knn);
+                let base = match knn {
+                    KnnIndex::Hnsw => InternalCollectionConfiguration::default_hnsw(),
+                    KnnIndex::Spann => InternalCollectionConfiguration::default_spann(),
+                };
+                let config = InternalCollectionConfiguration {
+                    embedding_function: embedding_function.clone(),
+                    ..base
+                };
+
+                let reconciled =
+                    Schema::reconcile_schema_and_config(Some(&schema), Some(&config), knn)
+                        .expect("reconciliation should succeed");
+                let (defaults_vi, embedding_vi) = extract_vector_configs(&reconciled);
+
+                // Property: the configuration's embedding function ends up on both the defaults
+                // and the embedding vector configs. A `Some` value is applied to both, and a
+                // `None` leaves both as `None`.
+                prop_assert_eq!(defaults_vi.embedding_function, embedding_function.clone());
+                prop_assert_eq!(embedding_vi.embedding_function, embedding_function);
+            }
+        }
+
+        proptest! {
+            #[test]
+            fn reconcile_schema_and_config_with_default_config_keeps_non_default_schema(
+                schema in non_default_schema_strategy(),
+                knn in knn_index_strategy(),
+            ) {
+                // Pair the generated schema with the default configuration for this index kind.
+                let default_config = match knn {
+                    KnnIndex::Hnsw => InternalCollectionConfiguration::default_hnsw(),
+                    KnnIndex::Spann => InternalCollectionConfiguration::default_spann(),
+                };
+                let reconciled =
+                    Schema::reconcile_schema_and_config(Some(&schema), Some(&default_config), knn)
+                        .expect("reconciliation should succeed");
+                let (defaults_vi, embedding_vi) = extract_vector_configs(&reconciled);
+
+                // Property: with a default config, the space and embedding function set on the
+                // schema's defaults.float_list vector index are preserved in the result.
+                let schema_defaults_vi = schema
+                    .defaults
+                    .float_list
+                    .as_ref()
+                    .and_then(|float_list| float_list.vector_index.as_ref());
+                if let Some(schema_vi) = schema_defaults_vi {
+                    if let Some(space) = schema_vi.config.space.as_ref() {
+                        prop_assert_eq!(defaults_vi.space, Some(space.clone()));
+                    }
+                    if let Some(ef) = schema_vi.config.embedding_function.as_ref() {
+                        prop_assert_eq!(defaults_vi.embedding_function, Some(ef.clone()));
+                    }
+                }
+
+                // Property: a space set under the schema's #embedding key is applied to the
+                // reconciled embedding vector config.
+                let schema_embedding_space = schema
+                    .keys
+                    .get(EMBEDDING_KEY)
+                    .and_then(|value_types| value_types.float_list.as_ref())
+                    .and_then(|float_list| float_list.vector_index.as_ref())
+                    .and_then(|vector_index| vector_index.config.space.as_ref());
+                if let Some(space) = schema_embedding_space {
+                    prop_assert_eq!(embedding_vi.space, Some(space.clone()));
+                }
+            }
+        }
+    }
 }
diff --git a/rust/types/src/lib.rs b/rust/types/src/lib.rs
index 9555016ff9b..223bf9df702 100644
--- a/rust/types/src/lib.rs
+++ b/rust/types/src/lib.rs
@@ -14,6 +14,8 @@ mod log;
 mod metadata;
 mod operation;
 pub mod operators;
+#[cfg(feature = "testing")]
+mod proptest_utils;
 mod record;
 mod scalar_encoding;
 mod segment;
diff --git a/rust/types/src/proptest_utils.rs
b/rust/types/src/proptest_utils.rs
new file mode 100644
index 00000000000..eeb6eff80f6
--- /dev/null
+++ b/rust/types/src/proptest_utils.rs
@@ -0,0 +1,188 @@
+/// Proptest strategies that generate collection and vector-index configuration values.
+#[cfg(feature = "testing")]
+#[allow(dead_code)] // Functions are used in test modules that are conditionally compiled
+pub mod strategies {
+    use crate::hnsw_configuration::Space;
+    use crate::{
+        strategies::TEST_NAME_PATTERN, EmbeddingFunctionConfiguration,
+        EmbeddingFunctionNewConfiguration, InternalCollectionConfiguration,
+        InternalHnswConfiguration, InternalSpannConfiguration, KnnIndex, VectorIndexConfiguration,
+    };
+    use proptest::prelude::*;
+    use proptest::string::string_regex;
+    use serde_json::json;
+
+    /// Generates an optional embedding function configuration.
+    ///
+    /// Yields `None`, `Legacy`, or a `Known` configuration whose name matches
+    /// [`TEST_NAME_PATTERN`] and whose config is the fixed JSON object `{ "alpha": 1 }`.
+    pub fn embedding_function_strategy(
+    ) -> impl Strategy<Value = Option<EmbeddingFunctionConfiguration>> {
+        let known_strategy = string_regex(TEST_NAME_PATTERN)
+            .expect("TEST_NAME_PATTERN must be a valid regex")
+            .prop_map(|name| {
+                EmbeddingFunctionConfiguration::Known(EmbeddingFunctionNewConfiguration {
+                    name,
+                    config: json!({ "alpha": 1 }),
+                })
+            });
+
+        proptest::option::of(prop_oneof![
+            Just(EmbeddingFunctionConfiguration::Legacy),
+            known_strategy,
+        ])
+    }
+
+    /// Generates one of the three distance spaces: `L2`, `Cosine`, or `Ip`.
+    pub fn space_strategy() -> impl Strategy<Value = Space> {
+        prop_oneof![Just(Space::L2), Just(Space::Cosine), Just(Space::Ip),]
+    }
+
+    /// Generates an HNSW configuration; the range of each field is noted inline.
+    pub fn internal_hnsw_configuration_strategy() -> impl Strategy<Value = InternalHnswConfiguration>
+    {
+        (
+            space_strategy(), // space
+            1usize..=256,     // ef_construction
+            1usize..=256,     // ef_search
+            1usize..=64,      // max_neighbors
+            1usize..=32,      // num_threads
+            prop_oneof![Just(0.5f64), Just(1.0f64), Just(1.5f64), Just(2.0f64)], // resize_factor
+            2usize..=4096,    // sync_threshold
+            2usize..=4096,    // batch_size
+        )
+            .prop_map(
+                |(
+                    space,
+                    ef_construction,
+                    ef_search,
+                    max_neighbors,
+                    num_threads,
+                    resize_factor,
+                    sync_threshold,
+                    batch_size,
+                )| InternalHnswConfiguration {
+                    space,
+                    ef_construction,
+                    ef_search,
+                    max_neighbors,
+                    num_threads,
+                    resize_factor,
+                    sync_threshold,
+                    batch_size,
+                },
+            )
+    }
+
+    /// Generates one of the values used for the SPANN `*_rng_epsilon` fields: 5.0, 7.5, or 10.0.
+    pub fn spann_epsilon_strategy() -> impl Strategy<Value = f32> {
+        prop_oneof![Just(5.0f32), Just(7.5f32), Just(10.0f32)]
+    }
+
+    /// Generates a SPANN configuration; the range of each field is noted inline.
+    ///
+    /// The 17 inputs are grouped into two nested tuples, presumably because proptest only
+    /// implements `Strategy` for tuples up to a limited arity (TODO confirm the exact limit).
+    pub fn internal_spann_configuration_strategy(
+    ) -> impl Strategy<Value = InternalSpannConfiguration> {
+        (
+            (
+                1u32..=128,               // search_nprobe
+                Just(1.0f32),             // search_rng_factor (validated == 1.0)
+                spann_epsilon_strategy(), // search_rng_epsilon ∈ [5, 10]
+                1u32..=64,                // write_nprobe (max 64)
+                1u32..=8,                 // nreplica_count (max 8)
+                Just(1.0f32),             // write_rng_factor (validated == 1.0)
+                spann_epsilon_strategy(), // write_rng_epsilon ∈ [5, 10]
+                50u32..=200,              // split_threshold (min 50, max 200)
+                1usize..=1000,            // num_samples_kmeans (max 1000)
+            ),
+            (
+                Just(100.0f32),   // initial_lambda (validated == 100)
+                1u32..=64,        // reassign_neighbor_count (max 64)
+                25u32..=100,      // merge_threshold (min 25, max 100)
+                1u32..=8,         // num_centers_to_merge_to (max 8)
+                space_strategy(), // space
+                1usize..=200,     // ef_construction (max 200)
+                1usize..=200,     // ef_search (max 200)
+                1usize..=64,      // max_neighbors (max 64)
+            ),
+        )
+            .prop_map(
+                |(
+                    (
+                        search_nprobe,
+                        search_rng_factor,
+                        search_rng_epsilon,
+                        write_nprobe,
+                        nreplica_count,
+                        write_rng_factor,
+                        write_rng_epsilon,
+                        split_threshold,
+                        num_samples_kmeans,
+                    ),
+                    (
+                        initial_lambda,
+                        reassign_neighbor_count,
+                        merge_threshold,
+                        num_centers_to_merge_to,
+                        space,
+                        ef_construction,
+                        ef_search,
+                        max_neighbors,
+                    ),
+                )| InternalSpannConfiguration {
+                    search_nprobe,
+                    search_rng_factor,
+                    search_rng_epsilon,
+                    write_nprobe,
+                    nreplica_count,
+                    write_rng_factor,
+                    write_rng_epsilon,
+                    split_threshold,
+                    num_samples_kmeans,
+                    initial_lambda,
+                    reassign_neighbor_count,
+                    merge_threshold,
+                    num_centers_to_merge_to,
+                    space,
+                    ef_construction,
+                    ef_search,
+                    max_neighbors,
+                },
+            )
+    }
+
+    /// Generates either KNN index kind: `Hnsw` or `Spann`.
+    pub fn knn_index_strategy() -> impl Strategy<Value = KnnIndex> {
+        prop_oneof![Just(KnnIndex::Hnsw), Just(KnnIndex::Spann),]
+    }
+
+    /// Generates a collection configuration backed by either an HNSW or a SPANN vector index,
+    /// paired with an optional embedding function from [`embedding_function_strategy`].
+    pub fn internal_collection_configuration_strategy(
+    ) -> impl Strategy<Value = InternalCollectionConfiguration> {
+        prop_oneof![
+            (
+                internal_hnsw_configuration_strategy(),
+                embedding_function_strategy()
+            )
+                .prop_map(|(hnsw, embedding_function)| {
+                    InternalCollectionConfiguration {
+                        vector_index: VectorIndexConfiguration::Hnsw(hnsw),
+                        embedding_function,
+                    }
+                }),
+            (
+                internal_spann_configuration_strategy(),
+                embedding_function_strategy()
+            )
+                .prop_map(|(spann, embedding_function)| {
+                    InternalCollectionConfiguration {
+                        vector_index: VectorIndexConfiguration::Spann(spann),
+                        embedding_function,
+                    }
+                }),
+        ]
+    }
+}
diff --git a/rust/types/src/strategies.rs b/rust/types/src/strategies.rs
index f25578f8882..36f7e01bf0e 100644
--- a/rust/types/src/strategies.rs
+++ b/rust/types/src/strategies.rs
@@ -7,6 +7,9 @@ use crate::{
 use proptest::{collection, prelude::*, sample::SizeRange, string::string_regex};
 use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange};
 
+/// Regex for names generated in tests: 1 to 16 lowercase ASCII letters.
+pub const TEST_NAME_PATTERN: &str = "[a-z]{1,16}";
+
 /**
  * Strategy for valid metadata keys.
  * Keys cannot be empty and cannot start with '#' or '$'.