diff --git a/go/pkg/sysdb/coordinator/model/collection_configuration.go b/go/pkg/sysdb/coordinator/model/collection_configuration.go index 032aad5b2e7..42104b5c882 100644 --- a/go/pkg/sysdb/coordinator/model/collection_configuration.go +++ b/go/pkg/sysdb/coordinator/model/collection_configuration.go @@ -249,7 +249,7 @@ type ValueTypes struct { SparseVector *SparseVectorValueType `json:"sparse_vector,omitempty"` Int *IntValueType `json:"int,omitempty"` Float *FloatValueType `json:"float,omitempty"` - Boolean *BoolValueType `json:"boolean,omitempty"` + Boolean *BoolValueType `json:"bool,omitempty"` } type Schema struct { diff --git a/go/pkg/sysdb/coordinator/model/collection_configuration_test.go b/go/pkg/sysdb/coordinator/model/collection_configuration_test.go index bf48c62ac5a..d378b2a1188 100644 --- a/go/pkg/sysdb/coordinator/model/collection_configuration_test.go +++ b/go/pkg/sysdb/coordinator/model/collection_configuration_test.go @@ -147,7 +147,7 @@ func TestUpdateSchemaFromConfig_HnswSuccess(t *testing.T) { "config": {} } }, - "boolean": { + "bool": { "bool_inverted_index": { "enabled": true, "config": {} @@ -312,7 +312,7 @@ func TestUpdateSchemaFromConfig_SpannSuccess(t *testing.T) { "config": {} } }, - "boolean": { + "bool": { "bool_inverted_index": { "enabled": true, "config": {} @@ -481,7 +481,7 @@ func TestUpdateSchemaFromConfig_EmbeddingFunction(t *testing.T) { "config": {} } }, - "boolean": { + "bool": { "bool_inverted_index": { "enabled": true, "config": {} diff --git a/go/pkg/sysdb/coordinator/table_catalog_test.go b/go/pkg/sysdb/coordinator/table_catalog_test.go index 5b9898825a9..468b3cbf08e 100644 --- a/go/pkg/sysdb/coordinator/table_catalog_test.go +++ b/go/pkg/sysdb/coordinator/table_catalog_test.go @@ -1555,7 +1555,7 @@ func TestUpdateCollection_WithSchema(t *testing.T) { "config": {} } }, - "boolean": { + "bool": { "bool_inverted_index": { "enabled": true, "config": {} diff --git a/rust/cli/src/commands/vacuum.rs b/rust/cli/src/commands/vacuum.rs index 9f505193364..4f17987b07e 100644 --- a/rust/cli/src/commands/vacuum.rs +++ b/rust/cli/src/commands/vacuum.rs @@ -11,7 +11,7 @@ use chroma_segment::local_segment_manager::LocalSegmentManager; use chroma_sqlite::db::SqliteDb; use chroma_sysdb::SysDb; use chroma_system::System; -use chroma_types::{CollectionUuid, KnnIndex, ListCollectionsRequest}; +use chroma_types::{CollectionUuid, ListCollectionsRequest, Schema}; use clap::Parser; use colored::Colorize; use dialoguer::Confirm; @@ -101,17 +101,18 @@ async fn trigger_vector_segments_max_seq_id_migration( sqlite: &SqliteDb, sysdb: &mut SysDb, segment_manager: &LocalSegmentManager, - default_knn_index: KnnIndex, ) -> Result<(), Box> { let collection_ids = get_collection_ids_to_migrate(sqlite).await?; for collection_id in collection_ids { let mut collection = sysdb.get_collection_with_segments(collection_id).await?; - collection - .collection - .reconcile_schema_with_config(default_knn_index) - .map_err(|e| Box::new(e) as Box)?; + if collection.collection.schema.is_none() { + collection.collection.schema = Some( + Schema::try_from(&collection.collection.config) + .map_err(|e| Box::new(e) as Box)?, + ); + } // If collection is uninitialized, that means nothing has been written yet. let dim = match collection.collection.dimension { @@ -153,13 +154,7 @@ pub async fn vacuum_chroma(config: FrontendConfig) -> Result<(), Box> println!("Purging the log...\n"); - trigger_vector_segments_max_seq_id_migration( - &sqlite, - &mut sysdb, - &segment_manager, - config.default_knn_index, - ) - .await?; + trigger_vector_segments_max_seq_id_migration(&sqlite, &mut sysdb, &segment_manager).await?; let tenant = String::from("default_tenant"); let database = String::from("default_database"); diff --git a/rust/frontend/src/get_collection_with_segments_provider.rs b/rust/frontend/src/get_collection_with_segments_provider.rs index c5622220acd..74e4fbff4a3 100644 --- a/rust/frontend/src/get_collection_with_segments_provider.rs +++ b/rust/frontend/src/get_collection_with_segments_provider.rs @@ -4,8 +4,7 @@ use chroma_config::Configurable; use chroma_error::{ChromaError, ErrorCodes}; use chroma_sysdb::SysDb; use chroma_types::{ - CollectionAndSegments, CollectionUuid, GetCollectionWithSegmentsError, KnnIndex, Schema, - SchemaError, + CollectionAndSegments, CollectionUuid, GetCollectionWithSegmentsError, Schema, SchemaError, }; use serde::{Deserialize, Serialize}; use std::{ @@ -143,7 +142,6 @@ impl CollectionsWithSegmentsProvider { pub(crate) async fn get_collection_with_segments( &mut self, collection_id: CollectionUuid, - knn_index: KnnIndex, ) -> Result { if let Some(collection_and_segments_with_ttl) = self .collections_with_segments_cache @@ -185,14 +183,13 @@ impl CollectionsWithSegmentsProvider { .await? }; - // reconcile schema and config - let reconciled_schema = Schema::reconcile_schema_and_config( - collection_and_segments_sysdb.collection.schema.as_ref(), - Some(&collection_and_segments_sysdb.collection.config), - knn_index, - ) - .map_err(CollectionsWithSegmentsProviderError::InvalidSchema)?; - collection_and_segments_sysdb.collection.schema = Some(reconciled_schema); + if collection_and_segments_sysdb.collection.schema.is_none() { + collection_and_segments_sysdb.collection.schema = Some( + Schema::try_from(&collection_and_segments_sysdb.collection.config) + .map_err(CollectionsWithSegmentsProviderError::InvalidSchema)?, + ); + } + self.set_collection_with_segments(collection_and_segments_sysdb.clone()) .await; Ok(collection_and_segments_sysdb) diff --git a/rust/frontend/src/impls/in_memory_frontend.rs b/rust/frontend/src/impls/in_memory_frontend.rs index e28c8287c73..0baf660e122 100644 --- a/rust/frontend/src/impls/in_memory_frontend.rs +++ b/rust/frontend/src/impls/in_memory_frontend.rs @@ -6,7 +6,8 @@ use chroma_types::operator::{Filter, KnnBatch, KnnProjection, Limit, Projection, use chroma_types::plan::{Count, Get, Knn}; use chroma_types::{ test_segment, Collection, CollectionAndSegments, CreateCollectionError, Database, Include, - IncludeList, InternalCollectionConfiguration, KnnIndex, Segment, VectorIndexConfiguration, + IncludeList, InternalCollectionConfiguration, KnnIndex, Schema, SchemaError, Segment, + VectorIndexConfiguration, }; use std::collections::HashSet; @@ -221,22 +222,27 @@ impl InMemoryFrontend { )); } - let mut collection = Collection { + let schema = Schema::reconcile_schema_and_config( + request.schema.as_ref(), + request.configuration.as_ref(), + KnnIndex::Hnsw, + ) + .map_err(CreateCollectionError::InvalidSchema)?; + + let config = InternalCollectionConfiguration::try_from(&schema).map_err(|e| { + CreateCollectionError::InvalidSchema(SchemaError::InvalidSchema { reason: e }) + })?; + + let collection = Collection { name: request.name.clone(), tenant: request.tenant_id.clone(), database: request.database_name.clone(), metadata: request.metadata, - config: request - .configuration - .unwrap_or(InternalCollectionConfiguration::default_hnsw()), - schema: request.schema, + config, + schema: Some(schema), ..Default::default() }; - collection - .reconcile_schema_with_config(KnnIndex::Hnsw) - .map_err(CreateCollectionError::InvalidSchema)?; - // Prevent SPANN usage in InMemoryFrontend if matches!( collection.config.vector_index, diff --git a/rust/frontend/src/impls/service_based_frontend.rs b/rust/frontend/src/impls/service_based_frontend.rs index 8372091da1d..bee32329c5c 100644 --- a/rust/frontend/src/impls/service_based_frontend.rs +++ b/rust/frontend/src/impls/service_based_frontend.rs @@ -176,7 +176,7 @@ impl ServiceBasedFrontend { ) -> Result { Ok(self .collections_with_segments_provider - .get_collection_with_segments(collection_id, self.default_knn_index) + .get_collection_with_segments(collection_id) .await .map_err(|err| Box::new(err) as Box)? .collection) @@ -188,7 +188,7 @@ impl ServiceBasedFrontend { ) -> Result, GetCollectionError> { Ok(self .collections_with_segments_provider - .get_collection_with_segments(collection_id, self.default_knn_index) + .get_collection_with_segments(collection_id) .await .map_err(|err| Box::new(err) as Box)? .collection @@ -381,7 +381,7 @@ impl ServiceBasedFrontend { if self.enable_schema { for collection in collections.iter_mut() { collection - .reconcile_schema_with_config(self.default_knn_index) + .reconcile_schema_for_read() .map_err(GetCollectionsError::InvalidSchema)?; } } @@ -425,7 +425,7 @@ impl ServiceBasedFrontend { if self.enable_schema { for collection in &mut collections { collection - .reconcile_schema_with_config(self.default_knn_index) + .reconcile_schema_for_read() .map_err(GetCollectionError::InvalidSchema)?; } } @@ -450,7 +450,7 @@ impl ServiceBasedFrontend { if self.enable_schema { collection - .reconcile_schema_with_config(self.default_knn_index) + .reconcile_schema_for_read() .map_err(GetCollectionByCrnError::InvalidSchema)?; } Ok(collection) @@ -630,9 +630,10 @@ impl ServiceBasedFrontend { // that was retrieved from sysdb, rather than the one that was passed in if self.enable_schema { collection - .reconcile_schema_with_config(self.default_knn_index) + .reconcile_schema_for_read() .map_err(CreateCollectionError::InvalidSchema)?; } + Ok(collection) } @@ -735,7 +736,7 @@ impl ServiceBasedFrontend { .await?; collection_and_segments .collection - .reconcile_schema_with_config(self.default_knn_index) + .reconcile_schema_for_read() .map_err(ForkCollectionError::InvalidSchema)?; let collection = collection_and_segments.collection.clone(); let latest_collection_logical_size_bytes = collection_and_segments @@ -1099,7 +1100,7 @@ impl ServiceBasedFrontend { let read_event = if let Some(where_clause) = r#where { let collection_and_segments = self .collections_with_segments_provider - .get_collection_with_segments(collection_id, self.default_knn_index) + .get_collection_with_segments(collection_id) .await .map_err(|err| Box::new(err) as Box)?; if self.enable_schema { @@ -1309,7 +1310,7 @@ impl ServiceBasedFrontend { ) -> Result { let collection_and_segments = self .collections_with_segments_provider - .get_collection_with_segments(collection_id, self.default_knn_index) + .get_collection_with_segments(collection_id) .await .map_err(|err| Box::new(err) as Box)?; let latest_collection_logical_size_bytes = collection_and_segments @@ -1424,7 +1425,7 @@ impl ServiceBasedFrontend { ) -> Result { let collection_and_segments = self .collections_with_segments_provider - .get_collection_with_segments(collection_id, self.default_knn_index) + .get_collection_with_segments(collection_id) .await .map_err(|err| Box::new(err) as Box)?; if self.enable_schema { @@ -1569,7 +1570,7 @@ impl ServiceBasedFrontend { ) -> Result { let collection_and_segments = self .collections_with_segments_provider - .get_collection_with_segments(collection_id, self.default_knn_index) + .get_collection_with_segments(collection_id) .await .map_err(|err| Box::new(err) as Box)?; if self.enable_schema { @@ -1726,7 +1727,7 @@ impl ServiceBasedFrontend { // Get collection and segments once for all queries let collection_and_segments = self .collections_with_segments_provider - .get_collection_with_segments(request.collection_id, self.default_knn_index) + .get_collection_with_segments(request.collection_id) .await .map_err(|err| QueryError::Other(Box::new(err) as Box))?; if self.enable_schema { diff --git a/rust/log/src/local_compaction_manager.rs b/rust/log/src/local_compaction_manager.rs index 4b0711c1478..604f7b4a0c2 100644 --- a/rust/log/src/local_compaction_manager.rs +++ b/rust/log/src/local_compaction_manager.rs @@ -15,7 +15,7 @@ use chroma_sysdb::SysDb; use chroma_system::Handler; use chroma_system::{Component, ComponentContext}; use chroma_types::{ - Chunk, CollectionUuid, GetCollectionWithSegmentsError, KnnIndex, LogRecord, SchemaError, + Chunk, CollectionUuid, GetCollectionWithSegmentsError, LogRecord, Schema, SchemaError, }; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -141,9 +141,12 @@ impl Handler for LocalCompactionManager { .get_collection_with_segments(message.collection_id) .await?; let schema_previously_persisted = collection_and_segments.collection.schema.is_some(); - collection_and_segments - .collection - .reconcile_schema_with_config(KnnIndex::Hnsw)?; + if !schema_previously_persisted { + collection_and_segments.collection.schema = Some( + Schema::try_from(&collection_and_segments.collection.config) + .map_err(CompactionManagerError::SchemaReconcileError)?, + ); + } // If collection is uninitialized, that means nothing has been written yet. let dim = match collection_and_segments.collection.dimension { Some(dim) => dim, @@ -267,7 +270,12 @@ impl Handler for LocalCompactionManager { .get_collection_with_segments(message.collection_id) .await?; let mut collection = collection_segments.collection.clone(); - collection.reconcile_schema_with_config(KnnIndex::Hnsw)?; + if collection.schema.is_none() { + collection.schema = Some( + Schema::try_from(&collection.config) + .map_err(CompactionManagerError::SchemaReconcileError)?, + ); + } // If dimension is None, that means nothing has been written yet. let dim = match collection.dimension { Some(dim) => dim, diff --git a/rust/segment/src/distributed_hnsw.rs b/rust/segment/src/distributed_hnsw.rs index 5e1552b7f78..3dac57696d3 100644 --- a/rust/segment/src/distributed_hnsw.rs +++ b/rust/segment/src/distributed_hnsw.rs @@ -8,7 +8,7 @@ use chroma_index::hnsw_provider::{ HnswIndexProviderOpenError, HnswIndexRef, }; use chroma_index::{Index, IndexUuid}; -use chroma_types::{Collection, HnswParametersFromSegmentError, SegmentUuid}; +use chroma_types::{Collection, HnswParametersFromSegmentError, Schema, SchemaError, SegmentUuid}; use chroma_types::{MaterializedLogOperation, Segment}; use std::collections::HashMap; use std::fmt::Debug; @@ -55,6 +55,8 @@ pub enum DistributedHNSWSegmentFromSegmentError { MissingHnswConfiguration, #[error("Could not parse HNSW configuration: {0}")] InvalidHnswConfiguration(#[from] HnswParametersFromSegmentError), + #[error("Invalid schema: {0}")] + InvalidSchema(#[source] SchemaError), } impl ChromaError for DistributedHNSWSegmentFromSegmentError { @@ -72,6 +74,7 @@ impl ChromaError for DistributedHNSWSegmentFromSegmentError { DistributedHNSWSegmentFromSegmentError::InvalidHnswConfiguration(_) => { ErrorCodes::Internal } + DistributedHNSWSegmentFromSegmentError::InvalidSchema(e) => e.code(), } } } @@ -96,9 +99,14 @@ impl DistributedHNSWSegmentWriter { hnsw_index_provider: HnswIndexProvider, ) -> Result, Box> { - let hnsw_configuration = collection - .config - .get_hnsw_config_with_legacy_fallback(segment) + let schema = if let Some(schema) = &collection.schema { + schema + } else { + &Schema::try_from(&collection.config) + .map_err(DistributedHNSWSegmentFromSegmentError::InvalidSchema)? + }; + let hnsw_configuration = schema + .get_internal_hnsw_config_with_legacy_fallback(segment) .map_err(DistributedHNSWSegmentFromSegmentError::InvalidHnswConfiguration)? .ok_or(DistributedHNSWSegmentFromSegmentError::MissingHnswConfiguration)?; @@ -313,9 +321,13 @@ impl DistributedHNSWSegmentReader { hnsw_index_provider: HnswIndexProvider, ) -> Result, Box> { - let hnsw_configuration = collection - .config - .get_hnsw_config_with_legacy_fallback(segment) + let schema = collection.schema.as_ref().ok_or_else(|| { + DistributedHNSWSegmentFromSegmentError::InvalidSchema(SchemaError::InvalidSchema { + reason: "Schema is None".to_string(), + }) + })?; + let hnsw_configuration = schema + .get_internal_hnsw_config_with_legacy_fallback(segment) .map_err(DistributedHNSWSegmentFromSegmentError::InvalidHnswConfiguration)? .ok_or(DistributedHNSWSegmentFromSegmentError::MissingHnswConfiguration)?; @@ -394,7 +406,7 @@ pub mod test { use chroma_index::{HnswIndexConfig, DEFAULT_MAX_ELEMENTS}; use chroma_types::{ Collection, CollectionUuid, InternalCollectionConfiguration, InternalHnswConfiguration, - Segment, SegmentUuid, + Schema, Segment, SegmentUuid, }; use tempfile::tempdir; use uuid::Uuid; @@ -423,18 +435,18 @@ pub mod test { config.persist_path, Some(persist_path.to_str().unwrap().to_string()) ); + let config = InternalCollectionConfiguration { + vector_index: chroma_types::VectorIndexConfiguration::Hnsw(InternalHnswConfiguration { + max_neighbors: 10, + ..Default::default() + }), + embedding_function: None, + }; // Try partial override let collection = Collection { - config: InternalCollectionConfiguration { - vector_index: chroma_types::VectorIndexConfiguration::Hnsw( - InternalHnswConfiguration { - max_neighbors: 10, - ..Default::default() - }, - ), - embedding_function: None, - }, + config: config.clone(), + schema: Some(Schema::try_from(&config).unwrap()), ..Default::default() }; @@ -448,9 +460,12 @@ pub mod test { }; let hnsw_params = collection - .config - .get_hnsw_config_with_legacy_fallback(&segment) + .schema + .as_ref() + .map(|schema| schema.get_internal_hnsw_config_with_legacy_fallback(&segment)) + .transpose() .unwrap() + .flatten() .unwrap(); let config = HnswIndexConfig::new_persistent( hnsw_params.max_neighbors, diff --git a/rust/segment/src/distributed_spann.rs b/rust/segment/src/distributed_spann.rs index e6f5fceb9f9..b9aec9d4e6e 100644 --- a/rust/segment/src/distributed_spann.rs +++ b/rust/segment/src/distributed_spann.rs @@ -15,7 +15,6 @@ use chroma_index::spann::types::{ use chroma_index::IndexUuid; use chroma_index::{hnsw_provider::HnswIndexProvider, spann::types::SpannIndexWriter}; use chroma_types::Collection; -use chroma_types::KnnIndex; use chroma_types::Schema; use chroma_types::SchemaError; use chroma_types::SegmentUuid; @@ -112,14 +111,13 @@ impl SpannSegmentWriter { return Err(SpannSegmentWriterError::InvalidArgument); } - let reconciled_schema = Schema::reconcile_schema_and_config( - collection.schema.as_ref(), - Some(&collection.config), - KnnIndex::Spann, - ) - .map_err(SpannSegmentWriterError::InvalidSchema)?; + let schema = if let Some(schema) = &collection.schema { + schema + } else { + &Schema::try_from(&collection.config).map_err(SpannSegmentWriterError::InvalidSchema)? + }; - let params = reconciled_schema + let params = schema .get_internal_spann_config() .ok_or(SpannSegmentWriterError::MissingSpannConfiguration)?; @@ -621,8 +619,8 @@ mod test { use chroma_storage::{local::LocalStorage, Storage}; use chroma_types::{ Chunk, Collection, CollectionUuid, DatabaseUuid, InternalCollectionConfiguration, - InternalSpannConfiguration, KnnIndex, LogRecord, Operation, OperationRecord, Schema, - SegmentUuid, SpannPostingList, + InternalSpannConfiguration, LogRecord, Operation, OperationRecord, Schema, SegmentUuid, + SpannPostingList, }; use crate::{ @@ -690,8 +688,8 @@ mod test { ..Default::default() }; collection.schema = Some( - Schema::reconcile_schema_and_config(None, Some(&collection.config), KnnIndex::Spann) - .expect("Error reconciling schema for test collection"), + Schema::try_from(&collection.config) + .expect("Error converting config to schema for test collection"), ); let pl_block_size = 5 * 1024 * 1024; @@ -927,8 +925,8 @@ mod test { ..Default::default() }; collection.schema = Some( - Schema::reconcile_schema_and_config(None, Some(&collection.config), KnnIndex::Spann) - .expect("Error reconciling schema for test collection"), + Schema::try_from(&collection.config) + .expect("Error converting config to schema for test collection"), ); let pl_block_size = 5 * 1024 * 1024; @@ -1089,8 +1087,8 @@ mod test { ..Default::default() }; collection.schema = Some( - Schema::reconcile_schema_and_config(None, Some(&collection.config), KnnIndex::Spann) - .expect("Error reconciling schema for test collection"), + Schema::try_from(&collection.config) + .expect("Error converting config to schema for test collection"), ); let segment_id = SegmentUuid::new(); @@ -1220,8 +1218,8 @@ mod test { ..Default::default() }; collection.schema = Some( - Schema::reconcile_schema_and_config(None, Some(&collection.config), KnnIndex::Spann) - .expect("Error reconciling schema for test collection"), + Schema::try_from(&collection.config) + .expect("Error converting config to schema for test collection"), ); let pl_block_size = 5 * 1024 * 1024; diff --git a/rust/types/src/collection.rs b/rust/types/src/collection.rs index 4189c0a6abe..df1271ce4ee 100644 --- a/rust/types/src/collection.rs +++ b/rust/types/src/collection.rs @@ -2,8 +2,8 @@ use std::str::FromStr; use super::{Metadata, MetadataValueConversionError}; use crate::{ - chroma_proto, test_segment, CollectionConfiguration, InternalCollectionConfiguration, KnnIndex, - Schema, SchemaError, Segment, SegmentScope, UpdateCollectionConfiguration, UpdateMetadata, + chroma_proto, test_segment, CollectionConfiguration, InternalCollectionConfiguration, Schema, + SchemaError, Segment, SegmentScope, UpdateCollectionConfiguration, UpdateMetadata, }; use chroma_error::{ChromaError, ErrorCodes}; use serde::{Deserialize, Serialize}; @@ -228,17 +228,18 @@ impl Collection { } impl Collection { - /// Reconcile the collection schema and configuration, ensuring both are consistent. - pub fn reconcile_schema_with_config(&mut self, knn_index: KnnIndex) -> Result<(), SchemaError> { - let reconciled_schema = Schema::reconcile_schema_and_config( - self.schema.as_ref(), - Some(&self.config), - knn_index, - )?; - - self.config = InternalCollectionConfiguration::try_from(&reconciled_schema) - .map_err(|reason| SchemaError::InvalidSchema { reason })?; - self.schema = Some(reconciled_schema); + /// Reconcile the collection schema and configuration when serving read requests. + /// + /// The read path needs to tolerate collections that only have a configuration persisted. + /// This helper hydrates `schema` from the stored configuration when needed, or regenerates + /// the configuration from the existing schema to keep both representations consistent. + pub fn reconcile_schema_for_read(&mut self) -> Result<(), SchemaError> { + if let Some(schema) = self.schema.as_ref() { + self.config = InternalCollectionConfiguration::try_from(schema) + .map_err(|reason| SchemaError::InvalidSchema { reason })?; + } else { + self.schema = Some(Schema::try_from(&self.config)?); + } Ok(()) } diff --git a/rust/types/src/collection_schema.rs b/rust/types/src/collection_schema.rs index ad33e45ba93..50df6fc634f 100644 --- a/rust/types/src/collection_schema.rs +++ b/rust/types/src/collection_schema.rs @@ -1403,15 +1403,31 @@ impl Schema { pub fn reconcile_with_collection_config( schema: &Schema, collection_config: &InternalCollectionConfiguration, + default_knn_index: KnnIndex, ) -> Result { // 1. Check if collection config is default if collection_config.is_default() { if schema.is_default() { - // if both are default, use collection config to create schema - // this handles the case where user did not provide schema or config. - // since default schema doesnt have an ef, we need to use the coll config to create - // a schema with the ef. - let new_schema = Self::convert_collection_config_to_schema(collection_config)?; + // if both are default, use the schema, and apply the ef from config if available + // for both defaults and #embedding key + let mut new_schema = Schema::new_default(default_knn_index); + + if collection_config.embedding_function.is_some() { + if let Some(float_list) = &mut new_schema.defaults.float_list { + if let Some(vector_index) = &mut float_list.vector_index { + vector_index.config.embedding_function = + collection_config.embedding_function.clone(); + } + } + if let Some(embedding_types) = new_schema.keys.get_mut(EMBEDDING_KEY) { + if let Some(float_list) = &mut embedding_types.float_list { + if let Some(vector_index) = &mut float_list.vector_index { + vector_index.config.embedding_function = + collection_config.embedding_function.clone(); + } + } + } + } return Ok(new_schema); } else { // Collection config is default and schema is non-default → schema is source of truth @@ -1421,7 +1437,7 @@ impl Schema { // 2. Collection config is non-default, schema must be default (already validated earlier) // Convert collection config to schema - Self::convert_collection_config_to_schema(collection_config) + Self::try_from(collection_config) } pub fn reconcile_schema_and_config( @@ -1438,7 +1454,7 @@ impl Schema { let reconciled_schema = Self::reconcile_with_defaults(schema, knn_index)?; if let Some(config) = configuration { - Self::reconcile_with_collection_config(&reconciled_schema, config) + Self::reconcile_with_collection_config(&reconciled_schema, config, knn_index) } else { Ok(reconciled_schema) } @@ -1682,77 +1698,6 @@ impl Schema { true } - /// Convert InternalCollectionConfiguration to Schema - fn convert_collection_config_to_schema( - collection_config: &InternalCollectionConfiguration, - ) -> Result { - // Start with a default schema structure - let mut schema = Schema::new_default(KnnIndex::Spann); // Default to HNSW, will be overridden - - // Convert vector index configuration - let vector_config = match &collection_config.vector_index { - VectorIndexConfiguration::Hnsw(hnsw_config) => VectorIndexConfig { - space: Some(hnsw_config.space.clone()), - embedding_function: collection_config.embedding_function.clone(), - source_key: Some(DOCUMENT_KEY.to_string()), // Default source key - hnsw: Some(HnswIndexConfig { - ef_construction: Some(hnsw_config.ef_construction), - max_neighbors: Some(hnsw_config.max_neighbors), - ef_search: Some(hnsw_config.ef_search), - num_threads: Some(hnsw_config.num_threads), - batch_size: Some(hnsw_config.batch_size), - sync_threshold: Some(hnsw_config.sync_threshold), - resize_factor: Some(hnsw_config.resize_factor), - }), - spann: None, - }, - VectorIndexConfiguration::Spann(spann_config) => VectorIndexConfig { - space: Some(spann_config.space.clone()), - embedding_function: collection_config.embedding_function.clone(), - source_key: Some(DOCUMENT_KEY.to_string()), // Default source key - hnsw: None, - spann: Some(SpannIndexConfig { - search_nprobe: Some(spann_config.search_nprobe), - search_rng_factor: Some(spann_config.search_rng_factor), - search_rng_epsilon: Some(spann_config.search_rng_epsilon), - nreplica_count: Some(spann_config.nreplica_count), - write_rng_factor: Some(spann_config.write_rng_factor), - write_rng_epsilon: Some(spann_config.write_rng_epsilon), - split_threshold: Some(spann_config.split_threshold), - num_samples_kmeans: Some(spann_config.num_samples_kmeans), - initial_lambda: Some(spann_config.initial_lambda), - reassign_neighbor_count: Some(spann_config.reassign_neighbor_count), - merge_threshold: Some(spann_config.merge_threshold), - num_centers_to_merge_to: Some(spann_config.num_centers_to_merge_to), - write_nprobe: Some(spann_config.write_nprobe), - ef_construction: Some(spann_config.ef_construction), - ef_search: Some(spann_config.ef_search), - max_neighbors: Some(spann_config.max_neighbors), - }), - }, - }; - - // Update defaults (keep enabled=false, just update the config) - // This serves as the template for any new float_list fields - if let Some(float_list) = &mut schema.defaults.float_list { - if let Some(vector_index) = &mut float_list.vector_index { - vector_index.config = vector_config.clone(); - } - } - - // Update the vector_index in the existing #embedding key override - // Keep enabled=true (already set by new_default) and update the config - if let Some(embedding_types) = schema.keys.get_mut(EMBEDDING_KEY) { - if let Some(float_list) = &mut embedding_types.float_list { - if let Some(vector_index) = &mut float_list.vector_index { - vector_index.config = vector_config; - } - } - } - - Ok(schema) - } - /// Check if a specific metadata key-value should be indexed based on schema configuration pub fn is_metadata_type_index_enabled( &self, @@ -2666,6 +2611,83 @@ impl From for IndexConfig { } } +impl TryFrom<&InternalCollectionConfiguration> for Schema { + type Error = SchemaError; + + fn try_from(config: &InternalCollectionConfiguration) -> Result { + // Start with a default schema structure + let mut schema = match &config.vector_index { + VectorIndexConfiguration::Hnsw(_) => Schema::new_default(KnnIndex::Hnsw), + VectorIndexConfiguration::Spann(_) => Schema::new_default(KnnIndex::Spann), + }; + // Convert vector index configuration + let vector_config = match &config.vector_index { + VectorIndexConfiguration::Hnsw(hnsw_config) => VectorIndexConfig { + space: Some(hnsw_config.space.clone()), + embedding_function: config.embedding_function.clone(), + source_key: None, + hnsw: Some(HnswIndexConfig { + ef_construction: Some(hnsw_config.ef_construction), + max_neighbors: Some(hnsw_config.max_neighbors), + ef_search: Some(hnsw_config.ef_search), + num_threads: Some(hnsw_config.num_threads), + batch_size: Some(hnsw_config.batch_size), + sync_threshold: Some(hnsw_config.sync_threshold), + resize_factor: Some(hnsw_config.resize_factor), + }), + spann: None, + }, + VectorIndexConfiguration::Spann(spann_config) => VectorIndexConfig { + space: Some(spann_config.space.clone()), + embedding_function: config.embedding_function.clone(), + source_key: None, + hnsw: None, + spann: Some(SpannIndexConfig { + search_nprobe: Some(spann_config.search_nprobe), + search_rng_factor: Some(spann_config.search_rng_factor), + search_rng_epsilon: Some(spann_config.search_rng_epsilon), + nreplica_count: Some(spann_config.nreplica_count), + write_rng_factor: Some(spann_config.write_rng_factor), + write_rng_epsilon: Some(spann_config.write_rng_epsilon), + split_threshold: Some(spann_config.split_threshold), + num_samples_kmeans: Some(spann_config.num_samples_kmeans), + initial_lambda: Some(spann_config.initial_lambda), + reassign_neighbor_count: Some(spann_config.reassign_neighbor_count), + merge_threshold: Some(spann_config.merge_threshold), + num_centers_to_merge_to: Some(spann_config.num_centers_to_merge_to), + write_nprobe: Some(spann_config.write_nprobe), + ef_construction: Some(spann_config.ef_construction), + ef_search: Some(spann_config.ef_search), + max_neighbors: Some(spann_config.max_neighbors), + }), + }, + }; + + // Update defaults (keep enabled=false, just update the config) + // This serves as the template for any new float_list fields + if let Some(float_list) = &mut schema.defaults.float_list { + if let Some(vector_index) = &mut float_list.vector_index { + vector_index.config = vector_config.clone(); + } + } + + // Update the vector_index in the existing #embedding key override + // Keep enabled=true (already set by new_default) and update the config + // Set source_key to DOCUMENT_KEY for the embedding key + if let Some(embedding_types) = schema.keys.get_mut(EMBEDDING_KEY) { + if let Some(float_list) = &mut embedding_types.float_list { + if let Some(vector_index) = &mut float_list.vector_index { + let mut vector_config = vector_config; + vector_config.source_key = Some(DOCUMENT_KEY.to_string()); + vector_index.config = vector_config; + } + } + } + + Ok(schema) + } +} + #[cfg(test)] mod tests { use super::*; @@ -2933,7 +2955,7 @@ mod tests { )), }; - let schema = Schema::convert_collection_config_to_schema(&collection_config).unwrap(); + let schema = Schema::try_from(&collection_config).unwrap(); let reconstructed = InternalCollectionConfiguration::try_from(&schema).unwrap(); assert_eq!(reconstructed, collection_config); @@ -2965,7 +2987,7 @@ mod tests { )), }; - let schema = Schema::convert_collection_config_to_schema(&collection_config).unwrap(); + let schema = Schema::try_from(&collection_config).unwrap(); let reconstructed = InternalCollectionConfiguration::try_from(&schema).unwrap(); assert_eq!(reconstructed, collection_config); @@ -3549,113 +3571,370 @@ mod tests { fn test_reconcile_with_collection_config_default_config() { // Test that when collection config is default, schema is returned as-is let collection_config = InternalCollectionConfiguration::default_hnsw(); - let schema = Schema::convert_collection_config_to_schema(&collection_config).unwrap(); + let schema = Schema::try_from(&collection_config).unwrap(); - let result = Schema::reconcile_with_collection_config(&schema, &collection_config).unwrap(); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Hnsw) + .unwrap(); assert_eq!(result, schema); } + // Test all 8 cases of double default scenarios #[test] - fn test_reconcile_with_collection_config_both_non_default() { - // Test that when both schema and collection config are non-default, it returns an error - let mut schema = Schema::new_default(KnnIndex::Hnsw); - schema.defaults.string = Some(StringValueType { - fts_index: Some(FtsIndexType { - enabled: true, - config: FtsIndexConfig {}, - }), - string_inverted_index: None, - }); - - let mut collection_config = InternalCollectionConfiguration::default_hnsw(); - // Make collection config non-default by changing a parameter - if let VectorIndexConfiguration::Hnsw(ref mut hnsw_config) = collection_config.vector_index - { - hnsw_config.ef_construction = 500; // Non-default value - } + fn test_reconcile_double_default_hnsw_config_hnsw_schema_default_knn_hnsw() { + let collection_config = InternalCollectionConfiguration::default_hnsw(); + let schema = Schema::new_default(KnnIndex::Hnsw); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Hnsw) + .unwrap(); - // Use reconcile_schema_and_config which has the early validation - let result = Schema::reconcile_schema_and_config( - Some(&schema), - Some(&collection_config), - KnnIndex::Spann, - ); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - SchemaError::ConfigAndSchemaConflict - )); + // Should create new schema with default_knn_index (Hnsw) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .spann + .is_none()); } #[test] - fn test_reconcile_with_collection_config_hnsw_override() { - // Test that non-default HNSW collection config overrides default schema - let schema = Schema::new_default(KnnIndex::Hnsw); // Use actual default schema - - let collection_config = InternalCollectionConfiguration { - vector_index: VectorIndexConfiguration::Hnsw(InternalHnswConfiguration { - ef_construction: 300, - max_neighbors: 32, - ef_search: 50, - num_threads: 8, - batch_size: 200, - sync_threshold: 2000, - resize_factor: 1.5, - space: Space::L2, - }), - embedding_function: Some(EmbeddingFunctionConfiguration::Legacy), - }; - - let result = Schema::reconcile_with_collection_config(&schema, &collection_config).unwrap(); + fn test_reconcile_double_default_hnsw_config_hnsw_schema_default_knn_spann() { + let collection_config = InternalCollectionConfiguration::default_hnsw(); + let schema = Schema::new_default(KnnIndex::Hnsw); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Spann) + .unwrap(); - // Check that #embedding key override was created with the collection config settings - let embedding_override = result.keys.get(EMBEDDING_KEY).unwrap(); - let vector_index = embedding_override + // Should create new schema with default_knn_index (Spann) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults .float_list .as_ref() .unwrap() .vector_index .as_ref() - .unwrap(); - - assert!(vector_index.enabled); - assert_eq!(vector_index.config.space, Some(Space::L2)); - assert_eq!( - vector_index.config.embedding_function, - Some(EmbeddingFunctionConfiguration::Legacy) - ); - assert_eq!( - vector_index.config.source_key, - Some(DOCUMENT_KEY.to_string()) - ); - - let hnsw_config = vector_index.config.hnsw.as_ref().unwrap(); - assert_eq!(hnsw_config.ef_construction, Some(300)); - assert_eq!(hnsw_config.max_neighbors, Some(32)); - assert_eq!(hnsw_config.ef_search, Some(50)); - assert_eq!(hnsw_config.num_threads, Some(8)); - assert_eq!(hnsw_config.batch_size, Some(200)); - assert_eq!(hnsw_config.sync_threshold, Some(2000)); - assert_eq!(hnsw_config.resize_factor, Some(1.5)); - - assert!(vector_index.config.spann.is_none()); + .unwrap() + .config + .spann + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_none()); } #[test] - fn test_reconcile_with_collection_config_spann_override() { - // Test that non-default SPANN collection config overrides default schema - let schema = Schema::new_default(KnnIndex::Spann); // Use actual default schema + fn test_reconcile_double_default_hnsw_config_spann_schema_default_knn_hnsw() { + let collection_config = InternalCollectionConfiguration::default_hnsw(); + let schema = Schema::new_default(KnnIndex::Spann); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Hnsw) + .unwrap(); - let collection_config = InternalCollectionConfiguration { - vector_index: VectorIndexConfiguration::Spann(InternalSpannConfiguration { - search_nprobe: 20, - search_rng_factor: 3.0, - search_rng_epsilon: 0.2, - nreplica_count: 5, - write_rng_factor: 2.0, - write_rng_epsilon: 0.1, - split_threshold: 2000, - num_samples_kmeans: 200, + // Should create new schema with default_knn_index (Hnsw) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .spann + .is_none()); + } + + #[test] + fn test_reconcile_double_default_hnsw_config_spann_schema_default_knn_spann() { + let collection_config = InternalCollectionConfiguration::default_hnsw(); + let schema = Schema::new_default(KnnIndex::Spann); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Spann) + .unwrap(); + + // Should create new schema with default_knn_index (Spann) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .spann + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_none()); + } + + #[test] + fn test_reconcile_double_default_spann_config_spann_schema_default_knn_hnsw() { + let collection_config = InternalCollectionConfiguration::default_spann(); + let schema = Schema::new_default(KnnIndex::Spann); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Hnsw) + .unwrap(); + + // Should create new schema with default_knn_index (Hnsw) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .spann + .is_none()); + } + + #[test] + fn test_reconcile_double_default_spann_config_spann_schema_default_knn_spann() { + let collection_config = InternalCollectionConfiguration::default_spann(); + let schema = Schema::new_default(KnnIndex::Spann); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Spann) + .unwrap(); + + // Should create new schema with default_knn_index (Spann) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .spann + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_none()); + // Defaults should have source_key=None + assert_eq!( + result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .source_key, + None + ); + } + + #[test] + fn test_reconcile_double_default_spann_config_hnsw_schema_default_knn_hnsw() { + let collection_config = InternalCollectionConfiguration::default_spann(); + let schema = Schema::new_default(KnnIndex::Hnsw); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Hnsw) + .unwrap(); + + // Should create new schema with default_knn_index (Hnsw) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .spann + .is_none()); + } + + #[test] + fn test_reconcile_double_default_spann_config_hnsw_schema_default_knn_spann() { + let collection_config = InternalCollectionConfiguration::default_spann(); + let schema = Schema::new_default(KnnIndex::Hnsw); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Spann) + .unwrap(); + + // Should create new schema with default_knn_index (Spann) + assert!(result.defaults.float_list.is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .spann + .is_some()); + assert!(result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap() + .config + .hnsw + .is_none()); + } + + #[test] + fn test_defaults_source_key_not_document() { + // Test that defaults.float_list.vector_index.config.source_key is None, not DOCUMENT_KEY + let schema_hnsw = Schema::new_default(KnnIndex::Hnsw); + let schema_spann = Schema::new_default(KnnIndex::Spann); + + // Check HNSW default schema + let defaults_hnsw = schema_hnsw + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!(defaults_hnsw.config.source_key, None); + + // Check Spann default schema + let defaults_spann = schema_spann + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!(defaults_spann.config.source_key, None); + + // Test after reconcile with NON-default collection config + // This path calls try_from where our fix is + let collection_config_hnsw = InternalCollectionConfiguration { + vector_index: VectorIndexConfiguration::Hnsw(InternalHnswConfiguration { + ef_construction: 300, + max_neighbors: 32, + ef_search: 50, + num_threads: 8, + batch_size: 200, + sync_threshold: 2000, + resize_factor: 1.5, + space: Space::L2, + }), + embedding_function: Some(EmbeddingFunctionConfiguration::Legacy), + }; + let result_hnsw = Schema::reconcile_with_collection_config( + &schema_hnsw, + &collection_config_hnsw, + KnnIndex::Hnsw, + ) + .unwrap(); + let reconciled_defaults_hnsw = result_hnsw + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!(reconciled_defaults_hnsw.config.source_key, None); + + let collection_config_spann = InternalCollectionConfiguration { + vector_index: VectorIndexConfiguration::Spann(InternalSpannConfiguration { + search_nprobe: 20, + search_rng_factor: 3.0, + search_rng_epsilon: 0.2, + nreplica_count: 5, + write_rng_factor: 2.0, + write_rng_epsilon: 0.1, + split_threshold: 2000, + num_samples_kmeans: 200, initial_lambda: 0.8, reassign_neighbor_count: 100, merge_threshold: 800, @@ -3668,8 +3947,342 @@ mod tests { }), embedding_function: None, }; + let result_spann = Schema::reconcile_with_collection_config( + &schema_spann, + &collection_config_spann, + KnnIndex::Spann, + ) + .unwrap(); + let reconciled_defaults_spann = result_spann + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!(reconciled_defaults_spann.config.source_key, None); + + // Verify that #embedding key DOES have source_key set to DOCUMENT_KEY + let embedding_hnsw = result_hnsw.keys.get(EMBEDDING_KEY).unwrap(); + let embedding_vector_index_hnsw = embedding_hnsw + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!( + embedding_vector_index_hnsw.config.source_key, + Some(DOCUMENT_KEY.to_string()) + ); + + let embedding_spann = result_spann.keys.get(EMBEDDING_KEY).unwrap(); + let embedding_vector_index_spann = embedding_spann + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!( + embedding_vector_index_spann.config.source_key, + Some(DOCUMENT_KEY.to_string()) + ); + } - let result = Schema::reconcile_with_collection_config(&schema, &collection_config).unwrap(); + #[test] + fn test_try_from_source_key() { + // Direct test of try_from to verify source_key behavior + // Defaults should have source_key=None, #embedding should have source_key=DOCUMENT_KEY + + // Test with HNSW config + let collection_config_hnsw = InternalCollectionConfiguration { + vector_index: VectorIndexConfiguration::Hnsw(InternalHnswConfiguration { + ef_construction: 300, + max_neighbors: 32, + ef_search: 50, + num_threads: 8, + batch_size: 200, + sync_threshold: 2000, + resize_factor: 1.5, + space: Space::L2, + }), + embedding_function: Some(EmbeddingFunctionConfiguration::Legacy), + }; + let schema_hnsw = Schema::try_from(&collection_config_hnsw).unwrap(); + + // Check defaults have source_key=None + let defaults_hnsw = schema_hnsw + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!(defaults_hnsw.config.source_key, None); + + // Check #embedding has source_key=DOCUMENT_KEY + let embedding_hnsw = schema_hnsw.keys.get(EMBEDDING_KEY).unwrap(); + let embedding_vector_index_hnsw = embedding_hnsw + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!( + embedding_vector_index_hnsw.config.source_key, + Some(DOCUMENT_KEY.to_string()) + ); + + // Test with Spann config + let collection_config_spann = InternalCollectionConfiguration { + vector_index: VectorIndexConfiguration::Spann(InternalSpannConfiguration { + search_nprobe: 20, + search_rng_factor: 3.0, + search_rng_epsilon: 0.2, + nreplica_count: 5, + write_rng_factor: 2.0, + write_rng_epsilon: 0.1, + split_threshold: 2000, + num_samples_kmeans: 200, + initial_lambda: 0.8, + reassign_neighbor_count: 100, + merge_threshold: 800, + num_centers_to_merge_to: 20, + write_nprobe: 10, + ef_construction: 400, + ef_search: 60, + max_neighbors: 24, + space: Space::Cosine, + }), + embedding_function: None, + }; + let schema_spann = Schema::try_from(&collection_config_spann).unwrap(); + + // Check defaults have source_key=None + let defaults_spann = schema_spann + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!(defaults_spann.config.source_key, None); + + // Check #embedding has source_key=DOCUMENT_KEY + let embedding_spann = schema_spann.keys.get(EMBEDDING_KEY).unwrap(); + let embedding_vector_index_spann = embedding_spann + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!( + embedding_vector_index_spann.config.source_key, + Some(DOCUMENT_KEY.to_string()) + ); + } + + #[test] + fn test_default_hnsw_with_default_embedding_function() { + // Test that when InternalCollectionConfiguration is default HNSW but has + // an embedding function with name "default" and config as {}, it still + // goes through the double default path and preserves source_key behavior + use crate::collection_configuration::EmbeddingFunctionNewConfiguration; + + let collection_config = InternalCollectionConfiguration { + vector_index: VectorIndexConfiguration::Hnsw(InternalHnswConfiguration::default()), + embedding_function: Some(EmbeddingFunctionConfiguration::Known( + EmbeddingFunctionNewConfiguration { + name: "default".to_string(), + config: serde_json::json!({}), + }, + )), + }; + + // Verify it's still considered default + assert!(collection_config.is_default()); + + let schema = Schema::new_default(KnnIndex::Hnsw); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Spann) + .unwrap(); + + // Check that defaults have source_key=None + let defaults = result + .defaults + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!(defaults.config.source_key, None); + + // Check that #embedding has source_key=DOCUMENT_KEY + let embedding = result.keys.get(EMBEDDING_KEY).unwrap(); + let embedding_vector_index = embedding + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + assert_eq!( + embedding_vector_index.config.source_key, + Some(DOCUMENT_KEY.to_string()) + ); + + // verify vector index config is set to spann + let vector_index_config = defaults.config.clone(); + assert!(vector_index_config.spann.is_some()); + assert!(vector_index_config.hnsw.is_none()); + + // Verify embedding function was set correctly + assert_eq!( + embedding_vector_index.config.embedding_function, + Some(EmbeddingFunctionConfiguration::Known( + EmbeddingFunctionNewConfiguration { + name: "default".to_string(), + config: serde_json::json!({}), + }, + )) + ); + assert_eq!( + defaults.config.embedding_function, + Some(EmbeddingFunctionConfiguration::Known( + EmbeddingFunctionNewConfiguration { + name: "default".to_string(), + config: serde_json::json!({}), + }, + )) + ); + } + + #[test] + fn test_reconcile_with_collection_config_both_non_default() { + // Test that when both schema and collection config are non-default, it returns an error + let mut schema = Schema::new_default(KnnIndex::Hnsw); + schema.defaults.string = Some(StringValueType { + fts_index: Some(FtsIndexType { + enabled: true, + config: FtsIndexConfig {}, + }), + string_inverted_index: None, + }); + + let mut collection_config = InternalCollectionConfiguration::default_hnsw(); + // Make collection config non-default by changing a parameter + if let VectorIndexConfiguration::Hnsw(ref mut hnsw_config) = collection_config.vector_index + { + hnsw_config.ef_construction = 500; // Non-default value + } + + // Use reconcile_schema_and_config which has the early validation + let result = Schema::reconcile_schema_and_config( + Some(&schema), + Some(&collection_config), + KnnIndex::Spann, + ); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + SchemaError::ConfigAndSchemaConflict + )); + } + + #[test] + fn test_reconcile_with_collection_config_hnsw_override() { + // Test that non-default HNSW collection config overrides default schema + let schema = Schema::new_default(KnnIndex::Hnsw); // Use actual default schema + + let collection_config = InternalCollectionConfiguration { + vector_index: VectorIndexConfiguration::Hnsw(InternalHnswConfiguration { + ef_construction: 300, + max_neighbors: 32, + ef_search: 50, + num_threads: 8, + batch_size: 200, + sync_threshold: 2000, + resize_factor: 1.5, + space: Space::L2, + }), + embedding_function: Some(EmbeddingFunctionConfiguration::Legacy), + }; + + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Hnsw) + .unwrap(); + + // Check that #embedding key override was created with the collection config settings + let embedding_override = result.keys.get(EMBEDDING_KEY).unwrap(); + let vector_index = embedding_override + .float_list + .as_ref() + .unwrap() + .vector_index + .as_ref() + .unwrap(); + + assert!(vector_index.enabled); + assert_eq!(vector_index.config.space, Some(Space::L2)); + assert_eq!( + vector_index.config.embedding_function, + Some(EmbeddingFunctionConfiguration::Legacy) + ); + assert_eq!( + vector_index.config.source_key, + Some(DOCUMENT_KEY.to_string()) + ); + + let hnsw_config = vector_index.config.hnsw.as_ref().unwrap(); + assert_eq!(hnsw_config.ef_construction, Some(300)); + assert_eq!(hnsw_config.max_neighbors, Some(32)); + assert_eq!(hnsw_config.ef_search, Some(50)); + assert_eq!(hnsw_config.num_threads, Some(8)); + assert_eq!(hnsw_config.batch_size, Some(200)); + assert_eq!(hnsw_config.sync_threshold, Some(2000)); + assert_eq!(hnsw_config.resize_factor, Some(1.5)); + + assert!(vector_index.config.spann.is_none()); + } + + #[test] + fn test_reconcile_with_collection_config_spann_override() { + // Test that non-default SPANN collection config overrides default schema + let schema = Schema::new_default(KnnIndex::Spann); // Use actual default schema + + let collection_config = InternalCollectionConfiguration { + vector_index: VectorIndexConfiguration::Spann(InternalSpannConfiguration { + search_nprobe: 20, + search_rng_factor: 3.0, + search_rng_epsilon: 0.2, + nreplica_count: 5, + write_rng_factor: 2.0, + write_rng_epsilon: 0.1, + split_threshold: 2000, + num_samples_kmeans: 200, + initial_lambda: 0.8, + reassign_neighbor_count: 100, + merge_threshold: 800, + num_centers_to_merge_to: 20, + write_nprobe: 10, + ef_construction: 400, + ef_search: 60, + max_neighbors: 24, + space: Space::Cosine, + }), + embedding_function: None, + }; + + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Spann) + .unwrap(); // Check that #embedding key override was created with the collection config settings let embedding_override = result.keys.get(EMBEDDING_KEY).unwrap(); @@ -3730,7 +4343,9 @@ mod tests { embedding_function: Some(EmbeddingFunctionConfiguration::Legacy), }; - let result = Schema::reconcile_with_collection_config(&schema, &collection_config).unwrap(); + let result = + Schema::reconcile_with_collection_config(&schema, &collection_config, KnnIndex::Hnsw) + .unwrap(); // Check that defaults.float_list.vector_index was updated let defaults_vector_index = result @@ -3750,10 +4365,7 @@ mod tests { defaults_vector_index.config.embedding_function, Some(EmbeddingFunctionConfiguration::Legacy) ); - assert_eq!( - defaults_vector_index.config.source_key, - Some(DOCUMENT_KEY.to_string()) - ); + assert_eq!(defaults_vector_index.config.source_key, None); let defaults_hnsw = defaults_vector_index.config.hnsw.as_ref().unwrap(); assert_eq!(defaults_hnsw.ef_construction, Some(300)); assert_eq!(defaults_hnsw.max_neighbors, Some(32));