
Commit 9f79843

[ENH] Add sync point to test_filtering + fix issues (#2388)
## Description of changes

*Summarize the changes made by this PR.*

- Improvements & Bug fixes
  - Adds a sync point to test_filtering
  - Modifies test_filtering in cluster mode to not emit documents/where_clauses of length < 3 or containing the characters "_" and "%"
  - Increases the deadline of tests from 45 secs to 90 secs, since waiting for compaction to finish could end up taking longer than 45 secs
  - Fixes an lte comparison bug with f32 metadata values
  - Fixes the version syncing logic to not race with compaction, by getting the initial version before the add
  - Suppresses the health check warning for filtering too much
  - Fixes a replace_block bug in the sparse index
  - Fixes a split bug in the bf writer

## Test plan

- [x] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust

## Documentation Changes

None
1 parent 9cce6b1 commit 9f79843
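
The sync point added here follows a simple pattern: record the collection's version before the `add`, then (in cluster mode) wait for compaction to bump that version before asserting on query results. Below is a minimal sketch of that pattern, using the `wait_for_version_increase` helper imported in `test_filtering.py`; the wrapper function and the fixture names (`api`, `coll`, `record_set`) are illustrative, not part of the commit.

```python
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase


def add_and_sync(api, coll, collection_name, record_set, should_compact):
    # Read the version *before* the add so a fast compaction cannot race us.
    initial_version = coll.get_model()["version"]

    coll.add(**record_set)

    # In cluster mode, only wait when a compaction is expected and the
    # collection is large enough for one to trigger (the tests use > 10 ids).
    if should_compact and len(record_set["ids"]) > 10:
        wait_for_version_increase(api, collection_name, initial_version)
```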

File tree

12 files changed: +155, -27 lines changed


chromadb/test/property/strategies.py

Lines changed: 52 additions & 6 deletions

@@ -99,11 +99,16 @@ class Record(TypedDict):
 # TODO: support empty strings everywhere
 sql_alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_"
 safe_text = st.text(alphabet=sql_alphabet, min_size=1)
+sql_alphabet_minus_underscore = (
+    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-"
+)
+safe_text_min_size_3 = st.text(alphabet=sql_alphabet_minus_underscore, min_size=3)
 tenant_database_name = st.text(alphabet=sql_alphabet, min_size=3)
 
 # Workaround for FastAPI json encoding peculiarities
 # https://github.com/tiangolo/fastapi/blob/8ac8d70d52bb0dd9eb55ba4e22d3e383943da05c/fastapi/encoders.py#L104
 safe_text = safe_text.filter(lambda s: not s.startswith("_sa"))
+safe_text_min_size_3 = safe_text_min_size_3.filter(lambda s: not s.startswith("_sa"))
 tenant_database_name = tenant_database_name.filter(lambda s: not s.startswith("_sa"))
 
 safe_integers = st.integers(
@@ -316,10 +321,21 @@ def collections(
     if has_documents is None:
         has_documents = draw(st.booleans())
     assert has_documents is not None
-    if has_documents and add_filterable_data:
-        known_document_keywords = draw(st.lists(safe_text, min_size=5, max_size=5))
+    # For cluster tests, we want to avoid generating documents and where_document
+    # clauses of length < 3. We also don't want them to contain certain special
+    # characters like _ and % that implicitly involve searching for a regex in sqlite.
+    if not NOT_CLUSTER_ONLY:
+        if has_documents and add_filterable_data:
+            known_document_keywords = draw(
+                st.lists(safe_text_min_size_3, min_size=5, max_size=5)
+            )
+        else:
+            known_document_keywords = []
     else:
-        known_document_keywords = []
+        if has_documents and add_filterable_data:
+            known_document_keywords = draw(st.lists(safe_text, min_size=5, max_size=5))
+        else:
+            known_document_keywords = []
 
     if not has_documents:
         has_embeddings = True
@@ -375,6 +391,27 @@ def metadata(
 @st.composite
 def document(draw: st.DrawFn, collection: Collection) -> types.Document:
     """Strategy for generating documents that could be a part of the given collection"""
+    # For cluster tests, we want to avoid generating documents of length < 3.
+    # We also don't want them to contain certain special
+    # characters like _ and % that implicitly involve searching for a regex in sqlite.
+    if not NOT_CLUSTER_ONLY:
+        # Blacklist certain unicode characters that affect sqlite processing.
+        # For example, the null (/x00) character makes sqlite stop processing a string.
+        # Also, blacklist _ and % for cluster tests.
+        blacklist_categories = ("Cc", "Cs", "Pc", "Po")
+        if collection.known_document_keywords:
+            known_words_st = st.sampled_from(collection.known_document_keywords)
+        else:
+            known_words_st = st.text(
+                min_size=3,
+                alphabet=st.characters(blacklist_categories=blacklist_categories), # type: ignore
+            )
+
+        random_words_st = st.text(
+            min_size=3, alphabet=st.characters(blacklist_categories=blacklist_categories) # type: ignore
+        )
+        words = draw(st.lists(st.one_of(known_words_st, random_words_st), min_size=1))
+        return " ".join(words)
 
     # Blacklist certain unicode characters that affect sqlite processing.
     # For example, the null (/x00) character makes sqlite stop processing a string.
@@ -531,10 +568,19 @@ def where_clause(draw: st.DrawFn, collection: Collection) -> types.Where:
 @st.composite
 def where_doc_clause(draw: st.DrawFn, collection: Collection) -> types.WhereDocument:
     """Generate a where_document filter that could be used against the given collection"""
-    if collection.known_document_keywords:
-        word = draw(st.sampled_from(collection.known_document_keywords))
+    # For cluster tests, we want to avoid generating where_document
+    # clauses of length < 3. We also don't want them to contain certain special
+    # characters like _ and % that implicitly involve searching for a regex in sqlite.
+    if not NOT_CLUSTER_ONLY:
+        if collection.known_document_keywords:
+            word = draw(st.sampled_from(collection.known_document_keywords))
+        else:
+            word = draw(safe_text_min_size_3)
     else:
-        word = draw(safe_text)
+        if collection.known_document_keywords:
+            word = draw(st.sampled_from(collection.known_document_keywords))
+        else:
+            word = draw(safe_text)
 
     # This is hacky, but the distributed system does not support $not_contains
     # so we need to avoid generating these operators for now in that case.
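
The excluded characters matter because `_` and `%` are wildcards in SQL `LIKE` patterns (the comments above describe this loosely as a regex search), so a generated keyword containing them can match more documents than a literal substring lookup would. A small standalone illustration using Python's built-in `sqlite3`; the table and data here are made up for the example, not taken from Chroma:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE docs (body TEXT)")
conn.executemany("INSERT INTO docs VALUES (?)", [("abc",), ("a_c",), ("axc",)])

# The intent is a literal search for the keyword "a_c", but "_" matches any
# single character in LIKE, so "abc" and "axc" match as well.
rows = conn.execute("SELECT body FROM docs WHERE body LIKE ?", ("%a_c%",)).fetchall()
print(rows)  # [('abc',), ('a_c',), ('axc',)]
```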

chromadb/test/property/test_add.py

Lines changed: 5 additions & 1 deletion

@@ -188,7 +188,11 @@ def test_add_large(
     ):
         coll.add(*batch)
 
-    if not NOT_CLUSTER_ONLY and should_compact:
+    if (
+        not NOT_CLUSTER_ONLY
+        and should_compact
+        and len(normalized_record_set["ids"]) > 10
+    ):
         initial_version = coll.get_model()["version"]
         # Wait for the model to be updated, since the record set is larger, add some additional time
         wait_for_version_increase(

chromadb/test/property/test_filtering.py

Lines changed: 50 additions & 4 deletions

@@ -20,6 +20,7 @@
 import logging
 import random
 import re
+from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
 
 
 def _filter_where_clause(clause: Where, metadata: Optional[Metadata]) -> bool:
@@ -175,18 +176,26 @@ def _filter_embedding_set(
 
 
 @settings(
+    deadline=90000,
     suppress_health_check=[
         HealthCheck.function_scoped_fixture,
         HealthCheck.large_base_example,
-    ]
+        HealthCheck.filter_too_much,
+    ],
 ) # type: ignore
 @given(
     collection=collection_st,
     record_set=recordset_st,
     filters=st.lists(strategies.filters(collection_st, recordset_st), min_size=1),
+    should_compact=st.booleans(),
 )
 def test_filterable_metadata_get(
-    caplog, api: ServerAPI, collection: strategies.Collection, record_set, filters
+    caplog,
+    api: ServerAPI,
+    collection: strategies.Collection,
+    record_set,
+    filters,
+    should_compact: bool,
 ) -> None:
     caplog.set_level(logging.ERROR)
 
@@ -197,25 +206,38 @@ def test_filterable_metadata_get(
         embedding_function=collection.embedding_function,
     )
 
+    initial_version = coll.get_model()["version"]
+
     coll.add(**record_set)
+
+    if not NOT_CLUSTER_ONLY:
+        # Only wait for compaction if the size of the collection is
+        # some minimal size
+        if should_compact and len(invariants.wrap(record_set["ids"])) > 10:
+            # Wait for the model to be updated
+            wait_for_version_increase(api, collection.name, initial_version)
+
     for filter in filters:
         result_ids = coll.get(**filter)["ids"]
         expected_ids = _filter_embedding_set(record_set, filter)
         assert sorted(result_ids) == sorted(expected_ids)
 
 
 @settings(
+    deadline=90000,
     suppress_health_check=[
         HealthCheck.function_scoped_fixture,
         HealthCheck.large_base_example,
-    ]
+        HealthCheck.filter_too_much,
+    ],
 ) # type: ignore
 @given(
     collection=collection_st,
     record_set=recordset_st,
     filters=st.lists(strategies.filters(collection_st, recordset_st), min_size=1),
     limit=st.integers(min_value=1, max_value=10),
     offset=st.integers(min_value=0, max_value=10),
+    should_compact=st.booleans(),
 )
 def test_filterable_metadata_get_limit_offset(
     caplog,
@@ -225,6 +247,7 @@ def test_filterable_metadata_get_limit_offset(
     filters,
     limit,
     offset,
+    should_compact: bool,
 ) -> None:
     caplog.set_level(logging.ERROR)
 
@@ -240,7 +263,17 @@
         embedding_function=collection.embedding_function,
     )
 
+    initial_version = coll.get_model()["version"]
+
     coll.add(**record_set)
+
+    if not NOT_CLUSTER_ONLY:
+        # Only wait for compaction if the size of the collection is
+        # some minimal size
+        if should_compact and len(invariants.wrap(record_set["ids"])) > 10:
+            # Wait for the model to be updated
+            wait_for_version_increase(api, collection.name, initial_version)
+
     for filter in filters:
         # add limit and offset to filter
         filter["limit"] = limit
@@ -251,10 +284,12 @@
 
 
 @settings(
+    deadline=90000,
     suppress_health_check=[
         HealthCheck.function_scoped_fixture,
         HealthCheck.large_base_example,
-    ]
+        HealthCheck.filter_too_much,
+    ],
 )
 @given(
     collection=collection_st,
@@ -263,13 +298,15 @@ def test_filterable_metadata_get_limit_offset(
         strategies.filters(collection_st, recordset_st, include_all_ids=True),
         min_size=1,
     ),
+    should_compact=st.booleans(),
 )
 def test_filterable_metadata_query(
     caplog: pytest.LogCaptureFixture,
     api: ServerAPI,
     collection: strategies.Collection,
     record_set: strategies.RecordSet,
     filters: List[strategies.Filter],
+    should_compact: bool,
 ) -> None:
     caplog.set_level(logging.ERROR)
 
@@ -279,9 +316,18 @@
         metadata=collection.metadata, # type: ignore
         embedding_function=collection.embedding_function,
     )
+    initial_version = coll.get_model()["version"]
    normalized_record_set = invariants.wrap_all(record_set)
 
     coll.add(**record_set)
+
+    if not NOT_CLUSTER_ONLY:
+        # Only wait for compaction if the size of the collection is
+        # some minimal size
+        if should_compact and len(invariants.wrap(record_set["ids"])) > 10:
+            # Wait for the model to be updated
+            wait_for_version_increase(api, collection.name, initial_version)
+
     total_count = len(normalized_record_set["ids"])
     # Pick a random vector
     random_query: Embedding

rust/worker/src/blockstore/arrow/block/delta.rs

Lines changed: 6 additions & 1 deletion

@@ -128,6 +128,7 @@ impl BlockDelta {
         let mut blocks_to_split = Vec::new();
         blocks_to_split.push(self.clone());
         let mut output = Vec::new();
+        let mut first_iter = true;
         // iterate over all blocks to split until its empty
         while !blocks_to_split.is_empty() {
             let curr_block = blocks_to_split.pop().unwrap();
@@ -168,7 +169,11 @@
                 builder: new_delta,
                 id: Uuid::new_v4(),
             };
-
+            if first_iter {
+                first_iter = false;
+            } else {
+                output.push((curr_block.builder.get_key(0).clone(), curr_block));
+            }
             if new_block.get_size::<K, V>() > MAX_BLOCK_SIZE {
                 blocks_to_split.push(new_block);
             } else {

rust/worker/src/blockstore/arrow/block/delta_storage.rs

Lines changed: 4 additions & 4 deletions

@@ -267,7 +267,7 @@ impl StringValueStorage {
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub(super) struct UInt32Storage {
     pub(super) storage: Arc<RwLock<BTreeMap<CompositeKey, u32>>>,
 }
@@ -355,7 +355,7 @@ impl UInt32Storage {
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub(super) struct Int32ArrayStorage {
     pub(super) storage: Arc<RwLock<BTreeMap<CompositeKey, Int32Array>>>,
 }
@@ -464,7 +464,7 @@ impl Int32ArrayStorage {
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub(super) struct RoaringBitmapStorage {
     pub(super) storage: Arc<RwLock<BTreeMap<CompositeKey, Vec<u8>>>>,
 }
@@ -561,7 +561,7 @@ impl RoaringBitmapStorage {
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub(super) struct DataRecordStorage {
     pub(super) id_storage: Arc<RwLock<BTreeMap<CompositeKey, String>>>,
     pub(super) embedding_storage: Arc<RwLock<BTreeMap<CompositeKey, Vec<f32>>>>,

rust/worker/src/blockstore/arrow/blockfile.rs

Lines changed: 13 additions & 1 deletion

@@ -275,14 +275,21 @@ impl<'me, K: ArrowReadableKey<'me> + Into<KeyWrapper>, V: ArrowReadableValue<'me
         let target_block_id = self.sparse_index.get_target_block_id(&search_key);
         let block = self.get_block(target_block_id).await;
         let res = match block {
-            Some(block) => block.get(prefix, key),
+            Some(block) => block.get(prefix, key.clone()),
             None => {
+                tracing::error!("Block with id {:?} not found", target_block_id);
                 return Err(Box::new(ArrowBlockfileError::BlockNotFound));
             }
         };
         match res {
             Some(value) => Ok(value),
             None => {
+                tracing::error!(
+                    "Key {:?}/{:?} not found in block {:?}",
+                    prefix,
+                    key,
+                    target_block_id
+                );
                 return Err(Box::new(BlockfileError::NotFoundError));
             }
         }
@@ -309,6 +316,7 @@ impl<'me, K: ArrowReadableKey<'me> + Into<KeyWrapper>, V: ArrowReadableValue<'me
                 block_offset += b.len();
             }
             None => {
+                tracing::error!("Block id {:?} not found", uuid);
                 return Err(Box::new(ArrowBlockfileError::BlockNotFound));
             }
         }
@@ -320,6 +328,10 @@ impl<'me, K: ArrowReadableKey<'me> + Into<KeyWrapper>, V: ArrowReadableValue<'me
                 return Ok((prefix, key, value));
             }
             _ => {
+                tracing::error!(
+                    "Value not found at index {:?} for block",
+                    index - block_offset,
+                );
                 return Err(Box::new(BlockfileError::NotFoundError));
             }
         }

rust/worker/src/blockstore/arrow/sparse_index.rs

Lines changed: 6 additions & 1 deletion

@@ -341,8 +341,13 @@ impl SparseIndex {
         forward.remove(&old_start_key);
         if old_start_key == SparseIndexDelimiter::Start {
             forward.insert(SparseIndexDelimiter::Start, new_block_id);
+            reverse.insert(new_block_id, SparseIndexDelimiter::Start);
         } else {
-            forward.insert(SparseIndexDelimiter::Key(new_start_key), new_block_id);
+            forward.insert(
+                SparseIndexDelimiter::Key(new_start_key.clone()),
+                new_block_id,
+            );
+            reverse.insert(new_block_id, SparseIndexDelimiter::Key(new_start_key));
         }
     }
 }
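
The hunk above is the replace_block fix called out in the description: the reverse map (block id → start key) is now updated alongside the forward map (start key → block id). Below is a loose Python illustration of the invariant being restored; it is not the Rust implementation, and the dict-based maps and the function signature are stand-ins for this sketch only.

```python
def replace_block(forward, reverse, old_block_id, new_block_id, new_start_key):
    """Swap old_block_id for new_block_id while keeping both maps in sync."""
    old_start_key = reverse.pop(old_block_id)
    forward.pop(old_start_key, None)
    forward[new_start_key] = new_block_id
    reverse[new_block_id] = new_start_key  # updating the reverse map was the missing step

    # Invariant: forward and reverse remain exact inverses of each other.
    assert all(reverse[block_id] == key for key, block_id in forward.items())
```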

rust/worker/src/execution/operators/count_records.rs

Lines changed: 1 addition & 0 deletions

@@ -78,6 +78,7 @@ impl Operator<CountRecordsInput, CountRecordsOutput> for CountRecordsOperator {
             Err(e) => {
                 match *e {
                     RecordSegmentReaderCreationError::UninitializedSegment => {
+                        tracing::info!("[CountQueryOrchestrator] Record segment is uninitialized");
                         // This means there no compaction has occured.
                         // So we can just traverse the log records
                         // and count the number of records.
