Build tantivy index segment from revision stream (#25383)

emmaling27 · Convex, Inc. · commit 13337fd5fe4e · 2024-05-07T00:15:48.000Z
This PR adds a function to create a tantivy segment from a stream of document revisions in descending timestamp order. We keep the invariant that convex ids are unique in tantivy segments, so we don't process deletes from the revision stream; we just skip writing them.

I also added tests for delete and replace, and fixed a small bug where the set of deleted documents wasn't applied (so deleted documents weren't filtered out of results) if the search query had only OR terms.

GitOrigin-RevId: 47a940c5ec09454c1c321851a183ec2f503a073b
diff --git a/crates/common/src/deleted_bitset.rs b/crates/common/src/deleted_bitset.rs
@@ -33,7 +33,7 @@ use byteorder::{
 /// - bitset blocks (dense array of little-endian u64s): bitset contents
 pub const DELETED_BITSET_VERSION: u8 = 1;
 
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 pub struct DeletedBitset {
     deleted: BitVec,
     num_deleted: usize,
diff --git a/crates/common/src/persistence.rs b/crates/common/src/persistence.rs
@@ -38,6 +38,7 @@ use crate::{
     interval::Interval,
     knobs::DEFAULT_DOCUMENTS_PAGE_SIZE,
     metrics::static_repeatable_ts_timer,
+    persistence_helpers::RevisionPair,
     query::Order,
     runtime::Runtime,
     types::{
@@ -54,6 +55,8 @@ pub type DocumentLogEntry = (Timestamp, InternalDocumentId, Option<ResolvedDocum
 
 pub type DocumentStream<'a> = BoxStream<'a, anyhow::Result<DocumentLogEntry>>;
 
+pub type DocumentRevisionStream<'a> = BoxStream<'a, anyhow::Result<RevisionPair>>;
+
 /// No tombstones included
 pub type LatestDocumentStream<'a> = BoxStream<'a, anyhow::Result<(Timestamp, ResolvedDocument)>>;
 
diff --git a/crates/search/src/convex_query.rs b/crates/search/src/convex_query.rs
@@ -60,15 +60,11 @@ impl ConvexSearchQuery {
             .into_iter()
             .map(|filter_term| TermQuery::new(filter_term, IndexRecordOption::Basic))
             .collect();
-        if and_queries.is_empty() {
-            Box::new(or_query)
-        } else {
-            Box::new(Self {
-                or_query,
-                and_queries,
-                deleted_documents,
-            })
-        }
+        Box::new(Self {
+            or_query,
+            and_queries,
+            deleted_documents,
+        })
     }
 }
 
@@ -118,10 +114,14 @@ impl Weight for ConvexSearchWeight {
             .iter()
             .map(|filter_weight| filter_weight.scorer(reader, boost))
             .collect::<tantivy::Result<Vec<_>>>()?;
-        let query_scorer = intersect_scorers_and_use_one_for_scores(
-            self.or_weight.scorer(reader, boost)?,
-            intersect_scorers(and_scorers),
-        );
+        let query_scorer = if !and_scorers.is_empty() {
+            intersect_scorers_and_use_one_for_scores(
+                self.or_weight.scorer(reader, boost)?,
+                intersect_scorers(and_scorers),
+            )
+        } else {
+            self.or_weight.scorer(reader, boost)?
+        };
         Ok(Box::new(ExcludeDeleted::new(
             query_scorer,
             self.deleted_documents.clone(),
diff --git a/crates/search/src/incremental_index.rs b/crates/search/src/incremental_index.rs
@@ -0,0 +1,71 @@
+use std::{
+    collections::BTreeSet,
+    path::Path,
+};
+
+use common::persistence::DocumentRevisionStream;
+use futures::TryStreamExt;
+use tantivy::{
+    IndexBuilder,
+    SingleSegmentIndexWriter,
+};
+use text_search::tracker::MemoryIdAndDeletionTracker;
+
+use crate::{
+    constants::CONVEX_EN_TOKENIZER,
+    convex_en,
+    TantivySearchIndexSchema,
+};
+
+/// The maximum size of a segment in bytes. 10MB. NOTE(review): passed to `SingleSegmentIndexWriter::new`; confirm it caps segment size.
+#[allow(dead_code)]
+const SEGMENT_MAX_SIZE_BYTES: usize = 10_000_000;
+
+#[allow(dead_code)]
+pub(crate) const ID_TRACKER_PATH: &str = "id_tracker";
+#[allow(dead_code)]
+pub(crate) const DELETED_TANTIVY_IDS_PATH: &str = "deleted_tantivy_ids";
+#[allow(dead_code)]
+pub(crate) const DELETED_TERMS_PATH: &str = "deleted_terms";
+
+#[allow(dead_code)]
+pub async fn build_index(
+    // Stream of document revisions in descending timestamp order.
+    revision_stream: DocumentRevisionStream<'_>,
+    tantivy_schema: TantivySearchIndexSchema,
+    dir: &Path,
+) -> anyhow::Result<()> {
+    let index = IndexBuilder::new()
+        .schema(tantivy_schema.schema.clone())
+        .create_in_dir(dir)?;
+    index
+        .tokenizers()
+        .register(CONVEX_EN_TOKENIZER, convex_en());
+    let mut segment_writer = SingleSegmentIndexWriter::new(index, SEGMENT_MAX_SIZE_BYTES)?;
+    let mut tracker = MemoryIdAndDeletionTracker::default();
+    futures::pin_mut!(revision_stream);
+    // Track the document IDs we've seen. Revisions arrive in descending timestamp order, so the
+    // first revision seen for an ID is its latest and any later (older) one is skipped. The ID
+    // is recorded even when that revision has no document, so nothing is written for that ID.
+    let mut document_ids_seen = BTreeSet::new();
+    while let Some(revision_pair) = revision_stream.try_next().await? {
+        let convex_id = revision_pair.id.internal_id();
+        if document_ids_seen.contains(&convex_id) {
+            continue;
+        }
+        document_ids_seen.insert(convex_id);
+        if let Some(new_document) = revision_pair.document() {
+            let tantivy_document =
+                tantivy_schema.index_into_tantivy_document(new_document, revision_pair.ts());
+            let doc_id = segment_writer.add_document(tantivy_document)?;
+            tracker.set_link(convex_id, doc_id)?;
+        }
+    }
+    segment_writer.finalize()?;
+    tracker.write(
+        dir.to_path_buf().join(ID_TRACKER_PATH),
+        dir.to_path_buf().join(DELETED_TANTIVY_IDS_PATH),
+        dir.to_path_buf().join(DELETED_TERMS_PATH),
+    )?;
+    Ok(())
+}
diff --git a/crates/search/src/lib.rs b/crates/search/src/lib.rs
@@ -17,6 +17,7 @@ mod constants;
 mod convex_query;
 pub mod disk_index;
 pub mod fragmented_segment;
+mod incremental_index;
 mod intersection;
 mod levenshtein_dfa;
 mod memory_index;
@@ -178,6 +179,15 @@ impl DocumentTerm {
     }
 }
 
+impl From<DocumentTerm> for Term {
+    fn from(doc_term: DocumentTerm) -> Self {
+        match doc_term {
+            DocumentTerm::Search { term, .. } => term, // remaining `Search` fields are discarded
+            DocumentTerm::Filter { term } => term,
+        }
+    }
+}
+
 pub type EditDistance = u8;
 
 /// Used to represent the position of a term within a document. For now, this
diff --git a/crates/search/src/searcher/searcher.rs b/crates/search/src/searcher/searcher.rs
diff --git a/crates/text_search/src/tracker.rs b/crates/text_search/src/tracker.rs