Skip to content

Commit 13337fd

Browse files
emmaling27Convex, Inc.
authored andcommitted
Build tantivy index segment from revision stream (#25383)
This PR adds a function to create a tantivy segment from a stream of document revisions in descending timestamp order. We keep the invariant that convex ids are unique in tantivy segments, so we don't process deletes from the revision stream, we just skip writing them. I also added tests for delete and replace and fixed a small bug where deleted documents weren't used if you only had OR terms in the search query. GitOrigin-RevId: 47a940c5ec09454c1c321851a183ec2f503a073b
1 parent 4befed9 commit 13337fd

File tree

7 files changed

+383
-86
lines changed

7 files changed

+383
-86
lines changed

crates/common/src/deleted_bitset.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ use byteorder::{
3333
/// - bitset blocks (dense array of little-endian u64s): bitset contents
3434
pub const DELETED_BITSET_VERSION: u8 = 1;
3535

36-
#[derive(Clone, Default)]
36+
#[derive(Clone, Default, Debug)]
3737
pub struct DeletedBitset {
3838
deleted: BitVec,
3939
num_deleted: usize,

crates/common/src/persistence.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use crate::{
3838
interval::Interval,
3939
knobs::DEFAULT_DOCUMENTS_PAGE_SIZE,
4040
metrics::static_repeatable_ts_timer,
41+
persistence_helpers::RevisionPair,
4142
query::Order,
4243
runtime::Runtime,
4344
types::{
@@ -54,6 +55,8 @@ pub type DocumentLogEntry = (Timestamp, InternalDocumentId, Option<ResolvedDocum
5455

5556
pub type DocumentStream<'a> = BoxStream<'a, anyhow::Result<DocumentLogEntry>>;
5657

58+
pub type DocumentRevisionStream<'a> = BoxStream<'a, anyhow::Result<RevisionPair>>;
59+
5760
/// No tombstones included
5861
pub type LatestDocumentStream<'a> = BoxStream<'a, anyhow::Result<(Timestamp, ResolvedDocument)>>;
5962

crates/search/src/convex_query.rs

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,11 @@ impl ConvexSearchQuery {
6060
.into_iter()
6161
.map(|filter_term| TermQuery::new(filter_term, IndexRecordOption::Basic))
6262
.collect();
63-
if and_queries.is_empty() {
64-
Box::new(or_query)
65-
} else {
66-
Box::new(Self {
67-
or_query,
68-
and_queries,
69-
deleted_documents,
70-
})
71-
}
63+
Box::new(Self {
64+
or_query,
65+
and_queries,
66+
deleted_documents,
67+
})
7268
}
7369
}
7470

@@ -118,10 +114,14 @@ impl Weight for ConvexSearchWeight {
118114
.iter()
119115
.map(|filter_weight| filter_weight.scorer(reader, boost))
120116
.collect::<tantivy::Result<Vec<_>>>()?;
121-
let query_scorer = intersect_scorers_and_use_one_for_scores(
122-
self.or_weight.scorer(reader, boost)?,
123-
intersect_scorers(and_scorers),
124-
);
117+
let query_scorer = if !and_scorers.is_empty() {
118+
intersect_scorers_and_use_one_for_scores(
119+
self.or_weight.scorer(reader, boost)?,
120+
intersect_scorers(and_scorers),
121+
)
122+
} else {
123+
self.or_weight.scorer(reader, boost)?
124+
};
125125
Ok(Box::new(ExcludeDeleted::new(
126126
query_scorer,
127127
self.deleted_documents.clone(),
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
use std::{
2+
collections::BTreeSet,
3+
path::Path,
4+
};
5+
6+
use common::persistence::DocumentRevisionStream;
7+
use futures::TryStreamExt;
8+
use tantivy::{
9+
IndexBuilder,
10+
SingleSegmentIndexWriter,
11+
};
12+
use text_search::tracker::MemoryIdAndDeletionTracker;
13+
14+
use crate::{
15+
constants::CONVEX_EN_TOKENIZER,
16+
convex_en,
17+
TantivySearchIndexSchema,
18+
};
19+
20+
/// The maximum size of a segment in bytes. 10MB.
21+
#[allow(dead_code)]
22+
const SEGMENT_MAX_SIZE_BYTES: usize = 10_000_000;
23+
24+
#[allow(dead_code)]
25+
pub(crate) const ID_TRACKER_PATH: &str = "id_tracker";
26+
#[allow(dead_code)]
27+
pub(crate) const DELETED_TANTIVY_IDS_PATH: &str = "deleted_tantivy_ids";
28+
#[allow(dead_code)]
29+
pub(crate) const DELETED_TERMS_PATH: &str = "deleted_terms";
30+
31+
#[allow(dead_code)]
32+
pub async fn build_index(
33+
// Stream of document revisions in descending timestamp order.
34+
revision_stream: DocumentRevisionStream<'_>,
35+
tantivy_schema: TantivySearchIndexSchema,
36+
dir: &Path,
37+
) -> anyhow::Result<()> {
38+
let index = IndexBuilder::new()
39+
.schema(tantivy_schema.schema.clone())
40+
.create_in_dir(dir)?;
41+
index
42+
.tokenizers()
43+
.register(CONVEX_EN_TOKENIZER, convex_en());
44+
let mut segment_writer = SingleSegmentIndexWriter::new(index, SEGMENT_MAX_SIZE_BYTES)?;
45+
let mut tracker = MemoryIdAndDeletionTracker::default();
46+
futures::pin_mut!(revision_stream);
47+
// Keep track of the document IDs we've seen so we can check for duplicates.
48+
// We'll discard revisions to documents that we've already seen because we are
49+
// processing in reverse timestamp order.
50+
let mut document_ids_seen = BTreeSet::new();
51+
while let Some(revision_pair) = revision_stream.try_next().await? {
52+
let convex_id = revision_pair.id.internal_id();
53+
if document_ids_seen.contains(&convex_id) {
54+
continue;
55+
}
56+
document_ids_seen.insert(convex_id);
57+
if let Some(new_document) = revision_pair.document() {
58+
let tantivy_document =
59+
tantivy_schema.index_into_tantivy_document(new_document, revision_pair.ts());
60+
let doc_id = segment_writer.add_document(tantivy_document)?;
61+
tracker.set_link(convex_id, doc_id)?;
62+
}
63+
}
64+
segment_writer.finalize()?;
65+
tracker.write(
66+
dir.to_path_buf().join(ID_TRACKER_PATH),
67+
dir.to_path_buf().join(DELETED_TANTIVY_IDS_PATH),
68+
dir.to_path_buf().join(DELETED_TERMS_PATH),
69+
)?;
70+
Ok(())
71+
}

crates/search/src/lib.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ mod constants;
1717
mod convex_query;
1818
pub mod disk_index;
1919
pub mod fragmented_segment;
20+
mod incremental_index;
2021
mod intersection;
2122
mod levenshtein_dfa;
2223
mod memory_index;
@@ -178,6 +179,15 @@ impl DocumentTerm {
178179
}
179180
}
180181

182+
impl From<DocumentTerm> for Term {
183+
fn from(doc_term: DocumentTerm) -> Self {
184+
match doc_term {
185+
DocumentTerm::Search { term, .. } => term,
186+
DocumentTerm::Filter { term } => term,
187+
}
188+
}
189+
}
190+
181191
pub type EditDistance = u8;
182192

183193
/// Used to represent the position of a term within a document. For now, this

0 commit comments

Comments
 (0)