|
| 1 | +use std::{ |
| 2 | + collections::BTreeSet, |
| 3 | + path::Path, |
| 4 | +}; |
| 5 | + |
| 6 | +use common::persistence::DocumentRevisionStream; |
| 7 | +use futures::TryStreamExt; |
| 8 | +use tantivy::{ |
| 9 | + IndexBuilder, |
| 10 | + SingleSegmentIndexWriter, |
| 11 | +}; |
| 12 | +use text_search::tracker::MemoryIdAndDeletionTracker; |
| 13 | + |
| 14 | +use crate::{ |
| 15 | + constants::CONVEX_EN_TOKENIZER, |
| 16 | + convex_en, |
| 17 | + TantivySearchIndexSchema, |
| 18 | +}; |
| 19 | + |
/// The maximum size of a segment in bytes. 10MB.
///
/// Passed as the second argument to `SingleSegmentIndexWriter::new` in
/// `build_index`.
// NOTE(review): tantivy appears to treat that argument as a memory budget for
// the writer rather than a hard cap on the segment's size — confirm and
// consider rewording/renaming.
#[allow(dead_code)]
const SEGMENT_MAX_SIZE_BYTES: usize = 10_000_000;

/// Path, relative to the index directory, that `build_index` passes as the
/// first argument of the tracker's `write` call.
#[allow(dead_code)]
pub(crate) const ID_TRACKER_PATH: &str = "id_tracker";
/// Path, relative to the index directory, that `build_index` passes as the
/// second argument of the tracker's `write` call.
#[allow(dead_code)]
pub(crate) const DELETED_TANTIVY_IDS_PATH: &str = "deleted_tantivy_ids";
/// Path, relative to the index directory, that `build_index` passes as the
/// third argument of the tracker's `write` call.
#[allow(dead_code)]
pub(crate) const DELETED_TERMS_PATH: &str = "deleted_terms";
| 30 | + |
| 31 | +#[allow(dead_code)] |
| 32 | +pub async fn build_index( |
| 33 | + // Stream of document revisions in descending timestamp order. |
| 34 | + revision_stream: DocumentRevisionStream<'_>, |
| 35 | + tantivy_schema: TantivySearchIndexSchema, |
| 36 | + dir: &Path, |
| 37 | +) -> anyhow::Result<()> { |
| 38 | + let index = IndexBuilder::new() |
| 39 | + .schema(tantivy_schema.schema.clone()) |
| 40 | + .create_in_dir(dir)?; |
| 41 | + index |
| 42 | + .tokenizers() |
| 43 | + .register(CONVEX_EN_TOKENIZER, convex_en()); |
| 44 | + let mut segment_writer = SingleSegmentIndexWriter::new(index, SEGMENT_MAX_SIZE_BYTES)?; |
| 45 | + let mut tracker = MemoryIdAndDeletionTracker::default(); |
| 46 | + futures::pin_mut!(revision_stream); |
| 47 | + // Keep track of the document IDs we've seen so we can check for duplicates. |
| 48 | + // We'll discard revisions to documents that we've already seen because we are |
| 49 | + // processing in reverse timestamp order. |
| 50 | + let mut document_ids_seen = BTreeSet::new(); |
| 51 | + while let Some(revision_pair) = revision_stream.try_next().await? { |
| 52 | + let convex_id = revision_pair.id.internal_id(); |
| 53 | + if document_ids_seen.contains(&convex_id) { |
| 54 | + continue; |
| 55 | + } |
| 56 | + document_ids_seen.insert(convex_id); |
| 57 | + if let Some(new_document) = revision_pair.document() { |
| 58 | + let tantivy_document = |
| 59 | + tantivy_schema.index_into_tantivy_document(new_document, revision_pair.ts()); |
| 60 | + let doc_id = segment_writer.add_document(tantivy_document)?; |
| 61 | + tracker.set_link(convex_id, doc_id)?; |
| 62 | + } |
| 63 | + } |
| 64 | + segment_writer.finalize()?; |
| 65 | + tracker.write( |
| 66 | + dir.to_path_buf().join(ID_TRACKER_PATH), |
| 67 | + dir.to_path_buf().join(DELETED_TANTIVY_IDS_PATH), |
| 68 | + dir.to_path_buf().join(DELETED_TERMS_PATH), |
| 69 | + )?; |
| 70 | + Ok(()) |
| 71 | +} |
0 commit comments