Skip to content

Commit 5db78ca

Browse files
NicolappsConvex, Inc.
authored andcommitted
Restore read-set intersection improvements (#39187)
GitOrigin-RevId: f036bde510bc9119821c08903d179e89583d4aa6
1 parent 5f503a3 commit 5db78ca

File tree

24 files changed

+978
-493
lines changed

24 files changed

+978
-493
lines changed

Cargo.lock

Lines changed: 33 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ cfg-if = "1.0"
3838
chrono = "0.4.38"
3939
clap = { version = "^4.1.8", features = [ "derive" ] }
4040
colored = "3"
41+
compact_str = "0.9.0"
4142
const-oid = "0.9.6"
4243
criterion = "0.5"
4344
crossbeam-channel = "0.5"

crates/common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ bitvec = { workspace = true }
3838
byteorder = { workspace = true }
3939
bytes = { workspace = true }
4040
cmd_util = { path = "../cmd_util" }
41+
compact_str = { workspace = true }
4142
crossbeam-channel = { workspace = true }
4243
csf = { workspace = true }
4344
cstr = { workspace = true }

crates/common/src/bootstrap_model/index/mod.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@ pub use self::{
2626
};
2727
use crate::{
2828
paths::FieldPath,
29-
types::TableName,
29+
types::{
30+
IndexDescriptor,
31+
TableName,
32+
},
3033
};
3134

3235
/// Table name for Index data.
@@ -38,6 +41,10 @@ pub static TABLE_ID_FIELD_NAME: LazyLock<IdentifierFieldName> =
3841
pub static TABLE_ID_FIELD_PATH: LazyLock<FieldPath> =
3942
LazyLock::new(|| FieldPath::new(vec![TABLE_ID_FIELD_NAME.clone()]).unwrap());
4043

44+
/// See `record_reads_for_writes` in `database::writes`
45+
pub static INDEX_BY_TABLE_ID_VIRTUAL_INDEX_DESCRIPTOR: LazyLock<IndexDescriptor> =
46+
LazyLock::new(|| IndexDescriptor::new("by_table_id").expect("Invalid virtual index name"));
47+
4148
pub const MAX_INDEX_FIELDS_SIZE: usize = 16;
4249
pub const MAX_TEXT_INDEX_FILTER_FIELDS_SIZE: usize = 16;
4350
pub const MAX_VECTOR_INDEX_FILTER_FIELDS_SIZE: usize = 16;
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
use std::collections::{
2+
BTreeMap,
3+
HashSet,
4+
};
5+
6+
use compact_str::CompactString;
7+
use value::{
8+
heap_size::{
9+
HeapSize,
10+
WithHeapSize,
11+
},
12+
FieldPath,
13+
};
14+
15+
#[cfg(any(test, feature = "testing"))]
16+
use crate::index::IndexKey;
17+
use crate::{
18+
index::IndexKeyBytes,
19+
query::FilterValue as SearchFilterValue,
20+
types::TabletIndexName,
21+
};
22+
23+
/// For a given document, contains all the index keys for the indexes on the
24+
/// document’s table.
25+
///
26+
/// This is used in lieu of the full document in the write log. This is most of
27+
/// the time more memory efficient (because we don’t need to store the full
28+
/// document) and faster (because we don’t need to reconstruct the index keys
29+
/// every time we need them).
30+
#[derive(Clone, Debug)]
31+
#[cfg_attr(any(test, feature = "testing"), derive(PartialEq, Eq))]
32+
pub struct DocumentIndexKeys(WithHeapSize<BTreeMap<TabletIndexName, DocumentIndexKeyValue>>);
33+
34+
impl From<BTreeMap<TabletIndexName, DocumentIndexKeyValue>> for DocumentIndexKeys {
35+
fn from(map: BTreeMap<TabletIndexName, DocumentIndexKeyValue>) -> Self {
36+
Self(map.into())
37+
}
38+
}
39+
40+
impl DocumentIndexKeys {
41+
pub fn get(&self, index_name: &TabletIndexName) -> Option<&DocumentIndexKeyValue> {
42+
self.0.get(index_name)
43+
}
44+
45+
#[cfg(any(test, feature = "testing"))]
46+
pub fn empty_for_test() -> Self {
47+
Self(Default::default())
48+
}
49+
50+
#[cfg(any(test, feature = "testing"))]
51+
pub fn with_standard_index_for_test(
52+
index_name: TabletIndexName,
53+
index_value: IndexKey,
54+
) -> Self {
55+
let mut keys = BTreeMap::new();
56+
keys.insert(
57+
index_name,
58+
DocumentIndexKeyValue::Standard(index_value.to_bytes()),
59+
);
60+
Self(keys.into())
61+
}
62+
63+
#[cfg(any(test, feature = "testing"))]
64+
pub fn with_search_index_for_test(
65+
index_name: TabletIndexName,
66+
search_field: FieldPath,
67+
search_field_value: SearchValueTokens,
68+
) -> Self {
69+
let mut keys = BTreeMap::new();
70+
keys.insert(
71+
index_name,
72+
DocumentIndexKeyValue::Search(SearchIndexKeyValue {
73+
filter_values: Default::default(),
74+
search_field,
75+
search_field_value: Some(search_field_value),
76+
}),
77+
);
78+
Self(keys.into())
79+
}
80+
81+
#[cfg(any(test, feature = "testing"))]
82+
pub fn with_search_index_for_test_with_filters(
83+
index_name: TabletIndexName,
84+
search_field: FieldPath,
85+
search_field_value: SearchValueTokens,
86+
filter_values: BTreeMap<FieldPath, SearchFilterValue>,
87+
) -> Self {
88+
let mut keys = BTreeMap::new();
89+
keys.insert(
90+
index_name,
91+
DocumentIndexKeyValue::Search(SearchIndexKeyValue {
92+
filter_values: filter_values.into(),
93+
search_field,
94+
search_field_value: Some(search_field_value),
95+
}),
96+
);
97+
Self(keys.into())
98+
}
99+
}
100+
101+
impl HeapSize for DocumentIndexKeys {
102+
fn heap_size(&self) -> usize {
103+
self.0.heap_size()
104+
}
105+
}
106+
107+
#[derive(Clone, Debug)]
108+
#[cfg_attr(any(test, feature = "testing"), derive(PartialEq, Eq))]
109+
pub enum DocumentIndexKeyValue {
110+
Standard(IndexKeyBytes),
111+
Search(SearchIndexKeyValue),
112+
// We don’t store index key values for vector indexes because they don’t
113+
// support subscriptions.
114+
}
115+
116+
#[derive(Clone, Debug)]
117+
#[cfg_attr(any(test, feature = "testing"), derive(PartialEq, Eq))]
118+
pub struct SearchIndexKeyValue {
119+
/// These are values for the fields present in the must
120+
/// clauses of the search index.
121+
pub filter_values: WithHeapSize<BTreeMap<FieldPath, SearchFilterValue>>,
122+
pub search_field: FieldPath,
123+
pub search_field_value: Option<SearchValueTokens>,
124+
}
125+
126+
impl HeapSize for DocumentIndexKeyValue {
127+
fn heap_size(&self) -> usize {
128+
match self {
129+
DocumentIndexKeyValue::Standard(index_key) => index_key.heap_size(),
130+
DocumentIndexKeyValue::Search(SearchIndexKeyValue {
131+
filter_values,
132+
search_field,
133+
search_field_value,
134+
}) => {
135+
filter_values.heap_size()
136+
+ search_field.heap_size()
137+
+ search_field_value.heap_size()
138+
},
139+
}
140+
}
141+
}
142+
143+
/// The tokens in some textual value (search field of a full-text search index).
144+
///
145+
/// Tokens are not sorted in any particular order, but must be unique.
146+
/// (Uniqueness is not strictly necessary here, but we’d like to avoid
147+
/// iterating over the same token multiple times.)
148+
#[derive(Clone, Debug)]
149+
pub struct SearchValueTokens(WithHeapSize<Box<[CompactString]>>);
150+
151+
impl From<HashSet<CompactString>> for SearchValueTokens {
152+
fn from(value: HashSet<CompactString>) -> Self {
153+
let tokens: Box<[CompactString]> = value.into_iter().collect();
154+
Self(tokens.into())
155+
}
156+
}
157+
158+
impl SearchValueTokens {
159+
pub fn for_each_token<F>(&self, prefix: bool, mut for_each: F)
160+
where
161+
F: FnMut(&str),
162+
{
163+
if prefix {
164+
// We're inverting prefix match here by constructing all possible prefixes for
165+
// each term in the document if at least one prefix search exists in
166+
// the readset (resulting in this method being called with prefix:
167+
// true).
168+
//
169+
// This lets callers search into tries containing the actual search term with
170+
// dfa prefixes set to false and still match based on prefix.
171+
// Searching a trie with the document tokens is bounded by the size
172+
// of the document, which is expected to be significantly smaller
173+
// than the total number of subscriptions for busy backends.
174+
for token in self.calculate_prefixes() {
175+
for_each(token);
176+
}
177+
} else {
178+
for token in self.0.iter() {
179+
for_each(token);
180+
}
181+
}
182+
}
183+
184+
fn calculate_prefixes(&self) -> impl Iterator<Item = &str> + '_ {
185+
let mut set: HashSet<&str> = HashSet::new();
186+
187+
for token in self.0.iter() {
188+
if !set.insert(token) {
189+
continue;
190+
}
191+
for (i, _) in token.char_indices()
192+
// Skip the first index which is always 0
193+
.skip(1)
194+
{
195+
// After that we get all prefixes except for the complete
196+
// token (because `..i` always skips the last character
197+
// bytes).
198+
set.insert(&token[..i]);
199+
}
200+
}
201+
set.into_iter()
202+
}
203+
}
204+
205+
#[cfg(any(test, feature = "testing"))]
206+
impl SearchValueTokens {
207+
pub fn from_iter_for_test<T: IntoIterator<Item = String>>(iter: T) -> Self {
208+
let tokens: Box<[CompactString]> = iter.into_iter().map(|x| x.into()).collect();
209+
Self(tokens.into())
210+
}
211+
}
212+
213+
#[cfg(any(test, feature = "testing"))]
214+
impl PartialEq for SearchValueTokens {
215+
fn eq(&self, other: &Self) -> bool {
216+
if self.0.len() != other.0.len() {
217+
return false;
218+
}
219+
220+
let sorted_this: std::collections::BTreeSet<CompactString> =
221+
self.0.clone().into_iter().collect();
222+
let sorted_other: std::collections::BTreeSet<CompactString> =
223+
other.0.clone().into_iter().collect();
224+
sorted_this == sorted_other
225+
}
226+
}
227+
228+
#[cfg(any(test, feature = "testing"))]
229+
impl Eq for SearchValueTokens {}
230+
231+
impl HeapSize for SearchValueTokens {
232+
fn heap_size(&self) -> usize {
233+
self.0.heap_size()
234+
}
235+
}

0 commit comments

Comments
 (0)