
Commit 798ad68

Bump tantivy version, and add phrase prefix query support. (#3543)
1 parent 6adf4bd commit 798ad68

8 files changed, +76 -39 lines changed

quickwit/Cargo.lock

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default.

quickwit/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -221,7 +221,7 @@ quickwit-serve = { version = "0.6.0", path = "./quickwit-serve" }
 quickwit-storage = { version = "0.6.0", path = "./quickwit-storage" }
 quickwit-telemetry = { version = "0.6.0", path = "./quickwit-telemetry" }
 
-tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "7ee78bd", default-features = false, features = [
+tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "924fc70", default-features = false, features = [
     "mmap",
     "lz4-compression",
     "zstd-compression",

quickwit/quickwit-doc-mapper/src/doc_mapper.rs

Lines changed: 1 addition & 1 deletion
@@ -388,7 +388,7 @@ mod tests {
         let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap();
         assert_eq!(
             format!("{query:?}"),
-            r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=U64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"#
+            r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"#
         );
     }
 
quickwit/quickwit-query/src/elastic_query_dsl/phrase_prefix_query.rs

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ impl ConvertableToQueryAst for MatchPhrasePrefix {
         let phrase_prefix_query_ast = query_ast::PhrasePrefixQuery {
             field: self.field,
             phrase: query,
-            analyzer,
+            params: analyzer,
             max_expansions,
         };
         Ok(phrase_prefix_query_ast.into())

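Note: `MatchPhrasePrefix` is Quickwit's ES-compatible representation of Elasticsearch's `match_phrase_prefix` clause. A minimal standalone sketch of the request-body shape it targets (the `title` field and the `serde_json` round-trip are illustrative assumptions, not part of this commit):

```rust
// Illustrative only: an Elasticsearch-style match_phrase_prefix clause.
// Parsing through serde_json just shows the shape; Quickwit's own DSL
// types perform the real deserialization.
fn main() {
    let body = r#"{
        "match_phrase_prefix": {
            "title": { "query": "quick brown f", "max_expansions": 50 }
        }
    }"#;
    let value: serde_json::Value = serde_json::from_str(body).unwrap();
    assert!(value.get("match_phrase_prefix").is_some());
}
```

`max_expansions` caps how many indexed terms the trailing prefix may expand to; Elasticsearch defaults it to 50, which matches the `DEFAULT_PHRASE_QUERY_MAX_EXPANSION` constant introduced in `user_input_query.rs` below.
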
quickwit/quickwit-query/src/query_ast/full_text_query.rs

Lines changed: 2 additions & 2 deletions
@@ -68,7 +68,7 @@ impl FullTextParams {
         let text_indexing_options = json_options
             .get_text_indexing_options()
             .with_context(|| format!("Json field text `{}` is not indexed", json_path))?;
-        let text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?;
+        let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?;
         let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text);
         let mut tokens = Vec::new();
         let mut term = Term::with_capacity(100);
@@ -91,7 +91,7 @@
         text: &str,
         text_field_indexing: &TextFieldIndexing,
     ) -> anyhow::Result<Vec<(usize, Term)>> {
-        let text_analyzer: TextAnalyzer = self.text_analyzer(text_field_indexing)?;
+        let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_field_indexing)?;
         let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text);
         let mut tokens = Vec::new();
         token_stream.process(&mut |token| {

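The new `mut` bindings follow from the tantivy bump: `TextAnalyzer::token_stream` now takes `&mut self`. A minimal standalone sketch of the new calling convention (the analyzer chain and input text are chosen arbitrarily):

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

fn main() {
    // token_stream takes &mut self in the bumped tantivy, so the
    // analyzer binding itself must be mutable.
    let mut text_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(255))
        .filter(LowerCaser)
        .build();
    let mut token_stream = text_analyzer.token_stream("Phrase PREFIX queries");
    while let Some(token) = token_stream.next() {
        println!("{:?} at position {}", token.text, token.position);
    }
}
```
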
quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs

Lines changed: 4 additions & 4 deletions
@@ -36,7 +36,7 @@ pub struct PhrasePrefixQuery {
     pub field: String,
     pub phrase: String,
     pub max_expansions: u32,
-    pub analyzer: FullTextParams,
+    pub params: FullTextParams,
 }
 
 impl PhrasePrefixQuery {
@@ -63,7 +63,7 @@ impl PhrasePrefixQuery {
             ));
         }
 
-        let terms = self.analyzer.tokenize_text_into_terms(
+        let terms = self.params.tokenize_text_into_terms(
             field,
             &self.phrase,
             text_field_indexing,
@@ -85,7 +85,7 @@
                 .to_string(),
             ));
         }
-        let terms = self.analyzer.tokenize_text_into_terms_json(
+        let terms = self.params.tokenize_text_into_terms_json(
             field,
             json_path,
             &self.phrase,
@@ -116,7 +116,7 @@ impl BuildTantivyAst for PhrasePrefixQuery {
         let (_, terms) = self.get_terms(schema)?;
 
         if terms.is_empty() {
-            if self.analyzer.zero_terms_query.is_none() {
+            if self.params.zero_terms_query.is_none() {
                 Ok(TantivyQueryAst::match_none())
             } else {
                 Ok(TantivyQueryAst::match_all())

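The `BuildTantivyAst` impl above lowers this node to tantivy's `PhrasePrefixQuery`, in which every term but the last must match exactly and the last is treated as a prefix. A hedged standalone sketch of that target query (schema, field, and terms are invented for illustration; API names as of tantivy 0.20):

```rust
use tantivy::query::PhrasePrefixQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    // "quick" and "brown" must match exactly; "f" is a prefix that may
    // expand to "fox", "fog", ... up to max_expansions candidates.
    let terms = vec![
        Term::from_field_text(body, "quick"),
        Term::from_field_text(body, "brown"),
        Term::from_field_text(body, "f"),
    ];
    let mut query = PhrasePrefixQuery::new(terms);
    query.set_max_expansions(50);
}
```
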
quickwit/quickwit-query/src/query_ast/user_input_query.rs

Lines changed: 31 additions & 0 deletions
@@ -32,6 +32,8 @@ use crate::query_ast::tantivy_query_ast::TantivyQueryAst;
 use crate::query_ast::{self, BuildTantivyAst, FullTextMode, FullTextParams, QueryAst};
 use crate::{BooleanOperand, InvalidQuery, JsonLiteral};
 
+const DEFAULT_PHRASE_QUERY_MAX_EXPANSION: u32 = 50;
+
 /// A query expressed in the tantivy query grammar DSL.
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 pub struct UserInputQuery {
@@ -182,6 +184,7 @@ fn convert_user_input_literal(
     let UserInputLiteral {
         field_name,
         phrase,
+        prefix,
         delimiter,
         slop,
     } = user_input_literal;
@@ -211,6 +214,15 @@
     let mut phrase_queries: Vec<QueryAst> = field_names
         .into_iter()
         .map(|field_name| {
+            if prefix {
+                return query_ast::PhrasePrefixQuery {
+                    field: field_name,
+                    phrase: phrase.clone(),
+                    params: full_text_params.clone(),
+                    max_expansions: DEFAULT_PHRASE_QUERY_MAX_EXPANSION,
+                }
+                .into();
+            }
             query_ast::FullTextQuery {
                 field: field_name,
                 text: phrase.clone(),
@@ -309,6 +321,25 @@
         );
     }
 
+    #[test]
+    fn test_user_input_query_phrase_with_prefix() {
+        let ast = UserInputQuery {
+            user_text: "field:\"hello\"*".to_string(),
+            default_fields: None,
+            default_operator: BooleanOperand::And,
+        }
+        .parse_user_query(&[])
+        .unwrap();
+        let QueryAst::PhrasePrefix(phrase_prefix_query) = ast else { panic!() };
+        assert_eq!(&phrase_prefix_query.field, "field");
+        assert_eq!(&phrase_prefix_query.phrase, "hello");
+        assert_eq!(phrase_prefix_query.max_expansions, 50);
+        assert_eq!(
+            phrase_prefix_query.params.mode,
+            FullTextMode::Phrase { slop: 0 }
+        );
+    }
+
     #[test]
     fn test_user_input_query_override_default_fields() {
         let ast = UserInputQuery {

quickwit/quickwit-query/src/tokenizers.rs

Lines changed: 24 additions & 18 deletions
@@ -26,7 +26,7 @@ use tantivy::tokenizer::{
 };
 
 fn create_quickwit_tokenizer_manager() -> TokenizerManager {
-    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(RemoveLongFilter::limit(255))
         .build();
 
@@ -41,14 +41,14 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {
 
     tokenizer_manager.register(
         "default",
-        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
             .filter(RemoveLongFilter::limit(255))
             .filter(LowerCaser)
             .build(),
     );
     tokenizer_manager.register(
         "en_stem",
-        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+        TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
             .filter(RemoveLongFilter::limit(255))
             .filter(LowerCaser)
             .filter(tantivy::tokenizer::Stemmer::new(
@@ -61,11 +61,11 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {
 }
 
 fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager {
-    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(RemoveLongFilter::limit(255))
         .build();
 
-    let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer)
+    let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(LowerCaser)
         .filter(RemoveLongFilter::limit(255))
         .build();
@@ -82,7 +82,7 @@ struct ChineseTokenizer;
 impl Tokenizer for ChineseTokenizer {
     type TokenStream<'a> = ChineseTokenStream<'a>;
 
-    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+    fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
         ChineseTokenStream {
             text,
             last_char: None,
@@ -209,21 +209,27 @@
         sand in my face
         "#;
 
-        let tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
-        let mut haiku_stream = tokenizer.token_stream(my_haiku);
-        assert!(haiku_stream.advance());
-        assert!(!haiku_stream.advance());
-        let my_too_long_text = vec!["a".repeat(255)].join("");
-        assert!(!tokenizer.token_stream(&my_too_long_text).advance());
-        let my_long_text = vec!["a".repeat(254)].join("");
-        assert!(tokenizer.token_stream(&my_long_text).advance());
+        let mut tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
+        {
+            let mut haiku_stream = tokenizer.token_stream(my_haiku);
+            assert!(haiku_stream.advance());
+            assert!(!haiku_stream.advance());
+        }
+        {
+            let my_too_long_text = vec!["a".repeat(255)].join("");
+            assert!(!tokenizer.token_stream(&my_too_long_text).advance());
+        }
+        {
+            let my_long_text = vec!["a".repeat(254)].join("");
+            assert!(tokenizer.token_stream(&my_long_text).advance());
+        }
     }
 
     #[test]
     fn test_chinese_tokenizer() {
         let text = "Hello world, 你好世界, bonjour monde";
 
-        let tokenizer = get_quickwit_tokenizer_manager()
+        let mut tokenizer = get_quickwit_tokenizer_manager()
             .get("chinese_compatible")
             .unwrap();
         let mut text_stream = tokenizer.token_stream(text);
@@ -300,7 +306,7 @@
     fn test_chinese_tokenizer_no_space() {
         let text = "Hello你好bonjour";
 
-        let tokenizer = get_quickwit_tokenizer_manager()
+        let mut tokenizer = get_quickwit_tokenizer_manager()
             .get("chinese_compatible")
             .unwrap();
         let mut text_stream = tokenizer.token_stream(text);
@@ -347,8 +353,8 @@
     proptest::proptest! {
         #[test]
         fn test_proptest_ascii_default_chinese_equal(text in "[ -~]{0,64}") {
-            let cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
-            let default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();
+            let mut cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
+            let mut default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();
 
             let mut text_stream = cn_tok.token_stream(&text);
 

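The `Tokenizer` trait change visible in the `ChineseTokenizer` hunk above (`token_stream` now takes `&mut self`, with a generic-associated `TokenStream<'a>`) applies to any custom tokenizer. A standalone sketch of a toy whitespace tokenizer against the updated trait (the tokenizer itself is invented for illustration; only the trait shape comes from this commit):

```rust
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

// A toy tokenizer: splits on Unicode whitespace. Illustrative only.
#[derive(Clone)]
struct WhitespaceTokenizer;

struct WhitespaceTokenStream<'a> {
    text: &'a str,
    offset: usize,
    position: usize,
    token: Token,
}

impl Tokenizer for WhitespaceTokenizer {
    type TokenStream<'a> = WhitespaceTokenStream<'a>;

    // Note: &mut self, matching the signature change in the diff above.
    fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
        WhitespaceTokenStream { text, offset: 0, position: 0, token: Token::default() }
    }
}

impl TokenStream for WhitespaceTokenStream<'_> {
    fn advance(&mut self) -> bool {
        // Skip whitespace, then emit the next run of non-whitespace
        // characters as a single token.
        let rest = &self.text[self.offset..];
        let Some(start_in_rest) = rest.find(|c: char| !c.is_whitespace()) else {
            return false;
        };
        let start = self.offset + start_in_rest;
        let end = self.text[start..]
            .find(char::is_whitespace)
            .map_or(self.text.len(), |i| start + i);
        self.token.offset_from = start;
        self.token.offset_to = end;
        self.token.position = self.position;
        self.token.text.clear();
        self.token.text.push_str(&self.text[start..end]);
        self.position += 1;
        self.offset = end;
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

fn main() {
    let mut tokenizer = WhitespaceTokenizer;
    let mut stream = tokenizer.token_stream("phrase prefix queries");
    while stream.advance() {
        println!("{:?}", stream.token());
    }
}
```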