Skip to content

Commit 98ced0f

Browse files
Restricted stemming to the multilang feature (quickwit-oss#6085)
Co-authored-by: fulmicoton <paul.masurel@datadoghq.com>
1 parent 33f6b65 commit 98ced0f

File tree

3 files changed

+14
-11
lines changed

3 files changed

+14
-11
lines changed

quickwit/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -359,7 +359,6 @@ tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "d904630", d
359359
"lz4-compression",
360360
"mmap",
361361
"quickwit",
362-
"stemmer",
363362
"zstd-compression",
364363
"columnar-zstd-compression",
365364
] }

quickwit/quickwit-query/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@ multilang = [
4848
"lindera-dictionary",
4949
"lindera-tokenizer",
5050
"whichlang",
51+
"tantivy/stemmer",
5152
]
5253

5354
[[bench]]

quickwit/quickwit-query/src/tokenizers/mod.rs

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,8 @@ mod tokenizer_manager;
2020

2121
use once_cell::sync::Lazy;
2222
use tantivy::tokenizer::{
23-
AsciiFoldingFilter, Language, LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer,
24-
Stemmer, TextAnalyzer, WhitespaceTokenizer,
23+
AsciiFoldingFilter, LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, TextAnalyzer,
24+
WhitespaceTokenizer,
2525
};
2626

2727
use self::chinese_compatible::ChineseTokenizer;
@@ -58,14 +58,17 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
5858
.filter(LowerCaser)
5959
.build();
6060
tokenizer_manager.register("default", default_tokenizer, true);
61-
62-
let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
63-
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
64-
.filter(LowerCaser)
65-
.filter(Stemmer::new(Language::English))
66-
.build();
67-
tokenizer_manager.register("en_stem", en_stem_tokenizer, true);
68-
61+
#[cfg(feature = "multilang")]
62+
{
63+
let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
64+
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
65+
.filter(LowerCaser)
66+
.filter(tantivy::tokenizer::Stemmer::new(
67+
tantivy::tokenizer::Language::English,
68+
))
69+
.build();
70+
tokenizer_manager.register("en_stem", en_stem_tokenizer, true);
71+
}
6972
tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false);
7073

7174
let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer)

0 commit comments

Comments
 (0)