Skip to content

Commit dd0ba68

Browse files
feat(tantivy): add advanced search features (#2805)
Co-authored-by: yujonglee <yujonglee.dev@gmail.com> Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 7d6dce5 commit dd0ba68

File tree

4 files changed

+232
-13
lines changed

4 files changed

+232
-13
lines changed

plugins/tantivy/js/bindings.gen.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,14 @@ async removeDocument(id: string, collection: string | null) : Promise<Result<nul
5959
/** user-defined types **/
6060

6161
export type CreatedAtFilter = { gte: number | null; lte: number | null; gt: number | null; lt: number | null; eq: number | null }
62-
export type SearchDocument = { id: string; doc_type: string; language: string | null; title: string; content: string; created_at: number }
63-
export type SearchFilters = { created_at: CreatedAtFilter | null; doc_type: string | null }
64-
export type SearchHit = { score: number; document: SearchDocument }
65-
export type SearchOptions = { fuzzy: boolean | null; distance: number | null }
62+
export type HighlightRange = { start: number; end: number }
63+
export type SearchDocument = { id: string; doc_type: string; language: string | null; title: string; content: string; created_at: number; facets?: string[] }
64+
export type SearchFilters = { created_at: CreatedAtFilter | null; doc_type: string | null; facet: string | null }
65+
export type SearchHit = { score: number; document: SearchDocument; title_snippet: Snippet | null; content_snippet: Snippet | null }
66+
export type SearchOptions = { fuzzy: boolean | null; distance: number | null; snippets: boolean | null; snippet_max_chars: number | null; phrase_slop: number | null }
6667
export type SearchRequest = { query: string; collection?: string | null; filters?: SearchFilters; limit?: number; options?: SearchOptions }
6768
export type SearchResult = { hits: SearchHit[]; count: number }
69+
export type Snippet = { fragment: string; highlights: HighlightRange[] }
6870

6971
/** tauri-specta globals **/
7072

plugins/tantivy/src/ext.rs

Lines changed: 194 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,74 @@
11
use tantivy::collector::{Count, TopDocs};
22
use tantivy::query::{
3-
BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery,
3+
BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, QueryParser, TermQuery,
44
};
5-
use tantivy::schema::IndexRecordOption;
5+
use tantivy::schema::{Facet, IndexRecordOption};
6+
use tantivy::snippet::SnippetGenerator;
67
use tantivy::{Index, ReloadPolicy, TantivyDocument, Term};
78
use tauri_plugin_path2::Path2PluginExt;
89

910
use crate::query::build_created_at_range_query;
1011
use crate::schema::{extract_search_document, get_fields};
1112
use crate::tokenizer::register_tokenizers;
1213
use crate::{
13-
CollectionConfig, CollectionIndex, IndexState, SearchDocument, SearchHit, SearchRequest,
14-
SearchResult,
14+
CollectionConfig, CollectionIndex, HighlightRange, IndexState, SearchDocument, SearchHit,
15+
SearchRequest, SearchResult, Snippet,
1516
};
1617

1718
pub fn detect_language(text: &str) -> hypr_language::Language {
1819
hypr_language::detect(text)
1920
}
2021

22+
/// Splits a raw search query into quoted phrases and unquoted terms.
///
/// Text enclosed in double quotes (`"..."`) is returned as a phrase, trimmed of
/// surrounding whitespace; empty or whitespace-only phrases are dropped. Text
/// outside quotes is split on whitespace into individual terms. An unterminated
/// opening quote treats the rest of the query as a phrase.
///
/// Returns `(phrases, regular_terms)`; every returned slice borrows from `query`.
fn parse_query_parts(query: &str) -> (Vec<&str>, Vec<&str>) {
    let mut phrases = Vec::new();
    let mut regular_terms = Vec::new();

    // Splitting on the quote character yields alternating segments: even
    // positions are outside quotes, odd positions are inside. `str::split`
    // works on byte offsets internally, so multi-byte UTF-8 input is sliced
    // correctly (indexing the string with char counts would mis-slice or panic).
    for (position, segment) in query.split('"').enumerate() {
        let inside_quotes = position % 2 == 1;
        if inside_quotes {
            let phrase = segment.trim();
            if !phrase.is_empty() {
                phrases.push(phrase);
            }
        } else {
            // `split_whitespace` never yields empty items.
            regular_terms.extend(segment.split_whitespace());
        }
    }

    (phrases, regular_terms)
}
71+
2172
pub struct Tantivy<'a, R: tauri::Runtime, M: tauri::Manager<R>> {
2273
manager: &'a M,
2374
_runtime: std::marker::PhantomData<fn() -> R>,
@@ -95,18 +146,80 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
95146
let searcher = reader.searcher();
96147

97148
let use_fuzzy = request.options.fuzzy.unwrap_or(false);
149+
let phrase_slop = request.options.phrase_slop.unwrap_or(0);
98150

99151
// Title boost factor (3x) to match Orama's title:3, content:1 behavior
100152
const TITLE_BOOST: f32 = 3.0;
101153

102154
let mut combined_query: Box<dyn Query> = if use_fuzzy {
103155
let distance = request.options.distance.unwrap_or(1);
104-
let terms: Vec<&str> = request.query.split_whitespace().collect();
156+
157+
// Parse query to extract phrases (quoted) and regular terms
158+
let (phrases, regular_terms) = parse_query_parts(&request.query);
159+
105160
let mut term_queries: Vec<(Occur, Box<dyn Query>)> = Vec::new();
106161

107-
// For each term, create a Must clause that requires the term to match
108-
// in either title OR content (with title boosted)
109-
for term in terms {
162+
// Handle quoted phrases with PhraseQuery
163+
for phrase in phrases {
164+
let words: Vec<&str> = phrase.split_whitespace().collect();
165+
if words.len() > 1 {
166+
// Create phrase query for title field
167+
let title_terms: Vec<Term> = words
168+
.iter()
169+
.map(|w| Term::from_field_text(fields.title, w))
170+
.collect();
171+
let mut title_phrase = PhraseQuery::new(title_terms);
172+
title_phrase.set_slop(phrase_slop);
173+
174+
// Create phrase query for content field
175+
let content_terms: Vec<Term> = words
176+
.iter()
177+
.map(|w| Term::from_field_text(fields.content, w))
178+
.collect();
179+
let mut content_phrase = PhraseQuery::new(content_terms);
180+
content_phrase.set_slop(phrase_slop);
181+
182+
// Boost title matches by 3x
183+
let boosted_title: Box<dyn Query> =
184+
Box::new(BoostQuery::new(Box::new(title_phrase), TITLE_BOOST));
185+
let content_query: Box<dyn Query> = Box::new(content_phrase);
186+
187+
// Phrase must match in at least one field (title OR content)
188+
let phrase_field_query = BooleanQuery::new(vec![
189+
(Occur::Should, boosted_title),
190+
(Occur::Should, content_query),
191+
]);
192+
193+
term_queries.push((Occur::Must, Box::new(phrase_field_query)));
194+
} else if !words.is_empty() {
195+
// Single word "phrase" - treat as regular term
196+
let word = words[0];
197+
let title_fuzzy = FuzzyTermQuery::new(
198+
Term::from_field_text(fields.title, word),
199+
distance,
200+
true,
201+
);
202+
let content_fuzzy = FuzzyTermQuery::new(
203+
Term::from_field_text(fields.content, word),
204+
distance,
205+
true,
206+
);
207+
208+
let boosted_title: Box<dyn Query> =
209+
Box::new(BoostQuery::new(Box::new(title_fuzzy), TITLE_BOOST));
210+
let content_query: Box<dyn Query> = Box::new(content_fuzzy);
211+
212+
let term_field_query = BooleanQuery::new(vec![
213+
(Occur::Should, boosted_title),
214+
(Occur::Should, content_query),
215+
]);
216+
217+
term_queries.push((Occur::Must, Box::new(term_field_query)));
218+
}
219+
}
220+
221+
// Handle regular (unquoted) terms with fuzzy matching
222+
for term in regular_terms {
110223
let title_fuzzy =
111224
FuzzyTermQuery::new(Term::from_field_text(fields.title, term), distance, true);
112225
let content_fuzzy = FuzzyTermQuery::new(
@@ -157,20 +270,81 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
157270
]));
158271
}
159272

273+
// Apply facet filter
274+
if let Some(ref facet_path) = request.filters.facet {
275+
if let Ok(facet) = Facet::from_text(facet_path) {
276+
let facet_term = Term::from_facet(fields.facets, &facet);
277+
let facet_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
278+
combined_query = Box::new(BooleanQuery::new(vec![
279+
(Occur::Must, combined_query),
280+
(Occur::Must, Box::new(facet_query)),
281+
]));
282+
}
283+
}
284+
160285
// Use tuple collector to get both top docs and total count
161286
let (top_docs, count) = searcher.search(
162287
&combined_query,
163288
&(TopDocs::with_limit(request.limit), Count),
164289
)?;
165290

291+
let generate_snippets = request.options.snippets.unwrap_or(false);
292+
let snippet_max_chars = request.options.snippet_max_chars.unwrap_or(150);
293+
294+
let (title_snippet_gen, content_snippet_gen) = if generate_snippets {
295+
let mut title_gen =
296+
SnippetGenerator::create(&searcher, &*combined_query, fields.title)?;
297+
title_gen.set_max_num_chars(snippet_max_chars);
298+
299+
let mut content_gen =
300+
SnippetGenerator::create(&searcher, &*combined_query, fields.content)?;
301+
content_gen.set_max_num_chars(snippet_max_chars);
302+
303+
(Some(title_gen), Some(content_gen))
304+
} else {
305+
(None, None)
306+
};
307+
166308
let mut hits = Vec::new();
167309
for (score, doc_address) in top_docs {
168310
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
169311

170312
if let Some(search_doc) = extract_search_document(schema, &fields, &retrieved_doc) {
313+
let title_snippet = title_snippet_gen.as_ref().map(|generator| {
314+
let snippet = generator.snippet_from_doc(&retrieved_doc);
315+
Snippet {
316+
fragment: snippet.fragment().to_string(),
317+
highlights: snippet
318+
.highlighted()
319+
.iter()
320+
.map(|range| HighlightRange {
321+
start: range.start,
322+
end: range.end,
323+
})
324+
.collect(),
325+
}
326+
});
327+
328+
let content_snippet = content_snippet_gen.as_ref().map(|generator| {
329+
let snippet = generator.snippet_from_doc(&retrieved_doc);
330+
Snippet {
331+
fragment: snippet.fragment().to_string(),
332+
highlights: snippet
333+
.highlighted()
334+
.iter()
335+
.map(|range| HighlightRange {
336+
start: range.start,
337+
end: range.end,
338+
})
339+
.collect(),
340+
}
341+
});
342+
171343
hits.push(SearchHit {
172344
score,
173345
document: search_doc,
346+
title_snippet,
347+
content_snippet,
174348
});
175349
}
176350
}
@@ -232,6 +406,12 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
232406
doc.add_text(fields.content, &document.content);
233407
doc.add_i64(fields.created_at, document.created_at);
234408

409+
for facet_path in &document.facets {
410+
if let Ok(facet) = Facet::from_text(facet_path) {
411+
doc.add_facet(fields.facets, facet);
412+
}
413+
}
414+
235415
writer.add_document(doc)?;
236416
writer.commit()?;
237417

@@ -273,6 +453,12 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
273453
doc.add_text(fields.content, &document.content);
274454
doc.add_i64(fields.created_at, document.created_at);
275455

456+
for facet_path in &document.facets {
457+
if let Ok(facet) = Facet::from_text(facet_path) {
458+
doc.add_facet(fields.facets, facet);
459+
}
460+
}
461+
276462
writer.add_document(doc)?;
277463
writer.commit()?;
278464

plugins/tantivy/src/lib.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,28 @@ pub struct SearchDocument {
2727
pub title: String,
2828
pub content: String,
2929
pub created_at: i64,
30+
#[serde(default)]
31+
pub facets: Vec<String>,
32+
}
33+
34+
/// A text excerpt produced for a search hit, together with the ranges inside
/// it that should be highlighted.
#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
35+
pub struct Snippet {
36+
/// The excerpt text, copied from the snippet generator's `fragment()`.
pub fragment: String,
37+
/// Highlighted ranges within `fragment`, copied from the generator's `highlighted()` ranges.
pub highlights: Vec<HighlightRange>,
38+
}
39+
40+
/// A half-open `start..end` range marking a highlighted span of a snippet fragment.
///
/// NOTE(review): the values are copied verbatim from tantivy's highlighted
/// ranges, which are presumably byte offsets (not char or UTF-16 offsets) —
/// confirm before slicing the fragment on the JS side.
#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
41+
pub struct HighlightRange {
42+
/// Start of the highlighted span (inclusive).
pub start: usize,
43+
/// End of the highlighted span (exclusive).
pub end: usize,
3044
}
3145

3246
/// A single search result: the matched document, its score, and optional snippets.
#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
3347
pub struct SearchHit {
3448
/// Score reported by the searcher for this document.
pub score: f32,
3549
/// The stored document that matched.
pub document: SearchDocument,
50+
/// Snippet for the title field; `None` unless snippet generation was requested.
pub title_snippet: Option<Snippet>,
51+
/// Snippet for the content field; `None` unless snippet generation was requested.
pub content_snippet: Option<Snippet>,
3652
}
3753

3854
#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
@@ -54,12 +70,16 @@ pub struct CreatedAtFilter {
5470
pub struct SearchFilters {
5571
pub created_at: Option<CreatedAtFilter>,
5672
pub doc_type: Option<String>,
73+
pub facet: Option<String>,
5774
}
5875

5976
/// Optional tuning knobs for a search request. Every field may be omitted;
/// the search code substitutes the defaults noted below.
#[derive(Debug, Clone, Default, Serialize, Deserialize, specta::Type)]
6077
pub struct SearchOptions {
6178
/// Enables the fuzzy query path. Treated as `false` when absent.
pub fuzzy: Option<bool>,
6279
/// Edit distance passed to the fuzzy term queries. Treated as `1` when absent.
pub distance: Option<u8>,
80+
/// Whether to generate title/content snippets for each hit. Treated as `false` when absent.
pub snippets: Option<bool>,
81+
/// Maximum snippet length in characters. Treated as `150` when absent.
pub snippet_max_chars: Option<usize>,
82+
/// Slop applied to phrase queries built from quoted multi-word phrases. Treated as `0` when absent.
pub phrase_slop: Option<u32>,
6383
}
6484

6585
fn default_limit() -> usize {

plugins/tantivy/src/schema.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
use tantivy::TantivyDocument;
2-
use tantivy::schema::{FAST, Field, STORED, STRING, Schema, TextFieldIndexing, TextOptions, Value};
2+
use tantivy::schema::{
3+
FAST, FacetOptions, Field, STORED, STRING, Schema, TextFieldIndexing, TextOptions, Value,
4+
};
35

46
use crate::SearchDocument;
57

@@ -10,6 +12,7 @@ pub struct SchemaFields {
1012
pub title: Field,
1113
pub content: Field,
1214
pub created_at: Field,
15+
pub facets: Field,
1316
}
1417

1518
pub fn build_schema() -> Schema {
@@ -28,6 +31,7 @@ pub fn build_schema() -> Schema {
2831
schema_builder.add_text_field("title", text_options.clone());
2932
schema_builder.add_text_field("content", text_options);
3033
schema_builder.add_i64_field("created_at", FAST | STORED);
34+
schema_builder.add_facet_field("facets", FacetOptions::default());
3135
schema_builder.build()
3236
}
3337

@@ -39,6 +43,7 @@ pub fn get_fields(schema: &Schema) -> SchemaFields {
3943
title: schema.get_field("title").unwrap(),
4044
content: schema.get_field("content").unwrap(),
4145
created_at: schema.get_field("created_at").unwrap(),
46+
facets: schema.get_field("facets").unwrap(),
4247
}
4348
}
4449

@@ -57,13 +62,19 @@ pub fn extract_search_document(
5762
let content = doc.get_first(fields.content)?.as_str()?.to_string();
5863
let created_at = doc.get_first(fields.created_at)?.as_i64()?;
5964

65+
let facets: Vec<String> = doc
66+
.get_all(fields.facets)
67+
.filter_map(|v| v.as_facet().map(|f| f.to_string()))
68+
.collect();
69+
6070
Some(SearchDocument {
6171
id,
6272
doc_type,
6373
language,
6474
title,
6575
content,
6676
created_at,
77+
facets,
6778
})
6879
}
6980

0 commit comments

Comments
 (0)