Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions plugins/tantivy/js/bindings.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,14 @@ async removeDocument(id: string, collection: string | null) : Promise<Result<nul
/** user-defined types **/

export type CreatedAtFilter = { gte: number | null; lte: number | null; gt: number | null; lt: number | null; eq: number | null }
export type SearchDocument = { id: string; doc_type: string; language: string | null; title: string; content: string; created_at: number }
export type SearchFilters = { created_at: CreatedAtFilter | null; doc_type: string | null }
export type SearchHit = { score: number; document: SearchDocument }
export type SearchOptions = { fuzzy: boolean | null; distance: number | null }
export type HighlightRange = { start: number; end: number }
export type SearchDocument = { id: string; doc_type: string; language: string | null; title: string; content: string; created_at: number; facets?: string[] }
export type SearchFilters = { created_at: CreatedAtFilter | null; doc_type: string | null; facet: string | null }
export type SearchHit = { score: number; document: SearchDocument; title_snippet: Snippet | null; content_snippet: Snippet | null }
export type SearchOptions = { fuzzy: boolean | null; distance: number | null; snippets: boolean | null; snippet_max_chars: number | null; phrase_slop: number | null }
export type SearchRequest = { query: string; collection?: string | null; filters?: SearchFilters; limit?: number; options?: SearchOptions }
export type SearchResult = { hits: SearchHit[]; count: number }
export type Snippet = { fragment: string; highlights: HighlightRange[] }

/** tauri-specta globals **/

Expand Down
202 changes: 194 additions & 8 deletions plugins/tantivy/src/ext.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,74 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{
BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery,
BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, QueryParser, TermQuery,
};
use tantivy::schema::IndexRecordOption;
use tantivy::schema::{Facet, IndexRecordOption};
use tantivy::snippet::SnippetGenerator;
use tantivy::{Index, ReloadPolicy, TantivyDocument, Term};
use tauri_plugin_path2::Path2PluginExt;

use crate::query::build_created_at_range_query;
use crate::schema::{extract_search_document, get_fields};
use crate::tokenizer::register_tokenizers;
use crate::{
CollectionConfig, CollectionIndex, IndexState, SearchDocument, SearchHit, SearchRequest,
SearchResult,
CollectionConfig, CollectionIndex, HighlightRange, IndexState, SearchDocument, SearchHit,
SearchRequest, SearchResult, Snippet,
};

pub fn detect_language(text: &str) -> hypr_language::Language {
hypr_language::detect(text)
}

/// Splits a raw search query into quoted phrases and bare terms.
///
/// Text between double quotes is collected (trimmed) into `phrases`; everything
/// outside quotes is whitespace-split into `regular_terms`. An unterminated
/// trailing quote is treated as if it were closed at end of input.
///
/// Returns `(phrases, regular_terms)`, both borrowing from `query`.
///
/// NOTE: the previous implementation indexed the string with *char* positions
/// (`query.chars()` + counter) but sliced with them as *byte* offsets, which
/// produced wrong slices or panicked on non-ASCII queries. `char_indices()`
/// yields byte offsets, so slicing is always on a valid char boundary.
fn parse_query_parts(query: &str) -> (Vec<&str>, Vec<&str>) {
    let mut phrases = Vec::new();
    let mut regular_terms: Vec<&str> = Vec::new();
    let mut in_quote = false;
    // Byte offset just past the opening quote of the phrase being read.
    let mut quote_start = 0;
    // Byte offset where the current run of unquoted text begins.
    let mut current_start = 0;

    for (i, ch) in query.char_indices() {
        if ch != '"' {
            continue;
        }
        if in_quote {
            // Closing quote: capture the phrase body (ignore whitespace-only).
            let phrase = query[quote_start..i].trim();
            if !phrase.is_empty() {
                phrases.push(phrase);
            }
            in_quote = false;
            current_start = i + 1; // '"' is ASCII, so 1 byte
        } else {
            // Opening quote: flush the unquoted text that preceded it.
            // `split_whitespace` never yields empty strings.
            regular_terms.extend(query[current_start..i].split_whitespace());
            in_quote = true;
            quote_start = i + 1;
        }
    }

    if in_quote {
        // Unterminated quote: treat the remainder as a phrase.
        let phrase = query[quote_start..].trim();
        if !phrase.is_empty() {
            phrases.push(phrase);
        }
    } else {
        regular_terms.extend(query[current_start..].split_whitespace());
    }

    (phrases, regular_terms)
}

pub struct Tantivy<'a, R: tauri::Runtime, M: tauri::Manager<R>> {
manager: &'a M,
_runtime: std::marker::PhantomData<fn() -> R>,
Expand Down Expand Up @@ -95,18 +146,80 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
let searcher = reader.searcher();

let use_fuzzy = request.options.fuzzy.unwrap_or(false);
let phrase_slop = request.options.phrase_slop.unwrap_or(0);

// Title boost factor (3x) to match Orama's title:3, content:1 behavior
const TITLE_BOOST: f32 = 3.0;

let mut combined_query: Box<dyn Query> = if use_fuzzy {
let distance = request.options.distance.unwrap_or(1);
let terms: Vec<&str> = request.query.split_whitespace().collect();

// Parse query to extract phrases (quoted) and regular terms
let (phrases, regular_terms) = parse_query_parts(&request.query);

let mut term_queries: Vec<(Occur, Box<dyn Query>)> = Vec::new();

// For each term, create a Must clause that requires the term to match
// in either title OR content (with title boosted)
for term in terms {
// Handle quoted phrases with PhraseQuery
for phrase in phrases {
let words: Vec<&str> = phrase.split_whitespace().collect();
if words.len() > 1 {
// Create phrase query for title field
let title_terms: Vec<Term> = words
.iter()
.map(|w| Term::from_field_text(fields.title, w))
.collect();
let mut title_phrase = PhraseQuery::new(title_terms);
title_phrase.set_slop(phrase_slop);

// Create phrase query for content field
let content_terms: Vec<Term> = words
.iter()
.map(|w| Term::from_field_text(fields.content, w))
.collect();
let mut content_phrase = PhraseQuery::new(content_terms);
content_phrase.set_slop(phrase_slop);

// Boost title matches by 3x
let boosted_title: Box<dyn Query> =
Box::new(BoostQuery::new(Box::new(title_phrase), TITLE_BOOST));
let content_query: Box<dyn Query> = Box::new(content_phrase);

// Phrase must match in at least one field (title OR content)
let phrase_field_query = BooleanQuery::new(vec![
(Occur::Should, boosted_title),
(Occur::Should, content_query),
]);

term_queries.push((Occur::Must, Box::new(phrase_field_query)));
} else if !words.is_empty() {
// Single word "phrase" - treat as regular term
let word = words[0];
let title_fuzzy = FuzzyTermQuery::new(
Term::from_field_text(fields.title, word),
distance,
true,
);
let content_fuzzy = FuzzyTermQuery::new(
Term::from_field_text(fields.content, word),
distance,
true,
);

let boosted_title: Box<dyn Query> =
Box::new(BoostQuery::new(Box::new(title_fuzzy), TITLE_BOOST));
let content_query: Box<dyn Query> = Box::new(content_fuzzy);

let term_field_query = BooleanQuery::new(vec![
(Occur::Should, boosted_title),
(Occur::Should, content_query),
]);

term_queries.push((Occur::Must, Box::new(term_field_query)));
}
}

// Handle regular (unquoted) terms with fuzzy matching
for term in regular_terms {
let title_fuzzy =
FuzzyTermQuery::new(Term::from_field_text(fields.title, term), distance, true);
let content_fuzzy = FuzzyTermQuery::new(
Expand Down Expand Up @@ -157,20 +270,81 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
]));
}

// Apply facet filter
if let Some(ref facet_path) = request.filters.facet {
if let Ok(facet) = Facet::from_text(facet_path) {
let facet_term = Term::from_facet(fields.facets, &facet);
let facet_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
combined_query = Box::new(BooleanQuery::new(vec![
(Occur::Must, combined_query),
(Occur::Must, Box::new(facet_query)),
]));
}
}

// Use tuple collector to get both top docs and total count
let (top_docs, count) = searcher.search(
&combined_query,
&(TopDocs::with_limit(request.limit), Count),
)?;

let generate_snippets = request.options.snippets.unwrap_or(false);
let snippet_max_chars = request.options.snippet_max_chars.unwrap_or(150);

let (title_snippet_gen, content_snippet_gen) = if generate_snippets {
let mut title_gen =
SnippetGenerator::create(&searcher, &*combined_query, fields.title)?;
title_gen.set_max_num_chars(snippet_max_chars);

let mut content_gen =
SnippetGenerator::create(&searcher, &*combined_query, fields.content)?;
content_gen.set_max_num_chars(snippet_max_chars);

(Some(title_gen), Some(content_gen))
} else {
(None, None)
};

let mut hits = Vec::new();
for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;

if let Some(search_doc) = extract_search_document(schema, &fields, &retrieved_doc) {
let title_snippet = title_snippet_gen.as_ref().map(|generator| {
let snippet = generator.snippet_from_doc(&retrieved_doc);
Snippet {
fragment: snippet.fragment().to_string(),
highlights: snippet
.highlighted()
.iter()
.map(|range| HighlightRange {
start: range.start,
end: range.end,
})
.collect(),
}
});

let content_snippet = content_snippet_gen.as_ref().map(|generator| {
let snippet = generator.snippet_from_doc(&retrieved_doc);
Snippet {
fragment: snippet.fragment().to_string(),
highlights: snippet
.highlighted()
.iter()
.map(|range| HighlightRange {
start: range.start,
end: range.end,
})
.collect(),
}
});

hits.push(SearchHit {
score,
document: search_doc,
title_snippet,
content_snippet,
});
}
}
Expand Down Expand Up @@ -232,6 +406,12 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
doc.add_text(fields.content, &document.content);
doc.add_i64(fields.created_at, document.created_at);

for facet_path in &document.facets {
if let Ok(facet) = Facet::from_text(facet_path) {
doc.add_facet(fields.facets, facet);
}
}

writer.add_document(doc)?;
writer.commit()?;

Expand Down Expand Up @@ -273,6 +453,12 @@ impl<'a, R: tauri::Runtime, M: tauri::Manager<R>> Tantivy<'a, R, M> {
doc.add_text(fields.content, &document.content);
doc.add_i64(fields.created_at, document.created_at);

for facet_path in &document.facets {
if let Ok(facet) = Facet::from_text(facet_path) {
doc.add_facet(fields.facets, facet);
}
}

writer.add_document(doc)?;
writer.commit()?;

Expand Down
20 changes: 20 additions & 0 deletions plugins/tantivy/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,28 @@ pub struct SearchDocument {
pub title: String,
pub content: String,
pub created_at: i64,
#[serde(default)]
pub facets: Vec<String>,
}

/// A highlighted excerpt of a matched field, returned alongside a search hit.
#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
pub struct Snippet {
    /// The excerpt text extracted from the field.
    pub fragment: String,
    /// Ranges within `fragment` marking the query matches to highlight.
    /// NOTE(review): offsets come from tantivy's snippet API — presumably byte
    /// offsets into `fragment`; confirm before using as char indices in JS.
    pub highlights: Vec<HighlightRange>,
}

/// A span to highlight inside a snippet fragment.
/// Built from a `Range` (`range.start`/`range.end`), so it is presumably
/// half-open: `[start, end)` — TODO confirm against tantivy's snippet docs.
#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
pub struct HighlightRange {
    /// Start offset of the highlighted span within the fragment.
    pub start: usize,
    /// End offset (exclusive) of the highlighted span within the fragment.
    pub end: usize,
}

/// A single search result: the matched document plus relevance metadata.
#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
pub struct SearchHit {
    /// Relevance score from tantivy (title matches are boosted 3x at query time).
    pub score: f32,
    /// The document reconstructed from the index's stored fields.
    pub document: SearchDocument,
    /// Highlighted title excerpt; `None` unless `SearchOptions.snippets` was enabled.
    pub title_snippet: Option<Snippet>,
    /// Highlighted content excerpt; `None` unless `SearchOptions.snippets` was enabled.
    pub content_snippet: Option<Snippet>,
}

#[derive(Debug, Clone, Serialize, Deserialize, specta::Type)]
Expand All @@ -54,12 +70,16 @@ pub struct CreatedAtFilter {
/// Optional constraints applied on top of the text query; `None` fields are ignored.
pub struct SearchFilters {
    /// Restrict results to documents whose `created_at` satisfies this range filter.
    pub created_at: Option<CreatedAtFilter>,
    /// Restrict results to documents of exactly this type.
    pub doc_type: Option<String>,
    /// Restrict results to documents tagged with this facet path
    /// (parsed via `Facet::from_text`; an unparseable path is silently ignored).
    pub facet: Option<String>,
}

/// Tuning knobs for query execution; every field falls back to a default when unset.
#[derive(Debug, Clone, Default, Serialize, Deserialize, specta::Type)]
pub struct SearchOptions {
    /// Enable fuzzy term matching (default: false).
    pub fuzzy: Option<bool>,
    /// Maximum edit distance for fuzzy matching (default: 1).
    pub distance: Option<u8>,
    /// Generate highlighted snippets for each hit (default: false).
    pub snippets: Option<bool>,
    /// Maximum characters per generated snippet (default: 150).
    pub snippet_max_chars: Option<usize>,
    /// Allowed word-position slop for quoted phrase queries (default: 0 = exact).
    pub phrase_slop: Option<u32>,
}

fn default_limit() -> usize {
Expand Down
13 changes: 12 additions & 1 deletion plugins/tantivy/src/schema.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use tantivy::TantivyDocument;
use tantivy::schema::{FAST, Field, STORED, STRING, Schema, TextFieldIndexing, TextOptions, Value};
use tantivy::schema::{
FAST, FacetOptions, Field, STORED, STRING, Schema, TextFieldIndexing, TextOptions, Value,
};

use crate::SearchDocument;

Expand All @@ -10,6 +12,7 @@ pub struct SchemaFields {
pub title: Field,
pub content: Field,
pub created_at: Field,
pub facets: Field,
}

pub fn build_schema() -> Schema {
Expand All @@ -28,6 +31,7 @@ pub fn build_schema() -> Schema {
schema_builder.add_text_field("title", text_options.clone());
schema_builder.add_text_field("content", text_options);
schema_builder.add_i64_field("created_at", FAST | STORED);
schema_builder.add_facet_field("facets", FacetOptions::default());
schema_builder.build()
}

Expand All @@ -39,6 +43,7 @@ pub fn get_fields(schema: &Schema) -> SchemaFields {
title: schema.get_field("title").unwrap(),
content: schema.get_field("content").unwrap(),
created_at: schema.get_field("created_at").unwrap(),
facets: schema.get_field("facets").unwrap(),
}
}

Expand All @@ -57,13 +62,19 @@ pub fn extract_search_document(
let content = doc.get_first(fields.content)?.as_str()?.to_string();
let created_at = doc.get_first(fields.created_at)?.as_i64()?;

let facets: Vec<String> = doc
.get_all(fields.facets)
.filter_map(|v| v.as_facet().map(|f| f.to_string()))
.collect();

Some(SearchDocument {
id,
doc_type,
language,
title,
content,
created_at,
facets,
})
}

Expand Down
Loading