Skip to content

Commit 06f0ef0

Browse files
congx4 and claude authored
feat(es-compat): support regexp shorthand, expose concatenate fields, map text to keyword in _mapping (quickwit-oss#6208)
* feat(es-compat): support regexp shorthand format, expose concatenate fields, and map text to keyword in _mapping

  Elasticsearch's `regexp` query accepts two formats:
  - Shorthand: `{"regexp": {"field": "pattern"}}`
  - Full: `{"regexp": {"field": {"value": "pattern", "case_insensitive": true}}}`

  Quickwit only supported the full form, causing queries from ES-compatible connectors (e.g. Trino ES connector) to fail with a deserialization error. This adds support for the shorthand format via `#[serde(untagged)]` enum deserialization.

  Additionally, in the `_mapping` endpoint:
  - `Text` fields are now reported as `keyword` type. This enables filter pushdown (e.g. `LIKE` predicates) from connectors that only push down filters for `keyword`-typed fields.
  - `Concatenate` fields are now exposed as `keyword` type instead of being hidden. This allows connectors to discover and query these fields.

  Made-with: Cursor

* fix(es-compat): replace manual Default impl with derive to fix clippy lint

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* address PR review: inline custom Deserialize for RegexQueryParams, add keyword comment

  - Replace inner enum + serde(from) with a custom Deserialize impl directly on RegexQueryParams, as suggested by reviewer
  - Add comment explaining why text fields are mapped to keyword in the ES-compat _mapping response

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* refactor: simplify RegexQueryParams to a plain untagged enum

  Replace the custom Deserialize visitor with a simple #[serde(untagged)] enum that handles both shorthand and full regexp query formats directly.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2bcade6 commit 06f0ef0

File tree

2 files changed

+79
-17
lines changed

2 files changed

+79
-17
lines changed

quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,43 @@ use crate::elastic_query_dsl::ConvertibleToQueryAst;
1818
use crate::elastic_query_dsl::one_field_map::OneFieldMap;
1919
use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery};
2020

21-
#[derive(Deserialize, Debug, Default, Eq, PartialEq, Clone)]
22-
#[serde(deny_unknown_fields)]
23-
pub struct RegexQueryParams {
24-
value: String,
25-
#[serde(default)]
26-
case_insensitive: bool,
21+
/// Elasticsearch supports two formats for regexp queries:
22+
/// - Shorthand: `{"regexp": {"field": "pattern"}}`
23+
/// - Full: `{"regexp": {"field": {"value": "pattern", "case_insensitive": true}}}`
24+
#[derive(Deserialize, Debug, Eq, PartialEq, Clone)]
25+
#[serde(untagged)]
26+
pub enum RegexQueryParams {
27+
Full {
28+
#[serde(rename = "value")]
29+
pattern: String,
30+
#[serde(default)]
31+
case_insensitive: bool,
32+
},
33+
Shorthand(String),
34+
}
35+
36+
impl RegexQueryParams {
37+
fn into_tuple(self) -> (String, bool) {
38+
match self {
39+
RegexQueryParams::Full {
40+
pattern,
41+
case_insensitive,
42+
} => (pattern, case_insensitive),
43+
RegexQueryParams::Shorthand(pattern) => (pattern, false),
44+
}
45+
}
2746
}
2847

2948
pub type RegexQuery = OneFieldMap<RegexQueryParams>;
3049

3150
impl ConvertibleToQueryAst for RegexQuery {
3251
fn convert_to_query_ast(self) -> anyhow::Result<QueryAst> {
33-
let regex = if self.value.case_insensitive {
34-
format!("(?i){}", self.value.value)
52+
let (pattern, case_insensitive) = self.value.into_tuple();
53+
54+
let regex = if case_insensitive {
55+
format!("(?i){pattern}")
3556
} else {
36-
self.value.value.clone()
57+
pattern
3758
};
3859
Ok(AstRegexQuery {
3960
field: self.field,
@@ -42,3 +63,38 @@ impl ConvertibleToQueryAst for RegexQuery {
4263
.into())
4364
}
4465
}
66+
67+
#[cfg(test)]
68+
mod tests {
69+
use super::*;
70+
71+
#[test]
72+
fn test_regex_query_shorthand_format() {
73+
let json = serde_json::json!({"service": ".*logs.*"});
74+
let query: RegexQuery = serde_json::from_value(json).unwrap();
75+
assert_eq!(query.field, "service");
76+
let (pattern, case_insensitive) = query.value.into_tuple();
77+
assert_eq!(pattern, ".*logs.*");
78+
assert!(!case_insensitive);
79+
}
80+
81+
#[test]
82+
fn test_regex_query_full_format() {
83+
let json = serde_json::json!({"service": {"value": ".*logs.*", "case_insensitive": true}});
84+
let query: RegexQuery = serde_json::from_value(json).unwrap();
85+
assert_eq!(query.field, "service");
86+
let (pattern, case_insensitive) = query.value.into_tuple();
87+
assert_eq!(pattern, ".*logs.*");
88+
assert!(case_insensitive);
89+
}
90+
91+
#[test]
92+
fn test_regex_query_full_format_default_case() {
93+
let json = serde_json::json!({"service": {"value": ".*logs.*"}});
94+
let query: RegexQuery = serde_json::from_value(json).unwrap();
95+
assert_eq!(query.field, "service");
96+
let (pattern, case_insensitive) = query.value.into_tuple();
97+
assert_eq!(pattern, ".*logs.*");
98+
assert!(!case_insensitive);
99+
}
100+
}

quickwit/quickwit-serve/src/elasticsearch_api/model/mappings.rs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,11 @@ fn build_properties(field_mappings: &[FieldMappingEntry]) -> HashMap<String, Fie
9999

100100
fn field_mapping_from_entry(entry: &FieldMappingEntry) -> Option<FieldMapping> {
101101
match &entry.mapping_type {
102-
FieldMappingType::Text(..) => Some(FieldMapping::Leaf { typ: "text" }),
102+
// Quickwit text fields behave like ES keyword fields: they support exact
103+
// match, prefix, and regexp queries. Reporting them as "keyword" enables
104+
// downstream connectors (e.g. Trino ES connector) to push down filters and
105+
// LIKE predicates, which they only do for keyword-typed fields.
106+
FieldMappingType::Text(..) => Some(FieldMapping::Leaf { typ: "keyword" }),
103107
FieldMappingType::I64(..) => Some(FieldMapping::Leaf { typ: "long" }),
104108
FieldMappingType::U64(..) => Some(FieldMapping::Leaf { typ: "long" }),
105109
FieldMappingType::F64(..) => Some(FieldMapping::Leaf { typ: "double" }),
@@ -115,7 +119,7 @@ fn field_mapping_from_entry(entry: &FieldMappingEntry) -> Option<FieldMapping> {
115119
properties,
116120
})
117121
}
118-
FieldMappingType::Concatenate(_) => None,
122+
FieldMappingType::Concatenate(_) => Some(FieldMapping::Leaf { typ: "keyword" }),
119123
}
120124
}
121125

@@ -178,7 +182,7 @@ mod tests {
178182
let entry: FieldMappingEntry = serde_json::from_value(entry_json).unwrap();
179183
let mapping = field_mapping_from_entry(&entry).unwrap();
180184
let serialized = serde_json::to_value(&mapping).unwrap();
181-
assert_eq!(serialized, json!({ "type": "text" }));
185+
assert_eq!(serialized, json!({ "type": "keyword" }));
182186
}
183187

184188
#[test]
@@ -209,21 +213,23 @@ mod tests {
209213
"type": "object",
210214
"properties": {
211215
"id": { "type": "long" },
212-
"label": { "type": "text" }
216+
"label": { "type": "keyword" }
213217
}
214218
})
215219
);
216220
}
217221

218222
#[test]
219-
fn test_field_mapping_from_entry_concatenate_skipped() {
223+
fn test_field_mapping_from_entry_concatenate_exposed_as_keyword() {
220224
let entry_json = json!({
221225
"name": "concat_field",
222226
"type": "concatenate",
223227
"concatenate_fields": ["field_a", "field_b"]
224228
});
225229
let entry: FieldMappingEntry = serde_json::from_value(entry_json).unwrap();
226-
assert!(field_mapping_from_entry(&entry).is_none());
230+
let mapping = field_mapping_from_entry(&entry).unwrap();
231+
let serialized = serde_json::to_value(&mapping).unwrap();
232+
assert_eq!(serialized, json!({ "type": "keyword" }));
227233
}
228234

229235
#[test]
@@ -251,7 +257,7 @@ mod tests {
251257
let props = build_properties(&entries);
252258
let to_json = |fm: &FieldMapping| serde_json::to_value(fm).unwrap();
253259

254-
assert_eq!(to_json(&props["title"]), json!({ "type": "text" }));
260+
assert_eq!(to_json(&props["title"]), json!({ "type": "keyword" }));
255261
assert_eq!(to_json(&props["count"]), json!({ "type": "long" }));
256262
assert_eq!(to_json(&props["unsigned"]), json!({ "type": "long" }));
257263
assert_eq!(to_json(&props["score"]), json!({ "type": "double" }));
@@ -263,7 +269,7 @@ mod tests {
263269

264270
let meta = to_json(&props["metadata"]);
265271
assert_eq!(meta["type"], "object");
266-
assert_eq!(meta["properties"]["source"]["type"], "text");
272+
assert_eq!(meta["properties"]["source"]["type"], "keyword");
267273
}
268274

269275
#[test]

0 commit comments

Comments
 (0)