Skip to content

Commit 6815e5c

Browse files
frascuchon, pre-commit-ci[bot], damianpumar
authored
[ENHANCEMENT]: argilla-server: Enhance text search with simple query dsl (#5222)
# Description <!-- Please include a summary of the changes and the related issue. Please also include relevant motivation and context. List any dependencies that are required for this change. --> This PR adds a small query DSL to improve text search, based on the Elasticsearch [simple query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax) **Type of change** <!-- Please delete options that are not relevant. Remember to title the PR according to the type of change --> - Improvement (change adding some improvement to an existing functionality) - Documentation update **How Has This Been Tested** <!-- Please add some reference about how your feature has been tested. --> **Checklist** <!-- Please go over the list and make sure you've taken everything into account --> - I added relevant documentation - I followed the style guidelines of this project - I did a self-review of my code - I made corresponding changes to the documentation - I confirm my changes generate no new warnings - I have added tests that prove my fix is effective or that my feature works - I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Damián Pumar <[email protected]>
1 parent 60f3073 commit 6815e5c

File tree

3 files changed

+39
-13
lines changed

3 files changed

+39
-13
lines changed

argilla-frontend/components/features/annotation/container/fields/useSearchTextHighlight.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,25 @@ declare namespace CSS {
1010
};
1111
}
1212

13+
const DSLChars = ["|", "+", "-", "*"];
14+
1315
export const useSearchTextHighlight = (fieldId: string) => {
1416
const FIELD_ID_TO_HIGHLIGHT = `fields-content-${fieldId}`;
1517
const HIGHLIGHT_CLASS = `search-text-highlight-${fieldId}`;
1618

19+
const scapeDSLChars = (value: string) => {
20+
let output = value;
21+
22+
for (const char of DSLChars) {
23+
output = output.replaceAll(char, " ");
24+
}
25+
26+
return output
27+
.split(" ")
28+
.map((w) => w.trim())
29+
.filter(Boolean);
30+
};
31+
1732
const createRangesToHighlight = (
1833
fieldComponent: HTMLElement,
1934
searchText: string
@@ -89,7 +104,7 @@ export const useSearchTextHighlight = (fieldId: string) => {
89104
};
90105

91106
const textNodes = getTextNodesUnder(fieldComponent);
92-
const words = searchText.split(" ");
107+
const words = scapeDSLChars(searchText);
93108

94109
for (const textNode of textNodes) {
95110
for (const word of words) {

argilla-server/src/argilla_server/search_engine/commons.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,21 @@ def es_ids_query(ids: List[str]) -> dict:
106106
return {"ids": {"values": ids}}
107107

108108

109+
def es_simple_query_string(field_name: str, query: str) -> dict:
110+
# See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html
111+
return {
112+
"simple_query_string": {
113+
"query": query,
114+
"fields": [field_name],
115+
"default_operator": "AND",
116+
"analyze_wildcard": False,
117+
"auto_generate_synonyms_phrase_query": False,
118+
"fuzzy_max_expansions": 10,
119+
"fuzzy_transpositions": False,
120+
}
121+
}
122+
123+
109124
def es_nested_query(path: str, query: dict) -> dict:
110125
return {
111126
"nested": {
@@ -138,7 +153,7 @@ def es_field_for_metadata_property(metadata_property: Union[str, MetadataPropert
138153

139154

140155
def es_field_for_record_field(field_name: str) -> str:
141-
return f"fields.{field_name}"
156+
return f"fields.{field_name or '*'}"
142157

143158

144159
def es_field_for_response_property(property: str) -> str:
@@ -612,17 +627,7 @@ def _build_text_query(dataset: Dataset, text: Optional[Union[TextQuery, str]] =
612627
if isinstance(text, str):
613628
text = TextQuery(q=text)
614629

615-
if not text.field:
616-
# See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html
617-
field_names = [
618-
es_field_for_record_field(field.name)
619-
for field in dataset.fields
620-
if field.settings.get("type") == FieldType.text
621-
]
622-
return {"multi_match": {"query": text.q, "type": "cross_fields", "fields": field_names, "operator": "and"}}
623-
else:
624-
# See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
625-
return {"match": {es_field_for_record_field(text.field): {"query": text.q, "operator": "and"}}}
630+
return es_simple_query_string(es_field_for_record_field(text.field), query=text.q)
626631

627632
@staticmethod
628633
def _mapping_for_fields(fields: List[Field]) -> dict:

argilla-server/tests/unit/search_engine/test_commons.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,9 @@ async def test_create_index_for_dataset_with_questions(
548548
("00000", 1),
549549
("card payment", 5),
550550
("nothing", 0),
551+
("cash | negative", 6), # OR
552+
("cash + negative", 1), # AND
553+
("-(cash | negative)", 3), # NOT
551554
(TextQuery(q="card"), 5),
552555
(TextQuery(q="account"), 1),
553556
(TextQuery(q="payment"), 6),
@@ -558,6 +561,9 @@ async def test_create_index_for_dataset_with_questions(
558561
(TextQuery(q="negative", field="label"), 4),
559562
(TextQuery(q="00000", field="textId"), 1),
560563
(TextQuery(q="card payment", field="text"), 5),
564+
(TextQuery(q="cash | negative", field="text"), 3),
565+
(TextQuery(q="cash + negative", field="text"), 0),
566+
(TextQuery(q="-(cash | negative)", field="text"), 6),
561567
],
562568
)
563569
async def test_search_with_query_string(

0 commit comments

Comments
 (0)