Skip to content

Commit a92459e

Browse files
committed
✨ Store translations separately for highlighting
1 parent 81ad6a5 commit a92459e

File tree

11 files changed

+286
-16
lines changed

11 files changed

+286
-16
lines changed

openaleph_search/index/indexes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,10 @@ def make_schema_bucket_mapping(bucket: Bucket) -> dict[str, Any]:
114114
if bucket == "pages":
115115
# store full text for highlighting
116116
mapping["properties"][Field.CONTENT]["store"] = True
117+
mapping["properties"][Field.TRANSLATION]["store"] = True
117118
else:
118119
mapping["properties"][Field.CONTENT]["store"] = False
120+
mapping["properties"][Field.TRANSLATION]["store"] = False
119121
return mapping
120122

121123

openaleph_search/index/mapping.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
MappingProperty: TypeAlias = dict[str, list[str] | str]
1717
Mapping: TypeAlias = dict[str, MappingProperty]
1818

19+
PROP_TRANSLATED = "translatedText"
20+
1921
# MAPPING SHORTCUTS #
2022
DEFAULT_ANALYZER = "default"
2123
DEFAULT_NORMALIZER = "default"
@@ -95,6 +97,7 @@ class Field:
9597
CONTENT = "content"
9698
TEXT = "text"
9799
TAGS = "tags"
100+
TRANSLATION = "translation"
98101

99102
NUMERIC = "numeric"
100103
PROPERTIES = "properties"
@@ -184,6 +187,7 @@ class FieldType:
184187
*GROUPS,
185188
Field.TEXT,
186189
Field.CONTENT,
190+
Field.TRANSLATION,
187191
Field.NAME,
188192
Field.NAME_KEYS,
189193
Field.NAME_PARTS,
@@ -217,6 +221,7 @@ class FieldType:
217221
# full text
218222
Field.CONTENT: FieldType.CONTENT,
219223
Field.TEXT: FieldType.TEXT,
224+
Field.TRANSLATION: FieldType.TEXT,
220225
# tagging
221226
Field.TAGS: FieldType.KEYWORD,
222227
# processing metadata
@@ -303,7 +308,9 @@ def make_schema_mapping(schemata: Iterable[SchemaType]) -> Mapping:
303308
if prop.stub:
304309
continue
305310
merged_props[name]["type"].add(get_index_field_type(prop.type))
306-
if prop.type == registry.text:
311+
if name == PROP_TRANSLATED:
312+
merged_props[name]["copy_to"].add(Field.TRANSLATION)
313+
elif prop.type == registry.text:
307314
merged_props[name]["copy_to"].add(Field.CONTENT)
308315
else:
309316
merged_props[name]["copy_to"].add(Field.TEXT)

openaleph_search/query/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ def get_highlight(self) -> dict[str, Any]:
375375
),
376376
Field.NAMES: get_highlighter(Field.NAME),
377377
Field.NAMES: get_highlighter(Field.NAMES),
378+
Field.TRANSLATION: get_highlighter(Field.TRANSLATION, query),
378379
}
379380
if Field.TEXT not in fields:
380381
fields[Field.TEXT] = get_highlighter(Field.TEXT, query)

openaleph_search/query/queries.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class EntitiesQuery(Query):
5656
f"{Field.NAME_PARTS}^2",
5757
Field.CONTENT,
5858
f"{Field.TEXT}^0.8",
59+
f"{Field.TRANSLATION}^0.7",
5960
]
6061
PREFIX_FIELD = Field.NAME_PARTS
6162
HIGHLIGHT_FIELD = Field.CONTENT

openaleph_search/transform/entity.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
from multiprocessing import cpu_count
99
from typing import Generator, Iterable
1010

11-
from anystore.functools import weakref_cache
1211
from anystore.logging import get_logger
1312
from anystore.util import Took
1413
from banal import ensure_list
1514
from followthemoney import EntityProxy, model, registry
1615
from followthemoney.namespace import Namespace
16+
from ftmq.util import select_data
1717

1818
from openaleph_search.index.indexer import Action, Actions
1919
from openaleph_search.index.indexes import entities_write_index, schema_bucket
@@ -53,7 +53,11 @@ def _get_symbols(entity: EntityProxy) -> set[str]:
5353
return symbols
5454

5555

56-
@weakref_cache
56+
def _get_translations(entity: EntityProxy) -> set[str]:
    """Return the distinct translation texts attached to `entity`.

    Delegates to `select_data` with the `__translation__` marker and
    collapses whatever it yields into a set, so duplicates are dropped and
    ordering is not preserved.
    """
    # NOTE(review): presumably `select_data` yields the marker-prefixed
    # `indexText` values with the marker removed -- confirm against ftmq.
    marker = "__translation__"
    return {value for value in select_data(entity, marker)}
58+
59+
60+
@functools.cache
def _get_namespace(value: str) -> Namespace:
    """Build the `Namespace` for `value`.

    Results are memoized by `functools.cache`, so each distinct `value`
    constructs its `Namespace` only once per process.
    """
    namespace = Namespace(value)
    return namespace
5963

@@ -78,6 +82,7 @@ def format_entity(dataset: str, entity: EntityProxy, **kwargs) -> Action | None:
7882
dataset = valid_dataset(dataset)
7983

8084
data = entity.to_dict()
85+
data["properties"] = data.get("properties", {})
8186
# deprecated
8287
collection_id = kwargs.get("collection_id")
8388
if collection_id is not None:
@@ -109,13 +114,21 @@ def format_entity(dataset: str, entity: EntityProxy, **kwargs) -> Action | None:
109114

110115
# Slight hack: a magic property in followthemoney that gets taken out
111116
# of the properties and added straight to the index text.
112-
properties = data.get("properties", {})
113-
text = properties.pop("indexText", [])
117+
text = data["properties"].pop("indexText", [])
118+
119+
# Another hack: Translations are prefixed with "__translation__" in
120+
# `Pages.indexText`
121+
if entity.schema.name == "Pages":
122+
translations = _get_translations(entity)
123+
if translations:
124+
data[Field.TRANSLATION] = list(translations)
125+
text = [t for t in text if not t.startswith("__translation__")]
126+
114127
if text:
115128
data[Field.CONTENT] = text
116129

117130
# length normalization
118-
data[Field.NUM_VALUES] = sum([len(v) for v in properties.values()])
131+
data[Field.NUM_VALUES] = sum([len(v) for v in data["properties"].values()])
119132

120133
# integer casting
121134
numeric = {}

tests/test_highlighting.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from ftmq.util import make_entity
2+
13
from openaleph_search.index.entities import index_bulk
24
from openaleph_search.parse.parser import SearchQueryParser
35
from openaleph_search.query.queries import EntitiesQuery
@@ -78,3 +80,92 @@ def test_highlighting_pages(fixture_pages, cleanup_after):
7880
parent_id="f61295777cf69f423855655f1614794ce22086d8.b154e50f50c8c8133168767d78bbd1dff067f308",
7981
)
8082
assert "<em>MIT license</em>" in highlight
83+
84+
85+
def test_highlighting_translation_plaintext(cleanup_after):
    """A query term that only occurs in `translatedText` of a PlainText entity
    must be highlighted under the `translation` key of the hit."""
    dataset = "test_highlight_translation"
    plaintext = make_entity(
        {
            "id": "plaintext-highlight-translation",
            "schema": "PlainText",
            "properties": {
                "fileName": ["report.txt"],
                "bodyText": ["Original text about financial regulations"],
                "translatedText": ["Übersetzter Text über Finanzvorschriften"],
            },
        }
    )
    index_bulk(dataset, [plaintext], sync=True)

    # Highlighted search, restricted to the dataset indexed above.
    parser = SearchQueryParser(
        [
            ("q", "Finanzvorschriften"),
            ("highlight", "true"),
            ("filter:dataset", dataset),
        ],
        None,
    )
    response = EntitiesQuery(parser).search()

    assert response["hits"]["total"]["value"] == 1
    first_hit = response["hits"]["hits"][0]
    assert "highlight" in first_hit
    # The matching fragments are expected under the dedicated key.
    assert "translation" in first_hit["highlight"]
    fragments = first_hit["highlight"]["translation"]
    assert any("<em>Finanzvorschriften</em>" in fragment for fragment in fragments)
119+
120+
121+
def test_highlighting_translation_pages(cleanup_after):
    """For a Pages entity, a match in a `__translation__`-prefixed `indexText`
    value is highlighted under `translation`, while a match in the regular
    text is highlighted under `content` only."""
    dataset = "test_highlight_pages_translation"
    pages = make_entity(
        {
            "id": "pages-highlight-translation",
            "schema": "Pages",
            "properties": {
                "fileName": ["bericht.pdf"],
                "indexText": [
                    "Original German text about environmental policies",
                    "__translation__ Translated text about Umweltpolitik",
                ],
            },
        }
    )
    index_bulk(dataset, [pages], sync=True)

    def single_hit(term):
        # Run a highlighted search limited to this test's dataset and return
        # the one hit that is expected to come back.
        params = [
            ("q", term),
            ("highlight", "true"),
            ("filter:dataset", dataset),
        ]
        response = EntitiesQuery(SearchQueryParser(params, None)).search()
        assert response["hits"]["total"]["value"] == 1
        return response["hits"]["hits"][0]

    # Term that only occurs in the translation.
    translated_hit = single_hit("Umweltpolitik")
    assert "highlight" in translated_hit
    assert "translation" in translated_hit["highlight"]
    assert any(
        "<em>Umweltpolitik</em>" in fragment
        for fragment in translated_hit["highlight"]["translation"]
    )

    # Term that only occurs in the non-translated text.
    original_hit = single_hit("environmental policies")
    assert "highlight" in original_hit
    assert "content" in original_hit["highlight"]
    assert "translation" not in original_hit["highlight"]

tests/test_indexer.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,59 @@ def test_bulk_indexing_mode(cleanup_after):
190190
assert idx_settings["number_of_replicas"] == str(settings.index_replicas)
191191

192192

193+
def test_translation_plaintext():
    """PlainText keeps `translatedText` inside `properties`.

    The index mapping's `copy_to` is what feeds the `translation` field for
    this schema, so the transformed payload must not carry a top-level
    `translation` key of its own.
    """
    plaintext = make_entity(
        {
            "id": "plain-text-translated",
            "schema": "PlainText",
            "properties": {
                "fileName": ["document.txt"],
                "translatedText": ["This is the translated text"],
            },
        }
    )

    action = format_entity("test_dataset", plaintext)
    assert action is not None

    payload = action["_source"]
    props = payload["properties"]
    # The property is passed through untouched ...
    assert "translatedText" in props
    assert props["translatedText"] == ["This is the translated text"]
    # ... and nothing is written to the explicit field by the transform.
    assert "translation" not in payload
215+
216+
217+
def test_translation_pages():
    """Pages: `indexText` values prefixed with `__translation__` are moved
    into the explicit `translation` field, and the remaining values end up
    in `content` without any translation entries."""
    entity = make_entity(
        {
            "id": "pages-translated",
            "schema": "Pages",
            "properties": {
                "fileName": ["document.pdf"],
                "indexText": [
                    "regular text content",
                    "__translation__ Dies ist der übersetzte Text",
                    "__translation__ Ceci est le texte traduit",
                ],
            },
        }
    )
    action = format_entity("test_dataset", entity)
    assert action is not None
    source = action["_source"]

    # Both translations are extracted, with the marker prefix removed.
    assert "translation" in source
    assert set(source["translation"]) == {
        "Dies ist der übersetzte Text",
        "Ceci est le texte traduit",
    }

    # indexText is moved to `content` ...
    assert "content" in source
    assert "regular text content" in source["content"]
    # ... and the translations are stripped out of it. Previously this was
    # only stated in a comment and never actually checked.
    assert not any(
        value.startswith("__translation__") for value in source["content"]
    )
244+
245+
193246
def test_indexer_namespace(monkeypatch):
194247
import importlib
195248

tests/test_mapping.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,12 @@ def test_mapping_spec():
167167
assert GROUP_MAPPING["emails"]["type"] == "keyword"
168168
mapping = make_schema_mapping(model.schemata.values())
169169
assert all(
170-
("text" in f["copy_to"] or "content" in f["copy_to"]) for f in mapping.values()
170+
(
171+
"text" in f["copy_to"]
172+
or "content" in f["copy_to"]
173+
or "translation" in f["copy_to"]
174+
)
175+
for f in mapping.values()
171176
)
172177
assert "names" in mapping["name"]["copy_to"]
173178
assert "dates" in mapping["birthDate"]["copy_to"]
@@ -188,6 +193,12 @@ def test_mapping_spec():
188193
"name" in person_mapping["name"]["copy_to"]
189194
), "Person name should copy to name field"
190195

196+
# translatedText should copy to translation field, not to content/text
197+
plaintext_mapping = make_schema_mapping(["PlainText"])
198+
assert "translation" in plaintext_mapping["translatedText"]["copy_to"]
199+
assert "content" not in plaintext_mapping["translatedText"]["copy_to"]
200+
assert "text" not in plaintext_mapping["translatedText"]["copy_to"]
201+
191202
full_mapping = make_mapping(mapping)
192203
assert "date" in full_mapping["properties"]["numeric"]["properties"]
193204
assert "dates" in full_mapping["properties"]["numeric"]["properties"]
@@ -197,14 +208,11 @@ def test_mapping_spec():
197208

198209

199210
def test_mapping_schema_bucket():
    """Only the `pages` bucket stores `content` and `translation` (needed for
    highlighting); every other bucket leaves both fields unstored."""
    expected_store = {
        "pages": True,
        "page": False,
        "documents": False,
        "intervals": False,
        "things": False,
    }
    for bucket, stored in expected_store.items():
        properties = make_schema_bucket_mapping(bucket)["properties"]
        assert properties["content"]["store"] is stored
        assert properties["translation"]["store"] is stored

tests/test_pages.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,50 @@ def test_pages(fixture_pages, cleanup_after):
3838
assert len(result["hits"]["hits"]) == 1
3939
assert result["hits"]["hits"][0]["_source"]["schema"] == "Pages"
4040
assert "<em>MIT</em>" in result["hits"]["hits"][0]["highlight"]["content"][0]
41+
42+
43+
def test_pages_translation(cleanup_after):
    """A Pages entity whose `indexText` carries `__translation__`-prefixed
    values must be findable by its original text and by each translation."""
    from ftmq.util import make_entity

    pages = make_entity(
        {
            "id": "pages-with-translation",
            "schema": "Pages",
            "properties": {
                "fileName": ["bericht.pdf"],
                "indexText": [
                    "Dies ist der Originalbericht auf Deutsch",
                    "__translation__ This is the translated report in English",
                    "__translation__ Ceci est le rapport traduit en français",
                ],
            },
        }
    )
    index_bulk("test_pages_translation", [pages], sync=True)

    # One search per language: the German original is expected to match via
    # the content field, the English and French texts via the translation
    # field. Each must return exactly the entity indexed above.
    search_urls = (
        "/search?q=Originalbericht&filter:dataset=test_pages_translation",
        "/search?q=translated report English&filter:dataset=test_pages_translation",
        "/search?q=rapport traduit français&filter:dataset=test_pages_translation",
    )
    for url in search_urls:
        hits = _create_query(url).search()["hits"]
        assert hits["total"]["value"] == 1
        assert hits["hits"][0]["_id"] == "pages-with-translation"

0 commit comments

Comments
 (0)