Skip to content

Commit 8b88a76

Browse files
authored
Merge pull request #2 from openaleph/feat/tagging
Entity tagging
2 parents 6e0072c + f1a3604 commit 8b88a76

File tree

6 files changed

+91
-1
lines changed

6 files changed

+91
-1
lines changed

.github/workflows/build-elastic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ jobs:
2323
images: ghcr.io/openaleph/elasticsearch
2424
tags: |
2525
type=ref,event=branch
26-
type=semver,pattern={{version}}
2726
type=sha
2827
type=raw,value=9
28+
type=raw,value=9.1.1
2929
type=raw,value=latest
3030
3131
- name: Set up Docker Buildx

openaleph_search/index/entities.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"caption",
2929
"schema",
3030
"properties",
31+
"tags",
3132
"dataset",
3233
"collection_id",
3334
"profile_id",

openaleph_search/index/mapping.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ class Field:
107107
GEO_POINT = "geo_point"
108108
CONTENT = "content"
109109
TEXT = "text"
110+
TAGS = "tags"
110111

111112
NUMERIC = "numeric"
112113
PROPERTIES = "properties"
@@ -228,6 +229,8 @@ class FieldType:
228229
# full text
229230
Field.CONTENT: FieldType.CONTENT,
230231
Field.TEXT: FieldType.TEXT,
232+
# tagging
233+
Field.TAGS: FieldType.KEYWORD,
231234
# processing metadata
232235
Field.UPDATED_AT: FieldType.DATE,
233236
Field.CREATED_AT: FieldType.DATE,

openaleph_search/transform/entity.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ def format_entity(dataset: str, entity: EntityProxy, **kwargs) -> Action | None:
8989
data[Field.NAME_PARTS] = list(index_name_parts(entity.schema, names))
9090
data[Field.NAME_PHONETIC] = list(phonetic_names(entity.schema, names))
9191

92+
# Add tags from EntityProxy.context (populated from the Aleph database before indexing)
93+
data[Field.TAGS] = ensure_list(entity.context.get("tags"))
94+
9295
# Slight hack: a magic property in followthemoney that gets taken out
9396
# of the properties and added straight to the index text.
9497
properties = data.get("properties", {})

tests/test_indexer.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
from ftmq.util import make_entity
2+
13
from openaleph_search.index.admin import clear_index
24
from openaleph_search.index.entities import index_bulk, iter_entities
5+
from openaleph_search.transform.entity import format_entity
36

47

58
def test_indexer(entities, cleanup_after):
@@ -12,3 +15,38 @@ def test_indexer(entities, cleanup_after):
1215
# overwrite
1316
index_bulk("test_dataset", entities)
1417
assert len(list(iter_entities())) == 21
18+
19+
20+
def test_indexer_with_tags(cleanup_after):
21+
# clear
22+
clear_index()
23+
24+
# Create entity with tags in context
25+
entity_data = {
26+
"id": "test-person-with-tags",
27+
"schema": "Person",
28+
"properties": {"name": ["Jane Doe"], "birthDate": ["1980-01-01"]},
29+
}
30+
entity = make_entity(entity_data)
31+
entity.context["tags"] = ["politician", "businessman", "controversial"]
32+
33+
# Verify format_entity includes the tags
34+
formatted = format_entity("test_dataset", entity)
35+
assert formatted is not None
36+
assert "tags" in formatted["_source"]
37+
assert formatted["_source"]["tags"] == [
38+
"politician",
39+
"businessman",
40+
"controversial",
41+
]
42+
43+
index_bulk("test_dataset", [entity])
44+
45+
# Verify entity was indexed
46+
indexed_entities = list(iter_entities())
47+
assert len(indexed_entities) == 1
48+
49+
# Verify tags are in the indexed document
50+
indexed_entity = indexed_entities[0]
51+
assert "tags" in indexed_entity
52+
assert indexed_entity["tags"] == ["politician", "businessman", "controversial"]

tests/test_search.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,48 @@ def test_search_sort(cleanup_after):
321321
result = query.search()
322322
assert len(result["hits"]["hits"]) == 2
323323
assert result["hits"]["hits"][0]["_id"] == "event2"
324+
325+
326+
def test_search_tag_filter(cleanup_after):
327+
# Create entity with tags in context
328+
entity_data = {
329+
"id": "jane-doe-tagged",
330+
"schema": "Person",
331+
"properties": {"name": ["Jane Doe"], "birthDate": ["1980-01-01"]},
332+
}
333+
entity = make_entity(entity_data)
334+
entity.context["tags"] = ["politician", "businessman", "controversial"]
335+
336+
index_bulk("test_tagged", [entity], sync=True)
337+
338+
# Test search with tag filter
339+
query = _create_query("/search?filter:dataset=test_tagged&filter:tags=politician")
340+
result = query.search()
341+
342+
assert result["hits"]["total"]["value"] == 1
343+
assert result["hits"]["hits"][0]["_id"] == "jane-doe-tagged"
344+
345+
# Test search with non-existent tag
346+
query = _create_query("/search?filter:dataset=test_tagged&filter:tags=nonexistent")
347+
result = query.search()
348+
349+
assert result["hits"]["total"]["value"] == 0
350+
351+
# Test search with a different tag carried by the same entity
352+
query = _create_query("/search?filter:dataset=test_tagged&filter:tags=businessman")
353+
result = query.search()
354+
355+
assert result["hits"]["total"]["value"] == 1
356+
assert result["hits"]["hits"][0]["_id"] == "jane-doe-tagged"
357+
358+
# Test tags facet aggregation
359+
query = _create_query("/search?filter:dataset=test_tagged&facet=tags")
360+
result = query.search()
361+
362+
assert result["hits"]["total"]["value"] == 1
363+
assert "aggregations" in result
364+
assert result["aggregations"]["tags.values"]["buckets"] == [
365+
{"key": "businessman", "doc_count": 1},
366+
{"key": "controversial", "doc_count": 1},
367+
{"key": "politician", "doc_count": 1},
368+
]

0 commit comments

Comments
 (0)