Skip to content

Commit 89fd00e

Browse files
committed
✨ (backend) Add async triggers to enable document indexation with find.
Signed-off-by: Fabre Florian <[email protected]>
1 parent f7186c1 commit 89fd00e

File tree

7 files changed

+226
-14
lines changed

7 files changed

+226
-14
lines changed

src/backend/core/models.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
from django.core.files.storage import default_storage
2121
from django.core.mail import send_mail
2222
from django.db import models, transaction
23+
from django.db.models import signals
2324
from django.db.models.functions import Left, Length
25+
from django.dispatch import receiver
2426
from django.template.loader import render_to_string
2527
from django.utils import timezone
2628
from django.utils.functional import cached_property
@@ -39,6 +41,7 @@
3941
RoleChoices,
4042
get_equivalent_link_definition,
4143
)
44+
from .tasks.find import trigger_document_indexer
4245

4346
logger = getLogger(__name__)
4447

@@ -949,6 +952,16 @@ def restore(self):
949952
)
950953

951954

955+
@receiver(signals.post_save, sender=Document)
956+
def document_post_save(sender, instance, **kwargs):
957+
"""
958+
Asynchronous call to the document indexer at the end of the transaction.
959+
Note : Within the transaction we can have an empty content and a serialization
960+
error.
961+
"""
962+
trigger_document_indexer(instance, on_commit=True)
963+
964+
952965
class LinkTrace(BaseModel):
953966
"""
954967
Relation model to trace accesses to a document via a link by a logged-in user.
@@ -1174,6 +1187,15 @@ def get_abilities(self, user):
11741187
}
11751188

11761189

1190+
@receiver(signals.post_save, sender=DocumentAccess)
1191+
def document_access_post_save(sender, instance, created, **kwargs):
1192+
"""
1193+
Asynchronous call to the document indexer at the end of the transaction.
1194+
"""
1195+
if not created:
1196+
trigger_document_indexer(instance.document, on_commit=True)
1197+
1198+
11771199
class DocumentAskForAccess(BaseModel):
11781200
"""Relation model to ask for access to a document."""
11791201

src/backend/core/services/search_indexers.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ def get_batch_accesses_by_users_and_teams(paths):
1919
grouped by users and teams, including all ancestor paths.
2020
"""
2121
print("paths: ", paths)
22-
ancestor_map = utils.get_ancestor_to_descendants_map(paths, steplen=models.Document.steplen)
22+
ancestor_map = utils.get_ancestor_to_descendants_map(
23+
paths, steplen=models.Document.steplen
24+
)
2325
ancestor_paths = list(ancestor_map.keys())
2426
print("ancestor map: ", ancestor_map)
2527
print("ancestor paths: ", ancestor_paths)

src/backend/core/tasks/find.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
"""Trigger document indexation using celery task."""
2+
3+
from django.conf import settings
4+
from django.core.cache import cache
5+
from django.db import transaction
6+
7+
from core import models
8+
from core.services.search_indexers import (
9+
FindDocumentIndexer,
10+
get_batch_accesses_by_users_and_teams,
11+
)
12+
13+
from impress.celery_app import app
14+
15+
16+
def document_indexer_debounce_key(document_id):
17+
return f"doc-indexer-debounce-{document_id}"
18+
19+
20+
def incr_counter(key):
21+
try:
22+
return cache.incr(key)
23+
except ValueError:
24+
cache.set(key, 1)
25+
return 1
26+
27+
28+
def decr_counter(key):
29+
try:
30+
return cache.decr(key)
31+
except ValueError:
32+
cache.set(key, 0)
33+
return 0
34+
35+
36+
@app.task
37+
def document_indexer_task(document_id):
38+
"""Send indexation query for a document using celery task."""
39+
key = document_indexer_debounce_key(document_id)
40+
41+
# check if the counter : if still up, skip the task. only the last one
42+
# within the countdown delay will do the query.
43+
if decr_counter(key) > 0:
44+
return
45+
46+
doc = models.Document.objects.get(pk=document_id)
47+
indexer = FindDocumentIndexer()
48+
accesses = get_batch_accesses_by_users_and_teams((doc.path,))
49+
50+
data = indexer.serialize_document(document=doc, accesses=accesses)
51+
52+
indexer.push(data)
53+
54+
55+
def trigger_document_indexer(document, on_commit=False):
56+
"""
57+
Trigger indexation task with debounce a delay set by the SEARCH_INDEXER_COUNTDOWN setting.
58+
59+
Args:
60+
document (Document): The document instance.
61+
on_commit (bool): Wait for the end of the transaction before starting the task
62+
(some fields may be in wrong state within the transaction)
63+
"""
64+
65+
if document.deleted_at or document.ancestors_deleted_at:
66+
pass
67+
68+
if on_commit:
69+
70+
def _aux():
71+
trigger_document_indexer(document, on_commit=False)
72+
73+
transaction.on_commit(_aux)
74+
else:
75+
key = document_indexer_debounce_key(document.pk)
76+
countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
77+
78+
# Each time this method is called during the countdown, we increment the
79+
# counter and each task decrease it, so the index be run only once.
80+
incr_counter(key)
81+
82+
document_indexer_task.apply_async(args=[document.pk], countdown=countdown)

src/backend/core/tests/test_models_documents.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import random
77
import smtplib
8+
import time
89
from logging import Logger
910
from unittest import mock
1011

@@ -13,12 +14,15 @@
1314
from django.core.cache import cache
1415
from django.core.exceptions import ValidationError
1516
from django.core.files.storage import default_storage
17+
from django.db import transaction
1618
from django.test.utils import override_settings
1719
from django.utils import timezone
1820

1921
import pytest
2022

2123
from core import factories, models
24+
from core.services.search_indexers import FindDocumentIndexer
25+
from core.tasks.find import document_indexer_debounce_key
2226

2327
pytestmark = pytest.mark.django_db
2428

@@ -1323,3 +1327,99 @@ def test_models_documents_compute_ancestors_links_paths_mapping_structure(
13231327
{"link_reach": sibling.link_reach, "link_role": sibling.link_role},
13241328
],
13251329
}
1330+
1331+
1332+
@mock.patch.object(FindDocumentIndexer, "push")
1333+
@pytest.mark.django_db(transaction=True)
1334+
def test_models_documents_post_save_indexer(mock_push, settings):
1335+
settings.SEARCH_INDEXER_COUNTDOWN = 0
1336+
1337+
user = factories.UserFactory()
1338+
1339+
with transaction.atomic():
1340+
doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
1341+
1342+
factories.UserDocumentAccessFactory(document=doc1, user=user)
1343+
factories.UserDocumentAccessFactory(document=doc2, user=user)
1344+
factories.UserDocumentAccessFactory(document=doc3, user=user)
1345+
1346+
time.sleep(0.1) # waits for the end of the tasks
1347+
1348+
accesses = {
1349+
str(doc1.path): {"users": [user.sub]},
1350+
str(doc2.path): {"users": [user.sub]},
1351+
str(doc3.path): {"users": [user.sub]},
1352+
}
1353+
1354+
data = [call.args[0] for call in mock_push.call_args_list]
1355+
1356+
indexer = FindDocumentIndexer()
1357+
1358+
def sortkey(d):
1359+
return d["id"]
1360+
1361+
assert sorted(data, key=sortkey) == sorted(
1362+
[
1363+
indexer.serialize_document(doc1, accesses),
1364+
indexer.serialize_document(doc2, accesses),
1365+
indexer.serialize_document(doc3, accesses),
1366+
],
1367+
key=sortkey,
1368+
)
1369+
1370+
1371+
@mock.patch.object(FindDocumentIndexer, "push")
1372+
@pytest.mark.django_db(transaction=True)
1373+
def test_models_documents_post_save_indexer_debounce(mock_push, settings):
1374+
settings.SEARCH_INDEXER_COUNTDOWN = 0
1375+
1376+
indexer = FindDocumentIndexer()
1377+
user = factories.UserFactory()
1378+
1379+
with transaction.atomic():
1380+
doc = factories.DocumentFactory()
1381+
factories.UserDocumentAccessFactory(document=doc, user=user)
1382+
1383+
accesses = {
1384+
str(doc.path): {"users": [user.sub]},
1385+
}
1386+
1387+
time.sleep(0.1) # waits for the end of the tasks
1388+
1389+
# One save from the factory
1390+
assert [call.args[0] for call in mock_push.call_args_list] == [
1391+
indexer.serialize_document(doc, accesses),
1392+
]
1393+
1394+
# The debounce counter should be reset
1395+
assert cache.get(document_indexer_debounce_key(doc.pk)) == 0
1396+
1397+
# Now simulate 1 waiting task
1398+
cache.set(document_indexer_debounce_key(doc.pk), 1)
1399+
1400+
# save it again to trigger the indexer, but nothing should be done since
1401+
# the counter is over 0
1402+
with transaction.atomic():
1403+
updated_doc = models.Document.objects.get(pk=doc.pk)
1404+
updated_doc.save()
1405+
1406+
time.sleep(0.1) # waits for the end of the tasks
1407+
1408+
# Still on the first indexation.
1409+
assert [call.args[0] for call in mock_push.call_args_list] == [
1410+
indexer.serialize_document(doc, accesses),
1411+
]
1412+
1413+
# Reset the counter
1414+
cache.set(document_indexer_debounce_key(doc.pk), 0)
1415+
1416+
with transaction.atomic():
1417+
updated_doc = models.Document.objects.get(pk=doc.pk)
1418+
updated_doc.save()
1419+
1420+
time.sleep(0.1) # waits for the end of the tasks
1421+
1422+
assert [call.args[0] for call in mock_push.call_args_list] == [
1423+
indexer.serialize_document(doc, accesses),
1424+
indexer.serialize_document(updated_doc, accesses),
1425+
]

src/backend/core/tests/test_services_search_indexers.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def test_services_search_indexers_serialize_document_returns_expected_json():
107107
"created_at": document.created_at.isoformat(),
108108
"updated_at": document.updated_at.isoformat(),
109109
"reach": document.link_reach,
110-
'size': 13,
110+
"size": 13,
111111
"is_active": True,
112112
}
113113

@@ -168,7 +168,9 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, se
168168
def test_services_search_indexers_ancestors_link_reach(mock_push):
169169
"""Document accesses and reach should take into account ancestors link reaches."""
170170
great_grand_parent = factories.DocumentFactory(link_reach="restricted")
171-
grand_parent = factories.DocumentFactory(parent=great_grand_parent, link_reach="authenticated")
171+
grand_parent = factories.DocumentFactory(
172+
parent=great_grand_parent, link_reach="authenticated"
173+
)
172174
parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
173175
document = factories.DocumentFactory(parent=parent, link_reach="restricted")
174176

@@ -199,7 +201,11 @@ def test_services_search_indexers_ancestors_users(mock_push):
199201
assert len(results) == 3
200202
assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)]
201203
assert set(results[str(parent.id)]["users"]) == {str(user_gp.sub), str(user_p.sub)}
202-
assert set(results[str(document.id)]["users"]) == {str(user_gp.sub), str(user_p.sub), str(user_d.sub)}
204+
assert set(results[str(document.id)]["users"]) == {
205+
str(user_gp.sub),
206+
str(user_p.sub),
207+
str(user_d.sub),
208+
}
203209

204210

205211
@patch.object(FindDocumentIndexer, "push")

src/backend/core/tests/test_utils.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,24 @@ def test_utils_extract_attachments():
7979

8080
def test_utils_get_ancestor_to_descendants_map_single_path():
8181
"""Test ancestor mapping of a single path."""
82-
paths = ['000100020005']
82+
paths = ["000100020005"]
8383
result = utils.get_ancestor_to_descendants_map(paths, steplen=4)
8484

8585
assert result == {
86-
'0001': {'000100020005'},
87-
'00010002': {'000100020005'},
88-
'000100020005': {'000100020005'},
86+
"0001": {"000100020005"},
87+
"00010002": {"000100020005"},
88+
"000100020005": {"000100020005"},
8989
}
9090

9191

9292
def test_utils_get_ancestor_to_descendants_map_multiple_paths():
9393
"""Test ancestor mapping of multiple paths with shared prefixes."""
94-
paths = ['000100020005', '00010003']
94+
paths = ["000100020005", "00010003"]
9595
result = utils.get_ancestor_to_descendants_map(paths, steplen=4)
9696

9797
assert result == {
98-
'0001': {'000100020005', '00010003'},
99-
'00010002': {'000100020005'},
100-
'000100020005': {'000100020005'},
101-
'00010003': {'00010003'},
98+
"0001": {"000100020005", "00010003"},
99+
"00010002": {"000100020005"},
100+
"000100020005": {"000100020005"},
101+
"00010003": {"00010003"},
102102
}

src/backend/core/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""Utils for the core app."""
22

33
import base64
4-
from collections import defaultdict
54
import re
5+
from collections import defaultdict
66

77
import pycrdt
88
from bs4 import BeautifulSoup

0 commit comments

Comments
 (0)