Skip to content

Commit 8220fc1

Browse files
committed
✨(backend) add document search indexer
Add an indexer that loops over the documents in the database, formats them as JSON objects and indexes them in the remote "Find" micro-service.
1 parent cd587c5 commit 8220fc1

File tree

7 files changed

+503
-0
lines changed

7 files changed

+503
-0
lines changed

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,10 @@ demo: ## flush db then create a demo for load testing purpose
179179
@$(MANAGE) create_demo
180180
.PHONY: demo
181181

182+
index: ## index all documents to remote search
183+
@$(MANAGE) index
184+
.PHONY: index
185+
182186
# Nota bene: Black should come after isort just in case they don't agree...
183187
lint: ## lint back-end python sources
184188
lint: \
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""
2+
Handle search setup that needs to be done at bootstrap time.
3+
"""
4+
5+
import logging
6+
import time
7+
8+
from django.core.management.base import BaseCommand
9+
10+
from ...services.search_indexers import FindDocumentIndexer
11+
12+
logger = logging.getLogger("docs.search.bootstrap_search")
13+
14+
15+
class Command(BaseCommand):
    """Index all documents to remote search service"""

    # Inside the class body, ``__doc__`` resolves to the class docstring above,
    # so the command's help text and its docstring stay in sync.
    help = __doc__

    def handle(self, *args, **options):
        """
        Launch and log search index generation.

        Runs ``FindDocumentIndexer().index()`` and logs the elapsed time
        measured with ``time.perf_counter()``. Any exception raised by the
        indexer propagates to the caller; it is not caught here.
        """
        logger.info("Starting to regenerate Find index...")
        start = time.perf_counter()

        FindDocumentIndexer().index()

        duration = time.perf_counter() - start
        # Pass the value as a logging argument so the message is only
        # formatted when the record is actually emitted.
        logger.info("Search index regenerated in %.2f seconds.", duration)
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
"""Document search index management utilities and indexers"""
2+
3+
import logging
4+
from abc import ABC, abstractmethod
5+
from collections import defaultdict
6+
7+
from django.conf import settings
8+
9+
import requests
10+
11+
from core import models, utils
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
def get_batch_accesses_by_users_and_teams(paths):
    """
    Get accesses related to a list of document paths,
    grouped by users and teams, including all ancestor paths.

    Args:
        paths (list): Document paths to resolve accesses for.

    Returns:
        dict: Maps a document path to ``{"users": set, "teams": set}``.
            Only paths for which at least one access row contributed a
            non-empty user sub or team are present as keys.
    """
    ancestor_map = utils.get_ancestor_to_descendants_map(
        paths, steplen=models.Document.steplen
    )
    ancestor_paths = list(ancestor_map)

    # One query for the whole batch: fetch the accesses attached to any of
    # the ancestor paths computed above.
    access_qs = models.DocumentAccess.objects.filter(
        document__path__in=ancestor_paths
    ).values("document__path", "user__sub", "team")

    access_by_document_path = defaultdict(lambda: {"users": set(), "teams": set()})

    for access in access_qs:
        ancestor_path = access["document__path"]
        user_sub = access["user__sub"]
        team = access["team"]

        # An access found on an ancestor path is recorded on every path the
        # map lists under that ancestor.
        for descendant_path in ancestor_map.get(ancestor_path, []):
            if user_sub:
                access_by_document_path[descendant_path]["users"].add(str(user_sub))
            if team:
                access_by_document_path[descendant_path]["teams"].add(team)

    # Return a plain dict so later lookups of unknown paths do not silently
    # create empty entries.
    return dict(access_by_document_path)
45+
46+
47+
class BaseDocumentIndexer(ABC):
    """
    Abstract base for document indexers.

    Takes care of walking the documents in batches and resolving their
    accesses. Concrete indexers provide the backend-specific parts by
    implementing `serialize_document()` and `push()`.
    """

    def __init__(self, batch_size=None):
        """
        Set up the indexer.

        Args:
            batch_size (int, optional): How many documents to process per batch.
                A falsy value falls back to settings.SEARCH_INDEXER_BATCH_SIZE.
        """
        if not batch_size:
            batch_size = settings.SEARCH_INDEXER_BATCH_SIZE
        self.batch_size = batch_size

    def index(self):
        """
        Walk all documents batch by batch, serialize each batch and push it
        to the search backend.
        """
        # Keyset pagination: each query resumes after the last id seen.
        cursor = 0
        while True:
            queryset = models.Document.objects.filter(id__gt=cursor).order_by("id")
            batch = list(queryset[: self.batch_size])
            if not batch:
                return

            cursor = batch[-1].id
            accesses = get_batch_accesses_by_users_and_teams(
                [document.path for document in batch]
            )
            self.push(
                [self.serialize_document(document, accesses) for document in batch]
            )

    @abstractmethod
    def serialize_document(self, document, accesses):
        """
        Turn a Document instance into a JSON-serializable structure for indexing.

        Subclasses are required to implement this.
        """

    @abstractmethod
    def push(self, data):
        """
        Send one batch of serialized documents to the backend.

        Subclasses are required to implement this.
        """
105+
106+
107+
class FindDocumentIndexer(BaseDocumentIndexer):
    """
    Document indexer that pushes documents to La Suite Find app.
    """

    def serialize_document(self, document, accesses):
        """
        Convert a Document to the JSON format expected by La Suite Find.

        Args:
            document (Document): The document instance.
            accesses (dict): Mapping of document path to a dict holding the
                "users" and "teams" collections that have access to it.
                Paths missing from the mapping yield empty lists.

        Returns:
            dict: A JSON-serializable dictionary.
        """
        doc_path = document.path
        text_content = utils.base64_yjs_to_text(document.content)
        # Look the document up once; a missing path means no known access.
        doc_accesses = accesses.get(doc_path, {})
        return {
            "id": str(document.id),
            "title": document.title,
            "content": text_content,
            "depth": document.depth,
            "path": doc_path,
            "numchild": document.numchild,
            "created_at": document.created_at.isoformat(),
            "updated_at": document.updated_at.isoformat(),
            "users": list(doc_accesses.get("users", ())),
            "groups": list(doc_accesses.get("teams", ())),
            "reach": document.computed_link_reach,
            # Size of the extracted text in bytes once UTF-8 encoded.
            "size": len(text_content.encode("utf-8")),
            # Active when `ancestors_deleted_at` is unset/falsy.
            "is_active": not bool(document.ancestors_deleted_at),
        }

    def push(self, data):
        """
        Push a batch of documents to the Find backend.

        Args:
            data (list): List of document dictionaries.

        Raises:
            RuntimeError: If SEARCH_INDEXER_URL or SEARCH_INDEXER_SECRET is
                missing or empty in Django settings.
            requests.exceptions.HTTPError: If the backend answers with an
                error status; the error and response body are logged first.
        """
        url = getattr(settings, "SEARCH_INDEXER_URL", None)
        if not url:
            raise RuntimeError(
                "SEARCH_INDEXER_URL must be set in Django settings before indexing."
            )

        secret = getattr(settings, "SEARCH_INDEXER_SECRET", None)
        if not secret:
            raise RuntimeError(
                "SEARCH_INDEXER_SECRET must be set in Django settings before indexing."
            )

        try:
            response = requests.post(
                url,
                json=data,
                headers={"Authorization": f"Bearer {secret}"},
                timeout=10,
            )
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            # `response` is always bound here: HTTPError is only caught from
            # raise_for_status(), which runs after the assignment above.
            logger.error("HTTPError: %s", e)
            logger.error("Response content: %s", response.text)
            raise

0 commit comments

Comments
 (0)