Skip to content

Index to search #1276

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ venv.bak/
env.d/development/*.local
env.d/terraform

# Docker
compose.override.yml
docker/auth/*.local

# npm
node_modules

Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ demo: ## flush db then create a demo for load testing purpose
@$(MANAGE) create_demo
.PHONY: demo

index: ## index all documents to remote search
@$(MANAGE) index
.PHONY: index

# Nota bene: Black should come after isort just in case they don't agree...
lint: ## lint back-end python sources
lint: \
Expand Down
8 changes: 8 additions & 0 deletions compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ services:
- env.d/development/postgresql.local
ports:
- "8071:8000"
networks:
- default
- lasuite-net
volumes:
- ./src/backend:/app
- ./data/static:/data/static
Expand Down Expand Up @@ -219,3 +222,8 @@ services:
kc_postgresql:
condition: service_healthy
restart: true

networks:
lasuite-net:
name: lasuite-net
driver: bridge
13 changes: 13 additions & 0 deletions src/backend/core/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,3 +801,16 @@ class MoveDocumentSerializer(serializers.Serializer):
choices=enums.MoveNodePositionChoices.choices,
default=enums.MoveNodePositionChoices.LAST_CHILD,
)


class FindDocumentSerializer(serializers.Serializer):
    """Validate the query parameters of a Find search request."""

    # Text to search for; the parameter is mandatory.
    q = serializers.CharField(required=True)

    def validate_q(self, value):
        """Return the query unchanged, rejecting one made only of whitespace."""
        if value.strip():
            return value

        raise serializers.ValidationError("Text field cannot be empty.")
37 changes: 37 additions & 0 deletions src/backend/core/api/viewsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from django.db.models.functions import Left, Length
from django.http import Http404, StreamingHttpResponse
from django.urls import reverse
from django.utils.decorators import method_decorator
from django.utils.functional import cached_property
from django.utils.text import capfirst, slugify
from django.utils.translation import gettext_lazy as _
Expand All @@ -29,6 +30,7 @@
from csp.constants import NONE
from csp.decorators import csp_update
from lasuite.malware_detection import malware_detection
from lasuite.oidc_login.decorators import refresh_oidc_access_token
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
Expand All @@ -37,6 +39,7 @@
from core import authentication, choices, enums, models
from core.services.ai_services import AIService
from core.services.collaboration_services import CollaborationService
from core.services.search_indexers import FindDocumentIndexer
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants

Expand All @@ -48,6 +51,12 @@
# pylint: disable=too-many-ancestors


class ServiceUnavailable(drf.exceptions.APIException):
    """API exception reporting an HTTP 503 "Service unavailable" error."""

    status_code = status.HTTP_503_SERVICE_UNAVAILABLE
    default_detail = "Service unavailable."
    default_code = "service_unavailable"


class NestedGenericViewSet(viewsets.GenericViewSet):
"""
A generic Viewset aims to be used in a nested route context.
Expand Down Expand Up @@ -367,6 +376,7 @@ class DocumentViewSet(
list_serializer_class = serializers.ListDocumentSerializer
trashbin_serializer_class = serializers.ListDocumentSerializer
tree_serializer_class = serializers.ListDocumentSerializer
search_serializer_class = serializers.ListDocumentSerializer

def get_queryset(self):
"""Get queryset performing all annotation and filtering on the document tree structure."""
Expand Down Expand Up @@ -980,6 +990,33 @@ def duplicate(self, request, *args, **kwargs):
{"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
)

@drf.decorators.action(detail=False, methods=["get"], url_path="search")
@method_decorator(refresh_oidc_access_token)
def search(self, request, *args, **kwargs):
    """
    Search documents through the remote Find indexer.

    The text to look for is read from the `q` query parameter and validated
    with `FindDocumentSerializer`; an invalid query raises a validation error.
    The search is run for the requesting user, forwarding the OIDC access
    token found in the session under `oidc_access_token` (`None` if absent).

    Raises `ServiceUnavailable` (HTTP 503) when the indexer raises a
    `RuntimeError`.
    """
    access_token = request.session.get("oidc_access_token")

    serializer = serializers.FindDocumentSerializer(data=request.query_params)
    serializer.is_valid(raise_exception=True)

    indexer = FindDocumentIndexer()
    try:
        queryset = indexer.search(
            # `q` is a required field, so it is always present once validated.
            text=serializer.validated_data["q"],
            user=request.user,
            token=access_token,
        )
    except RuntimeError as err:
        # Chain the original error so the root cause stays visible in tracebacks.
        raise ServiceUnavailable() from err

    return self.get_response_for_queryset(
        queryset,
        context={"request": request},
    )

@drf.decorators.action(detail=True, methods=["get"], url_path="versions")
def versions_list(self, request, *args, **kwargs):
"""
Expand Down
28 changes: 28 additions & 0 deletions src/backend/core/management/commands/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Index all documents to the remote search service.

Handles the search setup that needs to be done at bootstrap time.
"""

import logging
import time

from django.core.management.base import BaseCommand

from ...services.search_indexers import FindDocumentIndexer

logger = logging.getLogger("docs.search.bootstrap_search")


class Command(BaseCommand):
    """Index all documents to remote search service"""

    # The class docstring above doubles as the command help text.
    help = __doc__

    def handle(self, *args, **options):
        """Run the indexer over all documents and log the elapsed time."""
        logger.info("Starting to regenerate Find index...")

        started_at = time.perf_counter()
        FindDocumentIndexer().index()
        elapsed = time.perf_counter() - started_at

        logger.info("Search index regenerated in %.2f seconds.", elapsed)
69 changes: 47 additions & 22 deletions src/backend/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
from django.core.files.storage import default_storage
from django.core.mail import send_mail
from django.db import models, transaction
from django.db.models import signals
from django.db.models.functions import Left, Length
from django.dispatch import receiver
from django.template.loader import render_to_string
from django.utils import timezone
from django.utils.functional import cached_property
Expand All @@ -39,6 +41,7 @@
RoleChoices,
get_equivalent_link_definition,
)
from .tasks.find import trigger_document_indexer

logger = getLogger(__name__)

Expand Down Expand Up @@ -439,32 +442,35 @@ def __init__(self, *args, **kwargs):
def save(self, *args, **kwargs):
"""Write content to object storage only if _content has changed."""
super().save(*args, **kwargs)

if self._content:
file_key = self.file_key
bytes_content = self._content.encode("utf-8")
self.save_content(self._content)

# Attempt to directly check if the object exists using the storage client.
try:
response = default_storage.connection.meta.client.head_object(
Bucket=default_storage.bucket_name, Key=file_key
)
except ClientError as excpt:
# If the error is a 404, the object doesn't exist, so we should create it.
if excpt.response["Error"]["Code"] == "404":
has_changed = True
else:
raise
def save_content(self, content):
"""Save content to object storage."""

file_key = self.file_key
bytes_content = content.encode("utf-8")

# Attempt to directly check if the object exists using the storage client.
try:
response = default_storage.connection.meta.client.head_object(
Bucket=default_storage.bucket_name, Key=file_key
)
except ClientError as excpt:
# If the error is a 404, the object doesn't exist, so we should create it.
if excpt.response["Error"]["Code"] == "404":
has_changed = True
else:
# Compare the existing ETag with the MD5 hash of the new content.
has_changed = (
response["ETag"].strip('"')
!= hashlib.md5(bytes_content).hexdigest() # noqa: S324
)
raise
else:
# Compare the existing ETag with the MD5 hash of the new content.
has_changed = (
response["ETag"].strip('"') != hashlib.md5(bytes_content).hexdigest() # noqa: S324
)

if has_changed:
content_file = ContentFile(bytes_content)
default_storage.save(file_key, content_file)
if has_changed:
content_file = ContentFile(bytes_content)
default_storage.save(file_key, content_file)

def is_leaf(self):
"""
Expand Down Expand Up @@ -946,6 +952,16 @@ def restore(self):
)


@receiver(signals.post_save, sender=Document)
def document_post_save(sender, instance, **kwargs):
    """
    Trigger the document indexer each time a document is saved.

    The call is asynchronous and deferred to the end of the transaction
    (`on_commit=True`). Note: within the transaction we can have an empty
    content and a serialization error.
    """
    saved_document = instance
    trigger_document_indexer(saved_document, on_commit=True)


class LinkTrace(BaseModel):
"""
Relation model to trace accesses to a document via a link by a logged-in user.
Expand Down Expand Up @@ -1171,6 +1187,15 @@ def get_abilities(self, user):
}


@receiver(signals.post_save, sender=DocumentAccess)
def document_access_post_save(sender, instance, created, **kwargs):
    """
    Trigger the document indexer when an existing document access is saved.

    The call is asynchronous and deferred to the end of the transaction
    (`on_commit=True`). Nothing is triggered for a newly created access.
    """
    if created:
        return

    trigger_document_indexer(instance.document, on_commit=True)


class DocumentAskForAccess(BaseModel):
"""Relation model to ask for access to a document."""

Expand Down
Loading
Loading