Skip to content

Commit 685464f

Browse files
Ash-CrowAntoLC
andauthored
🚸(backend) sort user search results by proximity with the active user (#1802)
## Purpose Allows a user to find more easily the other users they search, with the following order of priority: - users they already share documents with (more recent first) - users that share the same full email domain - ~~users that share the same partial email domain (last two parts)~~ - ~~other users~~ Edit: We need to ilter out other users in order to not reveal email addresses from members of other organisations. It's still possible to invite them by email. Solves #1521 ## Proposal - [x] Add a new function in `core/utils.py`: `users_sharing_documents_with()` - [x] Use it as a key to sort the results of a basic user search - [x] Filter user results to avoid reveal of users (and email addresses) of other orgs or that have not been interacted with. - [x] User research through "full" email address (contains the '@') is left unaffected. --------- Co-authored-by: Anthony LC <anthony.le-courric@mail.numerique.gouv.fr>
1 parent 9af540d commit 685464f

File tree

14 files changed

+416
-35
lines changed

14 files changed

+416
-35
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ and this project adheres to
1313
### Changed
1414

1515
- ♿️(frontend) Focus main container after navigation #1854
16+
- 🚸(backend) sort user search results by proximity with the active user #1802
1617

1718

1819
### Fixed

docs/env.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ These are the environment variables you can set for the `impress-backend` contai
1717
| API_USERS_LIST_LIMIT | Limit on API users | 5 |
1818
| API_USERS_LIST_THROTTLE_RATE_BURST | Throttle rate for api on burst | 30/minute |
1919
| API_USERS_LIST_THROTTLE_RATE_SUSTAINED | Throttle rate for api | 180/hour |
20+
| API_USERS_SEARCH_QUERY_MIN_LENGTH | Minimum characters to insert to search a user | 3 |
2021
| AWS_S3_ACCESS_KEY_ID | Access id for s3 endpoint | |
2122
| AWS_S3_ENDPOINT_URL | S3 endpoint | |
2223
| AWS_S3_REGION_NAME | Region name for s3 endpoint | |

src/backend/core/api/filters.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import unicodedata
44

5+
from django.conf import settings
56
from django.utils.translation import gettext_lazy as _
67

78
import django_filters
@@ -135,4 +136,6 @@ class UserSearchFilter(django_filters.FilterSet):
135136
Custom filter for searching users.
136137
"""
137138

138-
q = django_filters.CharFilter(min_length=5, max_length=254)
139+
q = django_filters.CharFilter(
140+
min_length=settings.API_USERS_SEARCH_QUERY_MIN_LENGTH, max_length=254
141+
)

src/backend/core/api/viewsets.py

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from csp.decorators import csp_update
3838
from lasuite.malware_detection import malware_detection
3939
from lasuite.oidc_login.decorators import refresh_oidc_access_token
40+
from lasuite.tools.email import get_domain_from_email
4041
from rest_framework import filters, status, viewsets
4142
from rest_framework import response as drf_response
4243
from rest_framework.permissions import AllowAny
@@ -61,7 +62,11 @@
6162
get_visited_document_ids_of,
6263
)
6364
from core.tasks.mail import send_ask_for_access_mail
64-
from core.utils import extract_attachments, filter_descendants
65+
from core.utils import (
66+
extract_attachments,
67+
filter_descendants,
68+
users_sharing_documents_with,
69+
)
6570

6671
from . import permissions, serializers, utils
6772
from .filters import DocumentFilter, ListDocumentFilter, UserSearchFilter
@@ -220,18 +225,80 @@ def get_queryset(self):
220225

221226
# Use trigram similarity for non-email-like queries
222227
# For performance reasons we filter first by similarity, which relies on an
223-
# index, then only calculate precise similarity scores for sorting purposes
224-
225-
return (
228+
# index, then only calculate precise similarity scores for sorting purposes.
229+
#
230+
# Additionally results are reordered to prefer users "closer" to the current
231+
# user: users they recently shared documents with, then same email domain.
232+
# To achieve that without complex SQL, we build a proximity score in Python
233+
# and return the top N results.
234+
# For security results, users that match neither of these proximity criteria
235+
# are not returned at all, to prevent email enumeration.
236+
current_user = self.request.user
237+
shared_map = users_sharing_documents_with(current_user)
238+
239+
user_email_domain = get_domain_from_email(current_user.email) or ""
240+
241+
candidates = list(
226242
queryset.annotate(
227243
sim_email=TrigramSimilarity("email", query),
228244
sim_name=TrigramSimilarity("full_name", query),
229245
)
230246
.annotate(similarity=Greatest("sim_email", "sim_name"))
231247
.filter(similarity__gt=0.2)
232-
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT]
248+
.order_by("-similarity")
233249
)
234250

251+
# Keep only users that either share documents with the current user
252+
# or have an email with the same domain as the current user.
253+
filtered_candidates = []
254+
for u in candidates:
255+
candidate_domain = get_domain_from_email(u.email) or ""
256+
if shared_map.get(u.id) or (
257+
user_email_domain and candidate_domain == user_email_domain
258+
):
259+
filtered_candidates.append(u)
260+
261+
candidates = filtered_candidates
262+
263+
# Build ordering key for each candidate
264+
def _sort_key(u):
265+
# shared priority: most recent first
266+
# Use shared_last_at timestamp numeric for secondary ordering when shared.
267+
shared_last_at = shared_map.get(u.id)
268+
if shared_last_at:
269+
is_shared = 1
270+
shared_score = int(shared_last_at.timestamp())
271+
else:
272+
is_shared = 0
273+
shared_score = 0
274+
275+
# domain proximity
276+
candidate_email_domain = get_domain_from_email(u.email) or ""
277+
278+
same_full_domain = (
279+
1
280+
if candidate_email_domain
281+
and candidate_email_domain == user_email_domain
282+
else 0
283+
)
284+
285+
# similarity fallback
286+
sim = getattr(u, "similarity", 0) or 0
287+
288+
return (
289+
is_shared,
290+
shared_score,
291+
same_full_domain,
292+
sim,
293+
)
294+
295+
# Sort candidates by the key descending and return top N as a queryset-like
296+
# list. Keep return type consistent with previous behavior (QuerySet slice
297+
# was returned) by returning a list of model instances.
298+
candidates.sort(key=_sort_key, reverse=True)
299+
300+
return candidates[: settings.API_USERS_LIST_LIMIT]
301+
235302
@drf.decorators.action(
236303
detail=False,
237304
methods=["get"],
@@ -2338,6 +2405,7 @@ def get(self, request):
23382405
"""
23392406
array_settings = [
23402407
"AI_FEATURE_ENABLED",
2408+
"API_USERS_SEARCH_QUERY_MIN_LENGTH",
23412409
"COLLABORATION_WS_URL",
23422410
"COLLABORATION_WS_NOT_CONNECTED_READY_ONLY",
23432411
"CONVERSION_FILE_EXTENSIONS_ALLOWED",

src/backend/core/signals.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44

55
from functools import partial
66

7+
from django.core.cache import cache
78
from django.db import transaction
89
from django.db.models import signals
910
from django.dispatch import receiver
1011

11-
from . import models
12-
from .tasks.search import trigger_batch_document_indexer
12+
from core import models
13+
from core.tasks.search import trigger_batch_document_indexer
14+
from core.utils import get_users_sharing_documents_with_cache_key
1315

1416

1517
@receiver(signals.post_save, sender=models.Document)
@@ -26,8 +28,24 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar
2628
def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
2729
"""
2830
Asynchronous call to the document indexer at the end of the transaction.
31+
Clear cache for the affected user.
2932
"""
3033
if not created:
3134
transaction.on_commit(
3235
partial(trigger_batch_document_indexer, instance.document)
3336
)
37+
38+
# Invalidate cache for the user
39+
if instance.user:
40+
cache_key = get_users_sharing_documents_with_cache_key(instance.user)
41+
cache.delete(cache_key)
42+
43+
44+
@receiver(signals.post_delete, sender=models.DocumentAccess)
45+
def document_access_post_delete(sender, instance, **kwargs): # pylint: disable=unused-argument
46+
"""
47+
Clear cache for the affected user when document access is deleted.
48+
"""
49+
if instance.user:
50+
cache_key = get_users_sharing_documents_with_cache_key(instance.user)
51+
cache.delete(cache_key)

src/backend/core/tests/test_api_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
@override_settings(
2222
AI_FEATURE_ENABLED=False,
23+
API_USERS_SEARCH_QUERY_MIN_LENGTH=6,
2324
COLLABORATION_WS_URL="http://testcollab/",
2425
COLLABORATION_WS_NOT_CONNECTED_READY_ONLY=True,
2526
CRISP_WEBSITE_ID="123",
@@ -44,6 +45,7 @@ def test_api_config(is_authenticated):
4445
assert response.status_code == HTTP_200_OK
4546
assert response.json() == {
4647
"AI_FEATURE_ENABLED": False,
48+
"API_USERS_SEARCH_QUERY_MIN_LENGTH": 6,
4749
"COLLABORATION_WS_URL": "http://testcollab/",
4850
"COLLABORATION_WS_NOT_CONNECTED_READY_ONLY": True,
4951
"CONVERSION_FILE_EXTENSIONS_ALLOWED": [".docx", ".md"],

src/backend/core/tests/test_api_users.py

Lines changed: 83 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
Test users API endpoints in the impress core app.
33
"""
44

5+
from django.utils import timezone
6+
57
import pytest
68
from rest_framework.test import APIClient
79

@@ -121,12 +123,12 @@ def test_api_users_list_query_full_name():
121123
Authenticated users should be able to list users and filter by full name.
122124
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
123125
"""
124-
user = factories.UserFactory()
126+
user = factories.UserFactory(email="user@example.com")
125127

126128
client = APIClient()
127129
client.force_login(user)
128130

129-
dave = factories.UserFactory(email="contact@work.com", full_name="David Bowman")
131+
dave = factories.UserFactory(email="contact@example.com", full_name="David Bowman")
130132

131133
response = client.get(
132134
"/api/v1.0/users/?q=David",
@@ -166,13 +168,13 @@ def test_api_users_list_query_accented_full_name():
166168
Authenticated users should be able to list users and filter by full name with accents.
167169
Only results with a Trigram similarity greater than 0.2 with the query should be returned.
168170
"""
169-
user = factories.UserFactory()
171+
user = factories.UserFactory(email="user@example.com")
170172

171173
client = APIClient()
172174
client.force_login(user)
173175

174176
fred = factories.UserFactory(
175-
email="contact@work.com", full_name="Frédérique Lefèvre"
177+
email="contact@example.com", full_name="Frédérique Lefèvre"
176178
)
177179

178180
response = client.get("/api/v1.0/users/?q=Frédérique")
@@ -201,12 +203,82 @@ def test_api_users_list_query_accented_full_name():
201203
assert users == []
202204

203205

206+
def test_api_users_list_sorted_by_closest_match():
207+
"""
208+
Authenticated users should be able to list users and the results should be
209+
sorted by closest match to the query.
210+
211+
Sorting criteria are :
212+
- Shared documents with the user (most recent first)
213+
- Same full email domain (example.gouv.fr)
214+
215+
Addresses that match neither criteria should be excluded from the results.
216+
217+
Case in point: the logged-in user has recently shared documents
218+
with pierre.dupont@beta.gouv.fr and less recently with pierre.durand@impots.gouv.fr.
219+
220+
Other users named Pierre also exist:
221+
- pierre.thomas@example.com
222+
- pierre.petit@anct.gouv.fr
223+
- pierre.robert@culture.gouv.fr
224+
225+
The search results should be ordered as follows:
226+
227+
# Shared with first
228+
- pierre.dupond@beta.gouv.fr # Most recent first
229+
- pierre.durand@impots.gouv.fr
230+
# Same full domain second
231+
- pierre.petit@anct.gouv.fr
232+
"""
233+
234+
user = factories.UserFactory(
235+
email="martin.bernard@anct.gouv.fr", full_name="Martin Bernard"
236+
)
237+
238+
client = APIClient()
239+
client.force_login(user)
240+
241+
pierre_1 = factories.UserFactory(email="pierre.dupont@beta.gouv.fr")
242+
pierre_2 = factories.UserFactory(email="pierre.durand@impots.gouv.fr")
243+
_pierre_3 = factories.UserFactory(email="pierre.thomas@example.com")
244+
pierre_4 = factories.UserFactory(email="pierre.petit@anct.gouv.fr")
245+
_pierre_5 = factories.UserFactory(email="pierre.robert@culture.gouv.fr")
246+
247+
document_1 = factories.DocumentFactory(creator=user)
248+
document_2 = factories.DocumentFactory(creator=user)
249+
factories.UserDocumentAccessFactory(user=user, document=document_1)
250+
factories.UserDocumentAccessFactory(user=user, document=document_2)
251+
252+
now = timezone.now()
253+
last_week = now - timezone.timedelta(days=7)
254+
last_month = now - timezone.timedelta(days=30)
255+
256+
# The factory cannot set the created_at directly, so we force it after creation
257+
p1_d1 = factories.UserDocumentAccessFactory(user=pierre_1, document=document_1)
258+
p1_d1.created_at = last_week
259+
p1_d1.save()
260+
261+
p2_d2 = factories.UserDocumentAccessFactory(user=pierre_2, document=document_2)
262+
p2_d2.created_at = last_month
263+
p2_d2.save()
264+
265+
response = client.get("/api/v1.0/users/?q=Pierre")
266+
assert response.status_code == 200
267+
user_ids = [user["email"] for user in response.json()]
268+
269+
assert user_ids == [
270+
str(pierre_1.email),
271+
str(pierre_2.email),
272+
str(pierre_4.email),
273+
]
274+
275+
204276
def test_api_users_list_limit(settings):
205277
"""
206278
Authenticated users should be able to list users and the number of results
207-
should be limited to 10.
279+
should be limited to API_USERS_LIST_LIMIT (by default 5).
208280
"""
209-
user = factories.UserFactory()
281+
user = factories.UserFactory(email="user@example.com")
210282

211283
client = APIClient()
212284
client.force_login(user)
@@ -309,28 +381,16 @@ def test_api_users_list_query_email_exclude_doc_user():
309381

310382
def test_api_users_list_query_short_queries():
311383
"""
312-
Queries shorter than 5 characters should return an empty result set.
384+
If API_USERS_SEARCH_QUERY_MIN_LENGTH is not set, the default minimum length should be 3.
313385
"""
314386
user = factories.UserFactory(email="paul@example.com", full_name="Paul")
315387
client = APIClient()
316388
client.force_login(user)
317389

318-
factories.UserFactory(email="john.doe@example.com")
319-
factories.UserFactory(email="john.lennon@example.com")
320-
321-
response = client.get("/api/v1.0/users/?q=jo")
322-
assert response.status_code == 400
323-
assert response.json() == {
324-
"q": ["Ensure this value has at least 5 characters (it has 2)."]
325-
}
326-
327-
response = client.get("/api/v1.0/users/?q=john")
328-
assert response.status_code == 400
329-
assert response.json() == {
330-
"q": ["Ensure this value has at least 5 characters (it has 4)."]
331-
}
390+
factories.UserFactory(email="john.doe@example.com", full_name="John Doe")
391+
factories.UserFactory(email="john.lennon@example.com", full_name="John Lennon")
332392

333-
response = client.get("/api/v1.0/users/?q=john.")
393+
response = client.get("/api/v1.0/users/?q=joh")
334394
assert response.status_code == 200
335395
assert len(response.json()) == 2
336396

@@ -356,7 +416,7 @@ def test_api_users_list_query_long_queries():
356416

357417
def test_api_users_list_query_inactive():
358418
"""Inactive users should not be listed."""
359-
user = factories.UserFactory()
419+
user = factories.UserFactory(email="user@example.com")
360420
client = APIClient()
361421
client.force_login(user)
362422

0 commit comments

Comments
 (0)