diff --git a/pyproject.toml b/pyproject.toml index 742e9758..d5af37f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,11 +37,13 @@ dependencies = [ "django-extensions", "django-filter", "django-flags", + "django-redis>=6", "django-regex", "django-smart-env", "django-storages[azure]", "django-svelte-jsoneditor", "djangorestframework", + "dotenv>=0.9.9", "drf-nested-routers", "drf-spectacular[sidecar]", "flower", @@ -49,6 +51,7 @@ dependencies = [ "numpy<2", "psycopg[binary]", "psycopg-pool", + "pyzstd>=0.17", "requests", "sentry-sdk[celery,django]", "setuptools", diff --git a/src/hope_dedup_engine/apps/api/admin/deduplicationset.py b/src/hope_dedup_engine/apps/api/admin/deduplicationset.py index 22c97dee..70a11b82 100644 --- a/src/hope_dedup_engine/apps/api/admin/deduplicationset.py +++ b/src/hope_dedup_engine/apps/api/admin/deduplicationset.py @@ -1,3 +1,4 @@ +from django.contrib import messages from django.contrib.admin import ModelAdmin, register from admin_extra_buttons.decorators import button, link @@ -8,8 +9,8 @@ from django.db.models import QuerySet from django.http import HttpRequest from rest_framework.reverse import reverse - -from hope_dedup_engine.apps.api.models import DeduplicationSet +from django.utils.translation import gettext as _ +from hope_dedup_engine.apps.api.models import DedupJob, DeduplicationSet @register(DeduplicationSet) @@ -56,5 +57,34 @@ def findings(self, button: button) -> str | None: button.visible = False return None + @button( + label="Terminate Job", + permission=lambda request, obj, **kwargs: ( + request.user.is_staff + and obj + and obj.state not in [DeduplicationSet.State.FAILED, DeduplicationSet.State.CANCELED] + ), + confirm=_("Are you sure you want to terminate the running job for this deduplication set?"), + ) + def terminate_job(self, request: HttpRequest, pk: str) -> None: + ds = self.get_object(request, pk) + + job = DedupJob.objects.filter(deduplication_set=ds).exclude(curr_async_result_id__isnull=True).first() + + if job and job.curr_async_result_id: + new_status = job.terminate(job.curr_async_result_id) + self.message_user( + request, + f"Job termination initiated. New job status: {new_status}.", + messages.SUCCESS, + ) + else: + self.message_user( + request, + _("No active job found. 
Setting state to Canceled."), + messages.WARNING, + ) + ds.set_state(DeduplicationSet.State.CANCELED) + def get_queryset(self, request: HttpRequest) -> QuerySet[DeduplicationSet]: return DeduplicationSet.objects.only(*self.get_list_display(request)) diff --git a/src/hope_dedup_engine/apps/api/deduplication/process.py b/src/hope_dedup_engine/apps/api/deduplication/process.py index 40813b9a..4fa12d9b 100644 --- a/src/hope_dedup_engine/apps/api/deduplication/process.py +++ b/src/hope_dedup_engine/apps/api/deduplication/process.py @@ -19,7 +19,6 @@ encode_chunk, get_chunks, finish_with_error, - ChunkPurpose, ) HOUR = 60 * 60 @@ -68,6 +67,7 @@ def find_duplicates(self, dedup_job_id: int, version: int) -> dict[str, Any]: send_notification(deduplication_set.notification_url) config = asdict(DeduplicationSetConfig.from_deduplication_set(deduplication_set)) + config["dedup_job_id"] = dedup_job_id # clean results Finding.objects.filter(deduplication_set=deduplication_set).delete() @@ -78,7 +78,7 @@ def find_duplicates(self, dedup_job_id: int, version: int) -> dict[str, Any]: deduplication_set.finding_set.update(score=F("score") / weight_total) filenames = deduplication_set.filenames_without_encodings() - chunks = get_chunks(filenames, purpose=ChunkPurpose.ENCODE) + chunks = get_chunks(filenames) tasks = [encode_chunk.s(chunk, config) for chunk in chunks] chord_id = chord(tasks)(callback_encodings.s(config=config)) diff --git a/src/hope_dedup_engine/apps/api/models/deduplication.py b/src/hope_dedup_engine/apps/api/models/deduplication.py index 6f6475c5..61d1697f 100644 --- a/src/hope_dedup_engine/apps/api/models/deduplication.py +++ b/src/hope_dedup_engine/apps/api/models/deduplication.py @@ -27,6 +27,7 @@ class State(models.IntegerChoices): ) # Images are added to deduplication set, but not yet processed PROCESSING = 2, "Processing" # deduplication set is being processed FAILED = 3, "Failed" # an error occurred + CANCELED = 4, "Canceled" # Process was canceled by a user id = models.UUIDField(primary_key=True, default=uuid4) name = models.CharField(max_length=128, unique=True, null=True, blank=True, db_index=True) diff --git a/src/hope_dedup_engine/apps/api/urls.py b/src/hope_dedup_engine/apps/api/urls.py index b29b3780..79452af0 100644 --- a/src/hope_dedup_engine/apps/api/urls.py +++ b/src/hope_dedup_engine/apps/api/urls.py @@ -43,14 +43,14 @@ urlpatterns = [ path("", include(router.urls)), path("", include(deduplication_sets_router.urls)), - path("api/rest/", SpectacularAPIView.as_view(), name="schema"), + path("rest/", SpectacularAPIView.as_view(), name="schema"), path( - "api/rest/swagger/", + "rest/swagger/", SpectacularSwaggerView.as_view(url_name="schema"), name="swagger-ui", ), path( - "api/rest/redoc/", + "rest/redoc/", SpectacularRedocView.as_view(url_name="schema"), name="redoc", ), diff --git a/src/hope_dedup_engine/apps/api/views.py b/src/hope_dedup_engine/apps/api/views.py index 6cdb4589..44c7f118 100644 --- a/src/hope_dedup_engine/apps/api/views.py +++ b/src/hope_dedup_engine/apps/api/views.py @@ -1,9 +1,9 @@ -from dataclasses import dataclass from http import HTTPMethod from typing import Any from uuid import UUID from django.db.models import QuerySet, Model +from django.http import HttpRequest from drf_spectacular.utils import extend_schema from rest_framework import mixins, status, viewsets from rest_framework.decorators import action @@ -61,6 +61,7 @@ class DeduplicationSetViewSet( HasAccessToDeduplicationSet, ) serializer_class = DeduplicationSetSerializer + queryset = 
DeduplicationSet.objects.all() def get_queryset(self) -> QuerySet: return DeduplicationSet.objects.filter(system=self.request.auth.system, deleted=False) @@ -160,35 +161,8 @@ def destroy(self, request: Request, *args: Any, **kwargs: Any) -> Response: return super().destroy(request, *args, **kwargs) -@dataclass -class ListDataWrapper: - data: list[dict[str, Any]] - - def __setitem__(self, key: str, value: Any) -> None: - for item in self.data: - item[key] = value - - -class WrapRequestDataMixin: - def initialize_request(self, request: Request, *args: Any, **kwargs: Any) -> Request: - request = super().initialize_request(request, *args, **kwargs) - request._full_data = ListDataWrapper(request.data) - return request - - -class UnwrapRequestDataMixin: - def initialize_request(self, request: Request, *args: Any, **kwargs: Any) -> Request: - request = super().initialize_request(request, *args, **kwargs) - request._full_data = request._full_data.data - return request - - -# drf-nested-routers doesn't work correctly when request data is a list, so we use WrapRequestDataMixin, -# UnwrapRequestDataMixin, and ListDataWrapper to make it work with list of objects class BulkImageViewSet( - UnwrapRequestDataMixin, nested_viewsets.NestedViewSetMixin[Image], - WrapRequestDataMixin, mixins.CreateModelMixin, viewsets.GenericViewSet, ): @@ -204,14 +178,45 @@ class BulkImageViewSet( DEDUPLICATION_SET_PARAM: DEDUPLICATION_SET_FILTER, } + def _inject_parent_lookup_kwargs(self, request_data: Any, view_kwargs: dict) -> None: + """Inject parent lookup kwargs into request data, handling list-based data for bulk operations.""" + if getattr(self, "swagger_fake_view", False): + return + + for url_kwarg, fk_filter in self._get_parent_lookup_kwargs().items(): + parent_arg = fk_filter.partition("__")[0] + parent_pk = view_kwargs[url_kwarg] + + if isinstance(request_data, list): + for item in request_data: + if isinstance(item, dict): + item[parent_arg] = parent_pk + elif isinstance(request_data, dict): + request_data[parent_arg] = parent_pk + + def initialize_request(self, request: HttpRequest, *args: Any, **kwargs: Any) -> Request: + """Override to bypass faulty drf-nested-routers logic and inject parent kwargs correctly.""" + drf_request: Request = super(nested_viewsets.NestedViewSetMixin, self).initialize_request( + request, *args, **kwargs + ) + self._inject_parent_lookup_kwargs(drf_request.data, kwargs) + return drf_request + + def initial(self, request: Request, *args: Any, **kwargs: Any) -> None: + """Override to bypass faulty drf-nested-routers logic and inject parent kwargs correctly.""" + super(nested_viewsets.NestedViewSetMixin, self).initial(request, *args, **kwargs) + self._inject_parent_lookup_kwargs(request.data, kwargs) + def get_serializer(self, *args: Any, **kwargs: Any) -> Serializer: if self.action == "create": - return CreateImageSerializer(*args, **kwargs, many=True) - return super().get_serializer(*args, **kwargs, many=True) + kwargs.setdefault("many", True) + return CreateImageSerializer(*args, **kwargs) + return super().get_serializer(*args, **kwargs) def perform_create(self, serializer: Serializer) -> None: super().perform_create(serializer) - if deduplication_set := (serializer.instance[0].deduplication_set if serializer.instance else None): + if serializer.instance: + deduplication_set = serializer.instance[0].deduplication_set deduplication_set.updated_by = self.request.user deduplication_set.save() diff --git a/src/hope_dedup_engine/apps/faces/celery_tasks.py 
b/src/hope_dedup_engine/apps/faces/celery_tasks.py
index 8fd4b01c..6691e530 100644
--- a/src/hope_dedup_engine/apps/faces/celery_tasks.py
+++ b/src/hope_dedup_engine/apps/faces/celery_tasks.py
@@ -1,21 +1,20 @@
 import traceback
-from itertools import combinations_with_replacement
+from functools import partial
+import json
+from typing import Any, Final, TYPE_CHECKING
+
+from django.conf import settings
+from django_redis import get_redis_connection
 import sentry_sdk
-from functools import partial
-from typing import TYPE_CHECKING, Any, Iterable
-from enum import IntEnum
-from itertools import batched
-from celery import Task, chord, shared_task, states
+from celery import Task, chord, shared_task, signals, states
 from celery.utils.imports import qualname
-from django.conf import settings
 from hope_dedup_engine.apps.api.models import DeduplicationSet
 from hope_dedup_engine.apps.api.utils.notification import send_notification
 from hope_dedup_engine.apps.faces.managers import FileSyncManager
 from hope_dedup_engine.apps.faces.services.facial import dedupe_images, encode_faces
-from hope_dedup_engine.apps.faces.utils import report_long_execution
 from hope_dedup_engine.config.celery import DedupeTask, app
 from hope_dedup_engine.type_aliases import FindingType
@@ -23,19 +22,25 @@
 if TYPE_CHECKING:
     from celery.canvas import Signature

+TARGET_CHUNKS: Final[int] = 30

-class ChunkPurpose(IntEnum):
-    ENCODE = 25
-    DEDUPE = 7000
+def get_chunks(files: list[str]) -> list[list[str]]:
+    """Divide elements into a target number of chunks for parallel processing."""
+    if not files:
+        return []
+    num_chunks = min(len(files), TARGET_CHUNKS)
+    chunk_size = (len(files) + num_chunks - 1) // num_chunks
+    return [files[i : i + chunk_size] for i in range(0, len(files), chunk_size)]

-def get_chunks(filenames: Iterable[str], *, purpose: ChunkPurpose) -> list[list[str]]:
-    return [list(b) for b in batched(filenames, int(purpose))]  # noqa: B911
-
-def notify_status(task: Task, dedup_job_id: int | None = None, **kwargs):
-    # This is temporary and should be replaced with proper logging or removed completely
-    return True
+def notify_status(task: Task, config: dict[str, Any], **kwargs):
+    dedup_job_id = config.get("dedup_job_id")
+    signals.task_prerun.send(
+        sender=task,
+        task_id=task.request.id,
+        dedup_job_id=dedup_job_id,
+    )

 def shadow_name(task, args, kwargs, options):
@@ -44,7 +49,6 @@ def shadow_name(task, args, kwargs, options):
         group: str = options["group_id"].split("-")[-1]
         chunk = int(options["group_index"])
         return f"{qualname(s.type)}({group})-{chunk:03}"
-    # we do not care about the actual error here
     except Exception as e:  # noqa: BLE001
         sentry_sdk.capture_exception(e)
         return str(e)
@@ -66,44 +70,51 @@
 def finish_with_success(ds: DeduplicationSet) -> None:
     finish_processing(ds)

-@app.task(bind=True, base=DedupeTask, shadow_name=shadow_name)
+@app.task(bind=True, base=DedupeTask, shadow_name=shadow_name, acks_late=True)
 def encode_chunk(
     self: DedupeTask,
     files: list[str],
     config: dict[str, Any],
-) -> None:
+) -> list[str]:
     """Encode faces in a chunk of files."""
-    with report_long_execution('DeduplicationSet.objects.get(pk=config.get("deduplication_set_id"))'):
-        ds = DeduplicationSet.objects.get(pk=config.get("deduplication_set_id"))
+    ds = DeduplicationSet.objects.get(pk=config.get("deduplication_set_id"))
     try:
-        callback = partial(notify_status, task=self)
-        with report_long_execution('encode_faces(files, config.get("encoding"), pre_encodings, progress=callback)'):
-            results = encode_faces(files, config.get("encoding"), progress=callback)
-        with report_long_execution("ds.update_encodings(results[0])"):
-            ds.update_encodings(results[0])
+        callback = partial(notify_status, task=self, config=config)
+        pre_encodings = ds.get_encodings()  # reuse prior encodings so re-delivered (acks_late) chunks skip work
+        results = encode_faces(files, config.get("encoding"), pre_encodings, progress=callback)
+        ds.update_encodings(results[0])
+        return results[1]
     except Exception as e:
         sentry_sdk.capture_exception(e)
        finish_with_error(ds, e)
         raise
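For reference (not part of the diff), a minimal sketch of how get_chunks above balances work: it targets TARGET_CHUNKS, but because sizes are equalized with ceiling division it can produce fewer chunks than the target.

```python
# Mirrors get_chunks() with TARGET_CHUNKS = 30, as defined in this file.
files = [f"img_{i}.jpg" for i in range(65)]

num_chunks = min(len(files), 30)                          # 30
chunk_size = (len(files) + num_chunks - 1) // num_chunks  # ceil(65 / 30) = 3
chunks = [files[i : i + chunk_size] for i in range(0, len(files), chunk_size)]

assert len(chunks) == 22            # 21 chunks of 3 files, 1 chunk of 2
assert sum(map(len, chunks)) == 65  # nothing is dropped
```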
config.get("encoding"), progress=callback) - with report_long_execution("ds.update_encodings(results[0])"): - ds.update_encodings(results[0]) + callback = partial(notify_status, task=self, config=config) + ds.get_encodings() + results = encode_faces(files, config.get("encoding"), progress=callback) + ds.update_encodings(results[0]) + return results[1] except Exception as e: sentry_sdk.capture_exception(e) finish_with_error(ds, e) raise -@app.task(bind=True, base=DedupeTask) +@app.task(bind=True, base=DedupeTask, acks_late=True) def dedupe_chunk( self: Task, - files0: list[str], - files1: list[str], + chunk_key1: str, + chunk_key2: str, + ignored_pairs_key: str, config: dict[str, Any], ) -> FindingType: - """Deduplicate faces in a chunk of files.""" + """Deduplicate faces between two chunks of files.""" ds = DeduplicationSet.objects.get(pk=config.get("deduplication_set_id")) try: - callback = partial(notify_status, task=self) - encoded = ds.get_encodings() - ignored_pairs = set(ds.get_ignored_pairs()) + redis_conn = get_redis_connection("default") + encodings1 = json.loads(redis_conn.get(chunk_key1)) + + if chunk_key1 == chunk_key2: + encodings2 = encodings1 + else: + encodings2 = json.loads(redis_conn.get(chunk_key2)) + + ignored_pairs = {tuple(p) for p in json.loads(redis_conn.get(ignored_pairs_key))} + + callback = partial(notify_status, task=self, config=config) return dedupe_images( - files0, - files1, - encoded, + encodings1, + encodings2, ignored_pairs, dedupe_threshold=config.get("deduplicate", {}).get("threshold"), options=config.get("deduplicate"), @@ -119,10 +130,12 @@ def dedupe_chunk( def callback_findings( self: Task, results: FindingType, + context: dict[str, Any], config: dict[str, Any], ) -> dict[str, Any]: """Aggregate and save findings.""" ds = DeduplicationSet.objects.get(pk=config.get("deduplication_set_id")) + keys_to_delete = context["keys_to_delete"] try: seen_pairs = set() findings = [ @@ -144,18 +157,66 @@ def callback_findings( sentry_sdk.capture_exception(e) finish_with_error(ds, e) raise + finally: + try: + if keys_to_delete: + redis_conn = get_redis_connection("default") + redis_conn.delete(*keys_to_delete) + except OSError as e: + sentry_sdk.capture_exception(e) @app.task(bind=True, base=DedupeTask) def callback_encodings( self: Task, - results: list[None], + results: list[list[str]], config: dict[str, Any], ) -> dict[str, Any]: - """Aggregate and save encodings.""" + """Cache encodings and ignored pairs, then start deduplication.""" ds = DeduplicationSet.objects.get(pk=config.get("deduplication_set_id")) try: - deduplicate_dataset.delay(config) + new_files = {file for file_list in results for file in file_list} + encodings = ds.get_encodings() + ignored_pairs = list(ds.get_ignored_pairs()) + + new_encodings = {f: encodings[f] for f in new_files if f in encodings} + existing_encodings = {f: e for f, e in encodings.items() if f not in new_files} + + # Split encodings into chunks + new_key_chunks = get_chunks(list(new_encodings.keys())) + existing_key_chunks = get_chunks(list(existing_encodings.keys())) + + redis_conn = get_redis_connection("default") + dedup_prefix = f"hde:dedup:{ds.pk}" + + # Save ignored pairs + ignored_pairs_key = f"{dedup_prefix}:ignored_pairs" + payload = json.dumps(ignored_pairs) + redis_conn.set(ignored_pairs_key, payload, ex=86400) + + # Save encoding chunks + new_chunk_keys = [] + for i, key_chunk in enumerate(new_key_chunks): + chunk_encodings = {k: new_encodings[k] for k in key_chunk} + key = f"{dedup_prefix}:chunk:new:{i}" + payload 
 @app.task(bind=True, base=DedupeTask)
 def callback_encodings(
     self: Task,
-    results: list[None],
+    results: list[list[str]],
     config: dict[str, Any],
 ) -> dict[str, Any]:
-    """Aggregate and save encodings."""
+    """Cache encodings and ignored pairs, then start deduplication."""
     ds = DeduplicationSet.objects.get(pk=config.get("deduplication_set_id"))
     try:
-        deduplicate_dataset.delay(config)
+        new_files = {file for file_list in results for file in file_list}
+        encodings = ds.get_encodings()
+        ignored_pairs = list(ds.get_ignored_pairs())
+
+        new_encodings = {f: encodings[f] for f in new_files if f in encodings}
+        existing_encodings = {f: e for f, e in encodings.items() if f not in new_files}
+
+        # Split encodings into chunks
+        new_key_chunks = get_chunks(list(new_encodings.keys()))
+        existing_key_chunks = get_chunks(list(existing_encodings.keys()))
+
+        redis_conn = get_redis_connection("default")
+        dedup_prefix = f"hde:dedup:{ds.pk}"
+
+        # Save ignored pairs
+        ignored_pairs_key = f"{dedup_prefix}:ignored_pairs"
+        payload = json.dumps(ignored_pairs)
+        redis_conn.set(ignored_pairs_key, payload, ex=86400)
+
+        # Save encoding chunks
+        new_chunk_keys = []
+        for i, key_chunk in enumerate(new_key_chunks):
+            chunk_encodings = {k: new_encodings[k] for k in key_chunk}
+            key = f"{dedup_prefix}:chunk:new:{i}"
+            payload = json.dumps(chunk_encodings)
+            redis_conn.set(key, payload, ex=86400)
+            new_chunk_keys.append(key)
+
+        existing_chunk_keys = []
+        for i, key_chunk in enumerate(existing_key_chunks):
+            chunk_encodings = {k: existing_encodings[k] for k in key_chunk}
+            key = f"{dedup_prefix}:chunk:existing:{i}"
+            payload = json.dumps(chunk_encodings)
+            redis_conn.set(key, payload, ex=86400)
+            existing_chunk_keys.append(key)
+
+        deduplicate_dataset.delay(
+            config=config,
+            new_chunk_keys=new_chunk_keys,
+            existing_chunk_keys=existing_chunk_keys,
+            ignored_pairs_key=ignored_pairs_key,
+        )
         return {
             "Encoded": True,
         }
@@ -169,17 +230,39 @@
 def deduplicate_dataset(
     self: Task,
     config: dict[str, Any],
+    new_chunk_keys: list[str],
+    existing_chunk_keys: list[str],
+    ignored_pairs_key: str,
 ) -> dict[str, Any]:
     """Deduplicate the dataset."""
     ds = DeduplicationSet.objects.get(pk=config.get("deduplication_set_id"))
     try:
-        chunks = get_chunks(ds.get_encodings().keys(), purpose=ChunkPurpose.DEDUPE)
-        tasks = [dedupe_chunk.s(chunk0, chunk1, config) for chunk0, chunk1 in combinations_with_replacement(chunks, 2)]
-        chord_id = chord(tasks)(callback_findings.s(config=config))
+        # new vs new
+        tasks = [
+            dedupe_chunk.s(new_chunk_keys[i], new_chunk_keys[j], ignored_pairs_key, config)
+            for i in range(len(new_chunk_keys))
+            for j in range(i, len(new_chunk_keys))
+        ]
+
+        # new vs existing
+        tasks.extend(
+            dedupe_chunk.s(chunk_key1, chunk_key2, ignored_pairs_key, config)
+            for chunk_key1 in new_chunk_keys
+            for chunk_key2 in existing_chunk_keys
+        )
+
+        context = {
+            "keys_to_delete": new_chunk_keys + existing_chunk_keys + [ignored_pairs_key],
+        }
+        callback = callback_findings.s(
+            config=config,
+            context=context,
+        )
+        chord_id = chord(tasks)(callback)
         return {
             "deduplication_set": str(ds),
             "chord_id": str(chord_id),
-            "chunks": len(chunks),
+            "tasks": len(tasks),
         }
     except Exception as e:
         sentry_sdk.capture_exception(e)
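As a sanity check on the fan-out above (illustration only): with n new chunks and m existing chunks, deduplicate_dataset schedules n(n+1)/2 new-vs-new comparisons plus n*m new-vs-existing comparisons, which is the expected_task_count formula the tests later in this diff assert.

```python
# Worked example: 3 new chunks and 2 existing chunks.
n, m = 3, 2
new_vs_new = n * (n + 1) // 2  # 6 pairs: (0,0) (0,1) (0,2) (1,1) (1,2) (2,2)
new_vs_existing = n * m        # 6 pairs
assert new_vs_new + new_vs_existing == 12
```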
ValueError(f"Unsupported image storage backend: {backend}") diff --git a/src/hope_dedup_engine/apps/faces/services/facial.py b/src/hope_dedup_engine/apps/faces/services/facial.py index 2618f22b..f9d662d1 100644 --- a/src/hope_dedup_engine/apps/faces/services/facial.py +++ b/src/hope_dedup_engine/apps/faces/services/facial.py @@ -2,12 +2,14 @@ from collections import defaultdict from typing import Any +import numpy as np from azure.core.exceptions import ResourceNotFoundError from deepface import DeepFace +from numpy.linalg import norm from hope_dedup_engine.apps.api.models import Image -from hope_dedup_engine.apps.faces.managers import ImagesStorageManager -from hope_dedup_engine.apps.faces.utils import is_facial_error, report_long_execution +from hope_dedup_engine.apps.faces.managers.storage import get_storage_manager +from hope_dedup_engine.apps.faces.utils import coerce_status_code, is_facial_error from hope_dedup_engine.type_aliases import EncodingType, FindingType, IgnoredPairType logger = logging.getLogger(__name__) @@ -22,48 +24,46 @@ def encode_faces( options=None, pre_encodings=None, progress=None, -) -> tuple[EncodingType, int, int]: +) -> tuple[EncodingType, list[str], int, int]: if not callable(progress): progress = default_progress - with report_long_execution("ImagesStorageManager()"): - storage = ImagesStorageManager() + options = options or {} + + storage = get_storage_manager() encoded = {} if pre_encodings: - with report_long_execution("encoded.update(pre_encodings)"): - encoded.update(pre_encodings) + encoded.update(pre_encodings) added_cnt = existing_cnt = 0 - existing_cnt = 1000 + newly_encoded_files = [] for file in files: - with report_long_execution("progress()"): - progress() + progress() if file in encoded: existing_cnt += 1 continue + newly_encoded_files.append(file) try: - with report_long_execution("DeepFace.represent(storage.load_image(file), **(options or {}))"): - result = DeepFace.represent(storage.load_image(file), **(options or {})) + result = DeepFace.represent(storage.load_image(file), **options) if len(result) > 1: - encoded[file] = Image.StatusCode.MULTIPLE_FACES_DETECTED.value + encoded[file] = Image.StatusCode.MULTIPLE_FACES_DETECTED.name else: encoded[file] = result[0]["embedding"] added_cnt += 1 except TypeError as e: logger.exception(e) - encoded[file] = Image.StatusCode.GENERIC_ERROR.value + encoded[file] = Image.StatusCode.GENERIC_ERROR.name except ValueError: - encoded[file] = Image.StatusCode.NO_FACE_DETECTED.value + encoded[file] = Image.StatusCode.NO_FACE_DETECTED.name except ResourceNotFoundError: - encoded[file] = Image.StatusCode.NO_FILE_FOUND.value + encoded[file] = Image.StatusCode.NO_FILE_FOUND.name - return encoded, added_cnt, existing_cnt + return encoded, newly_encoded_files, added_cnt, existing_cnt def dedupe_images( # noqa 901 - files0: list[str], - files1: list[str], - encodings: EncodingType, + encodings1: EncodingType, + encodings2: EncodingType, ignored_pairs: IgnoredPairType, dedupe_threshold: float, options: dict[str, Any] | None = None, @@ -73,41 +73,57 @@ def dedupe_images( # noqa 901 progress = default_progress findings = defaultdict(list) - config = options or {} + all_encodings = {**encodings1, **encodings2} + encodings_np = {file: np.array(enc) for file, enc in all_encodings.items() if not is_facial_error(enc)} - for i, file1 in enumerate(files0): - progress() - enc1 = encodings[file1] - if is_facial_error(enc1): - findings[file1].append([enc1, None]) - continue + files1 = list(encodings1.keys()) + + def 
diff --git a/src/hope_dedup_engine/apps/faces/services/facial.py b/src/hope_dedup_engine/apps/faces/services/facial.py
index 2618f22b..f9d662d1 100644
--- a/src/hope_dedup_engine/apps/faces/services/facial.py
+++ b/src/hope_dedup_engine/apps/faces/services/facial.py
@@ -2,12 +2,14 @@
 from collections import defaultdict
 from typing import Any

+import numpy as np
 from azure.core.exceptions import ResourceNotFoundError
 from deepface import DeepFace
+from numpy.linalg import norm

 from hope_dedup_engine.apps.api.models import Image
-from hope_dedup_engine.apps.faces.managers import ImagesStorageManager
-from hope_dedup_engine.apps.faces.utils import is_facial_error, report_long_execution
+from hope_dedup_engine.apps.faces.managers.storage import get_storage_manager
+from hope_dedup_engine.apps.faces.utils import coerce_status_code, is_facial_error
 from hope_dedup_engine.type_aliases import EncodingType, FindingType, IgnoredPairType

 logger = logging.getLogger(__name__)
@@ -22,48 +24,46 @@ def encode_faces(
     files,
     options=None,
     pre_encodings=None,
     progress=None,
-) -> tuple[EncodingType, int, int]:
+) -> tuple[EncodingType, list[str], int, int]:
     if not callable(progress):
         progress = default_progress
-    with report_long_execution("ImagesStorageManager()"):
-        storage = ImagesStorageManager()
+    options = options or {}
+
+    storage = get_storage_manager()
     encoded = {}
     if pre_encodings:
-        with report_long_execution("encoded.update(pre_encodings)"):
-            encoded.update(pre_encodings)
+        encoded.update(pre_encodings)
     added_cnt = existing_cnt = 0
-    existing_cnt = 1000
+    newly_encoded_files = []
     for file in files:
-        with report_long_execution("progress()"):
-            progress()
+        progress()
         if file in encoded:
             existing_cnt += 1
             continue
+        newly_encoded_files.append(file)
         try:
-            with report_long_execution("DeepFace.represent(storage.load_image(file), **(options or {}))"):
-                result = DeepFace.represent(storage.load_image(file), **(options or {}))
+            result = DeepFace.represent(storage.load_image(file), **options)
             if len(result) > 1:
-                encoded[file] = Image.StatusCode.MULTIPLE_FACES_DETECTED.value
+                encoded[file] = Image.StatusCode.MULTIPLE_FACES_DETECTED.name
             else:
                 encoded[file] = result[0]["embedding"]
                 added_cnt += 1
         except TypeError as e:
             logger.exception(e)
-            encoded[file] = Image.StatusCode.GENERIC_ERROR.value
+            encoded[file] = Image.StatusCode.GENERIC_ERROR.name
         except ValueError:
-            encoded[file] = Image.StatusCode.NO_FACE_DETECTED.value
+            encoded[file] = Image.StatusCode.NO_FACE_DETECTED.name
         except ResourceNotFoundError:
-            encoded[file] = Image.StatusCode.NO_FILE_FOUND.value
+            encoded[file] = Image.StatusCode.NO_FILE_FOUND.name

-    return encoded, added_cnt, existing_cnt
+    return encoded, newly_encoded_files, added_cnt, existing_cnt

 def dedupe_images(  # noqa 901
-    files0: list[str],
-    files1: list[str],
-    encodings: EncodingType,
+    encodings1: EncodingType,
+    encodings2: EncodingType,
     ignored_pairs: IgnoredPairType,
     dedupe_threshold: float,
     options: dict[str, Any] | None = None,
@@ -73,41 +73,61 @@
     progress = default_progress

     findings = defaultdict(list)
-    config = options or {}
+    all_encodings = {**encodings1, **encodings2}
+    encodings_np = {file: np.array(enc) for file, enc in all_encodings.items() if not is_facial_error(enc)}

-    for i, file1 in enumerate(files0):
-        progress()
-        enc1 = encodings[file1]
-        if is_facial_error(enc1):
-            findings[file1].append([enc1, None])
-            continue
+    files1 = list(encodings1.keys())
+
+    def compare_and_find(file1: str, file2: str) -> None:
+        if (file1, file2) in ignored_pairs or (file2, file1) in ignored_pairs:
+            return
+
+        if file1 not in encodings_np or file2 not in encodings_np:
+            return
+
+        enc1_np = encodings_np[file1]
+        enc2_np = encodings_np[file2]

-        if files0 == files1:
-            files1_ = files1[i + 1 :]
-        else:
-            files1_ = files1
-
-        for file2 in files1_:
-            enc2 = encodings[file2]
-            if (
-                file2 in findings
-                or (file1, file2) in ignored_pairs
-                or (file2, file1) in ignored_pairs
-                or is_facial_error(enc2)
-                or any(file2 == dup[0] for dup in findings.get(file1, []))
-            ):
+        norm_product = norm(enc1_np) * norm(enc2_np)
+        if norm_product == 0:  # zero-norm embeddings cannot be compared
+            return
+
+        similarity = float(np.dot(enc1_np, enc2_np) / norm_product)
+
+        if similarity >= dedupe_threshold:
+            findings[file1].append([file2, similarity])
+
+    if encodings1 is encodings2:  # Intra-chunk comparison
+        for i, file1 in enumerate(files1):
+            progress()
+            enc1 = encodings1[file1]
+            if is_facial_error(enc1):
+                findings[file1].append([enc1, None])
                 continue
-            res = DeepFace.verify(enc1, enc2, **config)
-            similarity = float(1 - res["distance"])
-            if similarity >= dedupe_threshold:
-                findings[file1].append([file2, similarity])
+
+            for j in range(i + 1, len(files1)):
+                file2 = files1[j]
+                compare_and_find(file1, file2)
+    else:  # Inter-chunk comparison
+        files2 = list(encodings2.keys())
+        for file1 in files1:
+            progress()
+            enc1 = encodings1[file1]
+            if is_facial_error(enc1):
+                findings[file1].append([enc1, None])
+                continue
+
+            for file2 in files2:
+                compare_and_find(file1, file2)

     results: FindingType = []
     for img, duplicates in findings.items():
         for dup in duplicates:
             if is_facial_error(dup[0]):
-                results.append((img, "", 0, Image.StatusCode(dup[0]).value))
+                status = coerce_status_code(dup[0])
+                status_value = status.value if status is not None else Image.StatusCode.GENERIC_ERROR.value
+                results.append((img, "", 0, status_value))
             else:
                 results.append((img, dup[0], dup[1], Image.StatusCode.DEDUPLICATE_SUCCESS.value))
diff --git a/src/hope_dedup_engine/apps/faces/utils.py b/src/hope_dedup_engine/apps/faces/utils.py
index 6a99cf43..811b580a 100644
--- a/src/hope_dedup_engine/apps/faces/utils.py
+++ b/src/hope_dedup_engine/apps/faces/utils.py
@@ -1,29 +1,33 @@
-import time
-from contextlib import contextmanager
-from typing import Generator
+from typing import Any

-import sentry_sdk
-from django.conf import settings
 from hope_dedup_engine.apps.api.models import Image

-def is_facial_error(value):
-    if isinstance(value, int | str):
-        return value not in {
-            Image.StatusCode.DEDUPLICATE_SUCCESS,
-            Image.StatusCode.DEDUPLICATE_SUCCESS.name,
-            Image.StatusCode.DEDUPLICATE_SUCCESS.label,
-        } and value in (
-            Image.StatusCode.values + Image.StatusCode.names + [choice.label for choice in Image.StatusCode]
-        )
-    return False
-
-
-@contextmanager
-def report_long_execution(message: str, threshold_seconds: int | None = None) -> Generator:
-    if threshold_seconds is None:
-        threshold_seconds = settings.DEFAULT_THRESHOLD_SECONDS
-    start = time.time()
-    yield
-    if (total := time.time() - start) > threshold_seconds:
-        sentry_sdk.capture_message(f"Execution took {total} seconds: {message}")
+def coerce_status_code(value: Any) -> Image.StatusCode | None:
+    if value is None:
+        return None
+
+    if isinstance(value, Image.StatusCode):
+        return value
+
+    try:
+        return Image.StatusCode(value)
+    except (ValueError, TypeError):
+        pass
+
+    if isinstance(value, str):
+        try:
+            return Image.StatusCode[value]
+        except KeyError:
+            pass
+
+    for status in Image.StatusCode:
+        if value == status.label:
+            return status
+
+
return None + + +def is_facial_error(value: Any) -> bool: + status = coerce_status_code(value) + return status is not None and status != Image.StatusCode.DEDUPLICATE_SUCCESS diff --git a/src/hope_dedup_engine/config/__init__.py b/src/hope_dedup_engine/config/__init__.py index 7833b1cf..584c20d9 100644 --- a/src/hope_dedup_engine/config/__init__.py +++ b/src/hope_dedup_engine/config/__init__.py @@ -168,6 +168,20 @@ class Group(Enum): "storages.backends.azure_storage.AzureStorage", setting("storages"), ), + "IMAGE_STORAGE_BACKEND": ( + str, + "azure", + "local", + False, + "Storage backend for images ('azure' or 'local')", + ), + "LOCAL_IMAGE_DIR": ( + str, + "", + "", + False, + "Path to local image directory when using 'local' storage backend", + ), "LOG_LEVEL": (str, "CRITICAL", "DEBUG", False, setting("logging-level")), "MEDIA_ROOT": ( str, diff --git a/src/hope_dedup_engine/config/fragments/celery.py b/src/hope_dedup_engine/config/fragments/celery.py index fa853f24..628713e7 100644 --- a/src/hope_dedup_engine/config/fragments/celery.py +++ b/src/hope_dedup_engine/config/fragments/celery.py @@ -24,10 +24,8 @@ CELERY_CACHE_BACKEND = "django-cache" -CELERY_RESULT_BACKEND = "django-db" -CELERY_RESULT_PERSISTENT = True +CELERY_RESULT_BACKEND = CELERY_BROKER_URL CELERY_RESULT_EXPIRES = None -CELERY_RESULT_EXTENDED = True CELERY_RESULT_SERIALIZER = "json" CELERY_BROKER_CONNECTION_RETRY = False diff --git a/src/hope_dedup_engine/config/fragments/constance.py b/src/hope_dedup_engine/config/fragments/constance.py index 02a2ff97..f12cf4c8 100644 --- a/src/hope_dedup_engine/config/fragments/constance.py +++ b/src/hope_dedup_engine/config/fragments/constance.py @@ -4,7 +4,7 @@ CONSTANCE_CONFIG = { "FACE_RECOGNITION_MODEL": ( - "VGG-Face", + "ArcFace", "Specifies the face recognition model to be used for encoding face landmarks.", "face_recognition_models", ), @@ -14,7 +14,7 @@ "face_detector_backend", ), "FACE_DISTANCE_THRESHOLD": ( - 0.4, + 0.32, """ Specifies the maximum allowable distance between two face embeddings for them to be considered a match. 
This tolerance threshold is crucial for assessing whether two faces belong to the same individual, @@ -60,7 +60,7 @@ "face_recognition_models": [ "django.forms.ChoiceField", { - "choices": (("VGG-Face", "VGG-Face"),), + "choices": (("ArcFace", "ArcFace"),), }, ], "face_detector_backend": [ diff --git a/src/hope_dedup_engine/config/fragments/models.py b/src/hope_dedup_engine/config/fragments/models.py index 1c17b865..dcee5e6f 100644 --- a/src/hope_dedup_engine/config/fragments/models.py +++ b/src/hope_dedup_engine/config/fragments/models.py @@ -7,6 +7,6 @@ DEEPFACE_WEIGHTS_BASE_LOCATION: Final[Path] = DEEPFACE_HOME / ".deepface/weights" DEEPFACE_WEIGHTS: Final[dict[str, dict[str, str]]] = { - "vgg_face_weights.h5": "https://github.com/serengil/deepface_models/releases/download/v1.0/vgg_face_weights.h5", + "arcface_weights.h5": "https://github.com/serengil/deepface_models/releases/download/v1.0/arcface_weights.h5", "retinaface.h5": "https://github.com/serengil/deepface_models/releases/download/v1.0/retinaface.h5", } diff --git a/src/hope_dedup_engine/config/settings.py b/src/hope_dedup_engine/config/settings.py index e1de1c76..c91fd0b6 100644 --- a/src/hope_dedup_engine/config/settings.py +++ b/src/hope_dedup_engine/config/settings.py @@ -35,10 +35,10 @@ "adminfilters", "adminfilters.depot", "constance", + "django_extensions", "rest_framework", "django_filters", "django_celery_beat", - "django_celery_results", "drf_spectacular", "drf_spectacular_sidecar", "hope_dedup_engine.apps.api", @@ -88,8 +88,9 @@ # FILE_STORAGE_DNN=storages.backends.azure_storage.AzureStorage?azure_container=dnn&overwrite_files=True&connection_string=DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite:10000/devstoreaccount1; # noqa "dnn": env.storage("FILE_STORAGE_DNN"), } -DEFAULT_ROOT = env("DEFAULT_ROOT") -STORAGES["default"].get("OPTIONS", {}).update({"location": DEFAULT_ROOT}) + +IMAGE_STORAGE_BACKEND = env("IMAGE_STORAGE_BACKEND", default="azure") +LOCAL_IMAGE_DIR = env("LOCAL_IMAGE_DIR", default="") SECRET_KEY = env("SECRET_KEY") ALLOWED_HOSTS = env("ALLOWED_HOSTS") @@ -122,8 +123,12 @@ REDIS_URL = urlparse(CACHE_URL).hostname CACHES = { "default": { - "BACKEND": "django.core.cache.backends.redis.RedisCache", + "BACKEND": "django_redis.cache.RedisCache", "LOCATION": CACHE_URL, + "OPTIONS": { + "CLIENT_CLASS": "django_redis.client.DefaultClient", + "COMPRESSOR": "django_redis.compressors.zstd.ZStdCompressor", + }, } } X_FRAME_OPTIONS = "SAMEORIGIN" diff --git a/src/hope_dedup_engine/config/urls.py b/src/hope_dedup_engine/config/urls.py index ce3a648e..c5c069e4 100644 --- a/src/hope_dedup_engine/config/urls.py +++ b/src/hope_dedup_engine/config/urls.py @@ -10,8 +10,8 @@ path(r"accounts/", include("django.contrib.auth.urls")), path(r"adminactions/", include("adminactions.urls")), path(r"sentry_debug/", lambda _: 1 / 0), + path("api/", include("hope_dedup_engine.apps.api.urls")), path(r"", include("hope_dedup_engine.web.urls")), - path("", include("hope_dedup_engine.apps.api.urls")), ] if settings.DEBUG: diff --git a/tests/api/admin/test_deduplicationset.py b/tests/api/admin/test_deduplicationset.py new file mode 100644 index 00000000..bf3c0cb4 --- /dev/null +++ b/tests/api/admin/test_deduplicationset.py @@ -0,0 +1,62 @@ +from unittest.mock import patch + +import pytest +from django.contrib.messages import get_messages +from django.urls import reverse + +from 
hope_dedup_engine.apps.api.models import DedupJob, DeduplicationSet +from testutils.factories.api import DedupJobFactory, DeduplicationSetFactory + + +@pytest.mark.django_db +def test_terminate_job_with_active_job(client): + """Test that an active job is terminated.""" + ds = DeduplicationSetFactory() + DedupJobFactory(deduplication_set=ds, curr_async_result_id="some-id") + url = reverse("admin:api_deduplicationset_terminate_job", args=[ds.pk]) + + mock_terminate_path = "hope_dedup_engine.apps.api.models.jobs.DedupJob.terminate" + with patch(mock_terminate_path, return_value=DedupJob.CANCELED) as mock_terminate: + response = client.post(url, follow=True) + + mock_terminate.assert_called_once() + ds.refresh_from_db() + assert ds.state == DeduplicationSet.State.CANCELED + messages = list(get_messages(response.wsgi_request)) + assert len(messages) == 1 + assert "Job termination initiated. New job status: CANCELED." in str(messages[0]) + + +@pytest.mark.django_db +def test_terminate_job_with_no_job(client): + """Test terminating when no job is associated.""" + ds = DeduplicationSetFactory() + url = reverse("admin:api_deduplicationset_terminate_job", args=[ds.pk]) + + with patch("hope_dedup_engine.apps.api.models.jobs.DedupJob.terminate") as mock_terminate: + response = client.post(url, follow=True) + + mock_terminate.assert_not_called() + ds.refresh_from_db() + assert ds.state == DeduplicationSet.State.CANCELED + messages = list(get_messages(response.wsgi_request)) + assert len(messages) == 1 + assert "No active job found. Setting state to Canceled." in str(messages[0]) + + +@pytest.mark.django_db +def test_terminate_job_with_job_but_no_async_id(client): + """Test terminating when job exists but is not active (no async_result_id).""" + ds = DeduplicationSetFactory() + DedupJobFactory(deduplication_set=ds, curr_async_result_id=None) + url = reverse("admin:api_deduplicationset_terminate_job", args=[ds.pk]) + + with patch("hope_dedup_engine.apps.api.models.jobs.DedupJob.terminate") as mock_terminate: + response = client.post(url, follow=True) + + mock_terminate.assert_not_called() + ds.refresh_from_db() + assert ds.state == DeduplicationSet.State.CANCELED + messages = list(get_messages(response.wsgi_request)) + assert len(messages) == 1 + assert "No active job found. Setting state to Canceled." 
in str(messages[0]) diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 2c85b533..f3781096 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -1,55 +1,32 @@ -from typing import Any -from unittest.mock import MagicMock +from __future__ import annotations + +import os +import random +from pathlib import Path +from typing import Any, TYPE_CHECKING + +if TYPE_CHECKING: + from unittest.mock import MagicMock + from pytest_mock import MockerFixture + from rest_framework.test import APIClient + import pytest -from pytest_factoryboy import LazyFixture, register -from pytest_mock import MockerFixture -from rest_framework.test import APIClient - -from api.utils import create_api_client -from testutils.factories.api import ( - ConfigFactory, - DedupJobFactory, - DeduplicationSetFactory, - FindingFactory, - IgnoredFilenamePairFactory, - IgnoredReferencePkPairFactory, - ImageFactory, - HDETokenFactory, -) -from testutils.factories.user import SystemFactory, UserFactory - -from hope_dedup_engine.apps.api.models import HDEToken - -register(SystemFactory) -register(UserFactory) -register(DeduplicationSetFactory, system=LazyFixture("system")) -register(ImageFactory, deduplication_set=LazyFixture("deduplication_set")) -register( - ImageFactory, - _name="second_image", - deduplication_Set=LazyFixture("deduplication_set"), -) -register(FindingFactory, deduplication_set=LazyFixture("deduplication_set")) -register(IgnoredFilenamePairFactory, deduplication_set=LazyFixture("deduplication_set")) -register(IgnoredReferencePkPairFactory, deduplication_set=LazyFixture("deduplication_set")) -register(ConfigFactory) -register(DedupJobFactory, deduplication_set=LazyFixture("deduplication_set")) -register(HDETokenFactory, user=LazyFixture("user"), system=LazyFixture("system")) @pytest.fixture def anonymous_api_client() -> APIClient: - return APIClient() + from rest_framework.test import APIClient # noqa: PLC0415 - -@pytest.fixture -def api_client(hde_token: HDEToken) -> APIClient: - return create_api_client(hde_token) + return APIClient() @pytest.fixture def another_system_api_client(db: Any) -> APIClient: + from api.utils import create_api_client # noqa: PLC0415 + from testutils.factories.api import HDETokenFactory # noqa: PLC0415 + from testutils.factories.user import SystemFactory, UserFactory # noqa: PLC0415 + token = HDETokenFactory(user=UserFactory(), system=SystemFactory()) return create_api_client(token) @@ -62,3 +39,16 @@ def delete_model_data(mocker: MockerFixture) -> MagicMock: @pytest.fixture(autouse=True) def send_notification(mocker: MockerFixture) -> MagicMock: return mocker.patch("hope_dedup_engine.apps.api.deduplication.process.send_notification") + + +@pytest.fixture(scope="session") +def images_dir() -> Path: + """Returns the absolute path to the test images directory.""" + return Path(__file__).parent.parent / "utils" / "images" + + +@pytest.fixture +def random_image_filename(images_dir: Path) -> str: + """Returns a random filename from the test images directory.""" + files = [f for f in os.listdir(images_dir) if os.path.isfile(images_dir / f)] + return random.choice(files) diff --git a/tests/api/deduplication/test_process.py b/tests/api/deduplication/test_process.py index 8282b6e5..0070adcd 100644 --- a/tests/api/deduplication/test_process.py +++ b/tests/api/deduplication/test_process.py @@ -44,13 +44,17 @@ def test_find_duplicates_orchestration( assert job.progress == 0 assert dedup_set.finding_set.count() == 0 - assert mock_encode_chunk.s.call_count == 1 - 
mock_chord.assert_called_once_with([mock_encode_chunk.s.return_value]) + assert mock_encode_chunk.s.call_count == 2 + + mock_chord.assert_called_once() + tasks_arg = mock_chord.call_args.args[0] + assert len(tasks_arg) == 2 + mock_callback_encodings.s.assert_called_once() mock_chord.return_value.assert_called_once_with(mock_callback_encodings.s.return_value) -@patch("sentry_sdk.capture_exception") +@patch("hope_dedup_engine.apps.api.deduplication.process.sentry_sdk.capture_exception") @patch( "hope_dedup_engine.apps.api.deduplication.process.send_notification", side_effect=Exception("Test Error"), diff --git a/tests/api/test_deduplication_set_process.py b/tests/api/test_deduplication_set_process.py index b79a6251..99017187 100644 --- a/tests/api/test_deduplication_set_process.py +++ b/tests/api/test_deduplication_set_process.py @@ -22,6 +22,7 @@ def test_can_trigger_deduplication_set_processing( mock_dedup_job_queue: MagicMock, api_client: APIClient, deduplication_set: DeduplicationSet, + deduplication_set__state: str, ) -> None: response = api_client.post(reverse(DEDUPLICATION_SET_PROCESS_VIEW, (deduplication_set.pk,))) assert response.status_code == status.HTTP_200_OK diff --git a/tests/apps/faces/managers/test_storage.py b/tests/apps/faces/managers/test_storage.py new file mode 100644 index 00000000..e1c34e6f --- /dev/null +++ b/tests/apps/faces/managers/test_storage.py @@ -0,0 +1,84 @@ +from pathlib import Path + +import cv2 +import numpy as np +import pytest +from azure.core.exceptions import ResourceNotFoundError + +from hope_dedup_engine.apps.faces.managers.storage import ( + ImagesStorageManager, + LocalImagesStorageManager, + get_storage_manager, +) + + +@pytest.mark.parametrize( + ("backend", "local_dir", "dir_exists", "expected_result", "expected_exception"), + [ + ("local", "/tmp/images", True, LocalImagesStorageManager, None), + ("azure", None, False, ImagesStorageManager, None), + ("local", None, False, None, ValueError), + ("local", "/nonexistent_dir", False, None, FileNotFoundError), + ("invalid", None, False, None, ValueError), + ], +) +def test_get_storage_manager(settings, tmp_path, backend, local_dir, dir_exists, expected_result, expected_exception): + """Test the get_storage_manager factory function under various conditions.""" + settings.IMAGE_STORAGE_BACKEND = backend + if local_dir: + if dir_exists: + test_dir = tmp_path / "images" + test_dir.mkdir() + settings.LOCAL_IMAGE_DIR = str(test_dir) + else: + settings.LOCAL_IMAGE_DIR = local_dir + else: + settings.LOCAL_IMAGE_DIR = None + + if expected_exception: + with pytest.raises(expected_exception): + get_storage_manager() + else: + manager = get_storage_manager() + assert isinstance(manager, expected_result) + + +def test_local_images_storage_manager(tmp_path: Path): + """Test successfully loading an image with LocalImagesStorageManager.""" + image_dir = tmp_path / "images" + image_dir.mkdir() + image_path = image_dir / "test.jpg" + + dummy_image = np.zeros((100, 100, 3), dtype=np.uint8) + cv2.imwrite(str(image_path), dummy_image) + + manager = LocalImagesStorageManager() + manager.base_dir = image_dir + + loaded_image = manager.load_image("test.jpg") + assert isinstance(loaded_image, np.ndarray) + assert loaded_image.shape == (100, 100, 3) + + +def test_local_images_storage_manager_load_not_found(tmp_path: Path): + """Test that ResourceNotFoundError is raised for a missing file.""" + manager = LocalImagesStorageManager() + manager.base_dir = tmp_path + with pytest.raises(ResourceNotFoundError): + 
manager.load_image("nonexistent.jpg") + + +def test_images_storage_manager_load_image(mocker): + """Test that ImagesStorageManager correctly calls the azure storage backend.""" + mock_azure_storage_class = mocker.patch("hope_dedup_engine.apps.faces.managers.storage.AzureStorage") + mock_azure_storage_instance = mock_azure_storage_class.return_value + + mock_file = mocker.MagicMock() + mock_file.read.return_value = b"some_image_bytes" + mock_azure_storage_instance.open.return_value.__enter__.return_value = mock_file + mocker.patch("cv2.imdecode", return_value=np.zeros((10, 10, 3))) + + manager = ImagesStorageManager() + manager.load_image("test.jpg") + + mock_azure_storage_instance.open.assert_called_once_with("test.jpg", "rb") diff --git a/tests/apps/faces/services/test_facial.py b/tests/apps/faces/services/test_facial.py index 843439d3..c4252147 100644 --- a/tests/apps/faces/services/test_facial.py +++ b/tests/apps/faces/services/test_facial.py @@ -1,4 +1,3 @@ -import copy from unittest.mock import Mock import pytest @@ -25,8 +24,9 @@ def mock_deepface(mocker): @pytest.fixture def mock_storage(mocker): - """Fixture to mock the ImagesStorageManager.""" - storage_mock = mocker.patch("hope_dedup_engine.apps.faces.services.facial.ImagesStorageManager").return_value + """Fixture to mock the get_storage_manager function.""" + mock_get_storage = mocker.patch("hope_dedup_engine.apps.faces.services.facial.get_storage_manager") + storage_mock = mock_get_storage.return_value storage_mock.load_image.return_value = "image_data" return storage_mock @@ -49,10 +49,11 @@ def test_encode_faces_success(mock_deepface, mock_storage): files = ["file1.jpg", "file2.jpg"] mock_deepface.represent.side_effect = [[{"embedding": [1.0]}], [{"embedding": [2.0]}]] - encoded, added, existing = encode_faces(files) + encoded, newly_encoded, added, existing = encode_faces(files) assert added == 2 - assert existing == 1000 # Based on hardcoded value in function + assert existing == 0 + assert newly_encoded == files assert encoded == {"file1.jpg": [1.0], "file2.jpg": [2.0]} assert mock_deepface.represent.call_count == 2 @@ -64,10 +65,11 @@ def test_encode_faces_with_pre_encodings(mock_deepface, mock_storage): pre_encodings = {"file1.jpg": [1.0]} mock_deepface.represent.return_value = [{"embedding": [2.0]}] - encoded, added, existing = encode_faces(files, pre_encodings=pre_encodings) + encoded, newly_encoded, added, existing = encode_faces(files, pre_encodings=pre_encodings) assert added == 1 - assert existing == 1001 + assert existing == 1 + assert newly_encoded == ["file2.jpg"] assert encoded == {"file1.jpg": [1.0], "file2.jpg": [2.0]} mock_deepface.represent.assert_called_once_with("image_data") @@ -100,8 +102,8 @@ def test_encode_faces_deepface_outcomes(mock_deepface, mock_storage, represent_k files = ["file1.jpg"] mock_deepface.represent.configure_mock(**represent_kwargs) - encoded, _, _ = encode_faces(files) - assert encoded["file1.jpg"] == expected_status.value + encoded, _, _, _ = encode_faces(files) + assert encoded["file1.jpg"] == expected_status.name @pytest.mark.django_db @@ -110,88 +112,150 @@ def test_encode_faces_file_not_found(mock_deepface, mock_storage): files = ["file1.jpg"] mock_storage.load_image.side_effect = ResourceNotFoundError("File not found") - encoded, _, _ = encode_faces(files) - assert encoded["file1.jpg"] == Image.StatusCode.NO_FILE_FOUND.value + encoded, _, _, _ = encode_faces(files) + assert encoded["file1.jpg"] == Image.StatusCode.NO_FILE_FOUND.name mock_deepface.represent.assert_not_called() 
@pytest.mark.django_db -@pytest.mark.parametrize( - ("verify_return", "expected_result"), - [ - ({"distance": 0.05}, [("file1.jpg", "file2.jpg", 0.95, Image.StatusCode.DEDUPLICATE_SUCCESS.value)]), - ({"distance": 0.2}, []), - ], -) -def test_dedupe_images_similarity_threshold(mock_deepface, sample_data, verify_return, expected_result): +def test_dedupe_images_similarity_threshold(): """Test dedupe_images with different similarity scores.""" - mock_deepface.verify.return_value = verify_return - results = dedupe_images(**sample_data) - assert results == expected_result - mock_deepface.verify.assert_called_once_with( - sample_data["encodings"]["file1.jpg"], sample_data["encodings"]["file2.jpg"] - ) + encodings = { + "file1.jpg": [1.0, 0.0], # vector for file1 + "file2.jpg": [0.9, 0.1], # very similar to file1 + "file3.jpg": [0.0, 1.0], # very different from file1 + } + results = dedupe_images(encodings, encodings, ignored_pairs=set(), dedupe_threshold=0.9) + assert len(results) == 1 + assert results[0][:2] == ("file1.jpg", "file2.jpg") + assert results[0][2] == pytest.approx(0.994, abs=1e-3) + + results = dedupe_images(encodings, encodings, ignored_pairs=set(), dedupe_threshold=0.995) + assert results == [] @pytest.mark.django_db -def test_dedupe_images_with_ignored_pair(mock_deepface, sample_data): +def test_dedupe_images_with_ignored_pair(sample_data): """Test that ignored pairs are not compared.""" - test_data = copy.deepcopy(sample_data) - test_data["ignored_pairs"] = {("file1.jpg", "file2.jpg")} - results = dedupe_images(**test_data) + encodings = sample_data["encodings"] + ignored = {("file1.jpg", "file2.jpg")} + results = dedupe_images(encodings, encodings, ignored_pairs=ignored, dedupe_threshold=0.9) assert results == [] - mock_deepface.verify.assert_not_called() @pytest.mark.django_db -def test_dedupe_images_with_facial_error(mock_deepface, sample_data): +def test_dedupe_images_with_facial_error(sample_data): """Test that files with facial errors are reported correctly.""" - test_data = copy.deepcopy(sample_data) - test_data["encodings"]["file1.jpg"] = Image.StatusCode.NO_FACE_DETECTED.value - results = dedupe_images(**test_data) + encodings = sample_data["encodings"] + encodings["file1.jpg"] = Image.StatusCode.NO_FACE_DETECTED.name + results = dedupe_images(encodings, encodings, ignored_pairs=set(), dedupe_threshold=0.9) expected = [("file1.jpg", "", 0, Image.StatusCode.NO_FACE_DETECTED.value)] assert results == expected - mock_deepface.verify.assert_not_called() @pytest.mark.django_db -def test_dedupe_images_progress_callback(mock_deepface, sample_data): +def test_dedupe_images_progress_callback(sample_data): """Test that the progress callback is called for each file.""" - mock_deepface.verify.return_value = {"distance": 0.2} progress_mock = Mock() - sample_data["progress"] = progress_mock - - dedupe_images(**sample_data) - assert progress_mock.call_count == len(sample_data["files0"]) + encodings = sample_data["encodings"] + dedupe_images( + encodings, + encodings, + ignored_pairs=set(), + dedupe_threshold=0.9, + progress=progress_mock, + ) + assert progress_mock.call_count == len(encodings) @pytest.mark.django_db -def test_dedupe_images_complex_scenario(mock_deepface, complex_deduplication_data): +def test_dedupe_images_complex_scenario(): """Test dedupe_images with a mix of duplicates, non-duplicates, errors, and ignored pairs.""" - mock_deepface.verify.side_effect = [ - {"distance": 0.01}, # f1-f2 - {"distance": 0.5}, # f1-f3 - # f1-f4 skipped because of no face detected in f4 
- # f1-f5 in ignored pairs - {"distance": 0.5}, # f2-f3 - # f2-f4 skipped because of no face detected in f4 - {"distance": 0.5}, # f2-f5 - # f3-f4 skipped because of no face detected in f4 - {"distance": 0.5}, # f3-f5 - # f4-f5 skipped because of no face detected in f4 - ] - - results = dedupe_images(**complex_deduplication_data) - - expected_findings = [ - ("f4.jpg", "", 0, Image.StatusCode.NO_FACE_DETECTED.value), - ("f1.jpg", "f2.jpg", 0.99, Image.StatusCode.DEDUPLICATE_SUCCESS.value), - ] - - # The order of findings might not be guaranteed - assert len(results) == len(expected_findings) - # Convert to set of tuples for order-agnostic comparison - assert {tuple(item) for item in results} == {tuple(item) for item in expected_findings} - # check all all expected calls were made - with pytest.raises(StopIteration): - mock_deepface.verify() + encodings = { + "f1.jpg": [1.0, 0.0], # a + "f2.jpg": [0.9, 0.1], # similar to a + "f3.jpg": [0.0, 1.0], # b + "f4.jpg": Image.StatusCode.NO_FACE_DETECTED.name, # error + "f5.jpg": [0.8, 0.2], # also similar to a + "f6.jpg": [0.85, 0.15], # also similar to a, and to f5. and ignored with f5 + } + ignored = {("f5.jpg", "f6.jpg")} + dedupe_threshold = 0.9 + + results = dedupe_images(encodings, encodings, ignored, dedupe_threshold) + + # Expected findings: + # f4 -> error finding + # f1 -> f2 (sim ~0.99) + # f1 -> f5 (sim ~0.98) + # f1 -> f6 (sim ~0.99) + # f2 -> f5 (sim ~0.99) + # f2 -> f6 (sim ~1.0) + # f5 -> f6 (ignored) + expected_pairs = { + ("f1.jpg", "f2.jpg"), + ("f1.jpg", "f5.jpg"), + ("f1.jpg", "f6.jpg"), + ("f2.jpg", "f5.jpg"), + ("f2.jpg", "f6.jpg"), + } + error_finding_found = False + found_pairs = set() + + for finding in results: + if finding[0] == "f4.jpg": + assert finding[3] == Image.StatusCode.NO_FACE_DETECTED.value + error_finding_found = True + else: + assert finding[3] == Image.StatusCode.DEDUPLICATE_SUCCESS.value + found_pairs.add(tuple(sorted((finding[0], finding[1])))) + + assert error_finding_found + assert found_pairs == expected_pairs + + +@pytest.mark.django_db +def test_dedupe_images_no_progress_callback(sample_data): + """Test that dedupe_images runs without a progress callback.""" + encodings = sample_data["encodings"] + results = dedupe_images(encodings, encodings, ignored_pairs=set(), dedupe_threshold=0.9, progress=None) + assert len(results) > 0 + + +@pytest.mark.django_db +def test_dedupe_images_with_zero_norm_vector(): + """Test that encodings with a zero-norm vector are handled correctly.""" + encodings = { + "file1.jpg": [1.0, 0.0], + "file2.jpg": [0.0, 0.0], # Zero-norm vector + "file3.jpg": [0.9, 0.1], # Similar to file1 + } + results = dedupe_images(encodings, encodings, ignored_pairs=set(), dedupe_threshold=0.9) + found_files = {item for res in results for item in res[:2]} + assert "file2.jpg" not in found_files + + assert len(results) == 1 + assert results[0][:2] == ("file1.jpg", "file3.jpg") + + +@pytest.mark.django_db +def test_dedupe_images_inter_chunk(): + """Test inter-chunk comparison logic.""" + encodings1 = {"file1.jpg": [1.0, 0.0]} + encodings2 = {"file2.jpg": [0.9, 0.1], "file3.jpg": [0.0, 1.0]} + results = dedupe_images(encodings1, encodings2, ignored_pairs=set(), dedupe_threshold=0.9) + assert len(results) == 1 + assert results[0][:2] == ("file1.jpg", "file2.jpg") + + +@pytest.mark.django_db +def test_dedupe_images_inter_chunk_with_error(): + """Test inter-chunk comparison with a facial error in one chunk.""" + encodings1 = {"file1.jpg": [1.0, 0.0]} + encodings2 = { + "file2.jpg": 
Image.StatusCode.NO_FACE_DETECTED.name,
+        "file3.jpg": [0.9, 0.1],
+    }
+    results = dedupe_images(encodings1, encodings2, ignored_pairs=set(), dedupe_threshold=0.9)
+    assert len(results) == 1
+    assert results[0][:2] == ("file1.jpg", "file3.jpg")
diff --git a/tests/apps/faces/test_celery_tasks.py b/tests/apps/faces/test_celery_tasks.py
index fb664066..9390cc2f 100644
--- a/tests/apps/faces/test_celery_tasks.py
+++ b/tests/apps/faces/test_celery_tasks.py
@@ -1,5 +1,6 @@
+import json
 from unittest.mock import ANY, Mock, patch
-from enum import IntEnum
+
 import pytest
 from celery import states
 from celery.canvas import Signature
@@ -15,8 +16,8 @@
     get_chunks,
     shadow_name,
     sync_dnn_files,
-    ChunkPurpose,
 )
+from testutils.factories.api import ImageFactory

 @pytest.fixture
@@ -37,28 +38,22 @@ def mock_task():
     return task

-@pytest.mark.parametrize(
-    ("filenames", "purpose"),
-    [
-        (["f0", "f1", "f2", "f3", "f4"], ChunkPurpose.ENCODE),
-        (["a", "b", "c", "d", "e"], 2),
-    ],
-    ids=["enum", "numeric"],
-)
-def test_get_chunks(filenames, purpose):
-    size = int(purpose)
-    expected = [filenames[i : i + size] for i in range(0, len(filenames), size)]
+def test_get_chunks_balances(monkeypatch):
+    monkeypatch.setattr("hope_dedup_engine.apps.faces.celery_tasks.TARGET_CHUNKS", 3)
+    filenames = [f"f{i}" for i in range(10)]

-    out = get_chunks(filenames, purpose=purpose)
+    chunks = get_chunks(filenames)

-    assert out == expected
-    assert isinstance(out, list)
-    assert all(isinstance(c, list) for c in out)
+    assert len(chunks) == min(len(filenames), 3)
+    expected_chunk_size = (len(filenames) + len(chunks) - 1) // len(chunks)
+    assert all(len(chunk) == expected_chunk_size for chunk in chunks[:-1])
+    assert len(chunks[-1]) <= expected_chunk_size
+    flattened = [item for chunk in chunks for item in chunk]
+    assert set(flattened) == set(filenames)

-@pytest.mark.parametrize("purpose", list(ChunkPurpose))
-def test_get_chunks_empty_for_all_enum_values(purpose):
-    assert get_chunks([], purpose=purpose) == []
+def test_get_chunks_empty():
+    assert get_chunks([]) == []

 def test_shadow_name_success(mocker):
@@ -99,11 +95,11 @@ def test_encode_chunk_success(mock_notify, mock_encode_faces, mock_get_ds, dedup
     def encode_side_effect(*args, **kwargs):
         kwargs["progress"]()
-        return {"file1.jpg": [1.0]}, 1, 0
+        return {"file1.jpg": [1.0]}, ["file1.jpg"], 1, 0  # (encoded, newly_encoded_files, added, existing)

     mock_encode_faces.side_effect = encode_side_effect

-    encode_chunk(["file1.jpg"], {"deduplication_set_id": ds.pk, "encoding": {}})
+    encode_chunk.run(["file1.jpg"], {"deduplication_set_id": ds.pk, "encoding": {}})

     mock_encode_faces.assert_called_once()
     ds.update_encodings.assert_called_once_with({"file1.jpg": [1.0]})
@@ -117,22 +113,30 @@ def test_encode_chunk_error(mock_sentry, mock_encode_faces, dedup_set_with_job):
     """Test encode_chunk handles exceptions correctly."""
     ds = dedup_set_with_job
     with pytest.raises(Exception, match="mock error"):
-        encode_chunk(["file1.jpg"], {"deduplication_set_id": ds.pk})
+        encode_chunk.run(["file1.jpg"], {"deduplication_set_id": ds.pk})
     ds.refresh_from_db()
     assert ds.state == DeduplicationSet.State.FAILED
     mock_sentry.capture_exception.assert_called_once()

 @pytest.mark.django_db
+@patch("hope_dedup_engine.apps.faces.celery_tasks.get_redis_connection")
 @patch("hope_dedup_engine.apps.faces.celery_tasks.DeduplicationSet.objects.get")
 @patch("hope_dedup_engine.apps.faces.celery_tasks.dedupe_images")
 @patch("hope_dedup_engine.apps.faces.celery_tasks.notify_status")
-def test_dedupe_chunk_success(mock_notify,
mock_dedupe_images, mock_get_ds, dedup_set_with_job, mocker): +def test_dedupe_chunk_success(mock_notify, mock_dedupe_images, mock_get_ds, mock_get_redis, dedup_set_with_job): """Test dedupe_chunk successfully finds duplicates.""" ds = dedup_set_with_job mock_get_ds.return_value = ds - mocker.patch.object(ds, "get_encodings", return_value={}) - mocker.patch.object(ds, "get_ignored_pairs", return_value=set()) + + redis_data = { + "chunk:new:0": json.dumps({"file1.jpg": [1.0]}), + "chunk:new:1": json.dumps({"file2.jpg": [2.0]}), + "ignored:key": json.dumps([["ignored1", "ignored2"]]), + } + redis_conn = Mock() + redis_conn.get.side_effect = lambda key: redis_data[key] + mock_get_redis.return_value = redis_conn def dedupe_side_effect(*args, **kwargs): kwargs["progress"]() @@ -140,112 +144,205 @@ def dedupe_side_effect(*args, **kwargs): mock_dedupe_images.side_effect = dedupe_side_effect - result = dedupe_chunk(["file1.jpg"], [], {"deduplication_set_id": ds.pk}) + config = {"deduplication_set_id": ds.pk, "deduplicate": {"threshold": 0.5}} + result = dedupe_chunk.run("chunk:new:0", "chunk:new:1", "ignored:key", config) assert result == "findings" mock_notify.assert_called() + mock_dedupe_images.assert_called_once_with( + {"file1.jpg": [1.0]}, + {"file2.jpg": [2.0]}, + {("ignored1", "ignored2")}, + dedupe_threshold=0.5, + options={"threshold": 0.5}, + progress=mock_dedupe_images.call_args.kwargs["progress"], + ) @pytest.mark.django_db +@patch("hope_dedup_engine.apps.faces.celery_tasks.get_redis_connection") @patch("hope_dedup_engine.apps.faces.celery_tasks.dedupe_images", side_effect=Exception("mock error")) @patch("hope_dedup_engine.apps.faces.celery_tasks.sentry_sdk") -def test_dedupe_chunk_error(mock_sentry, mock_dedupe_images, dedup_set_with_job): +def test_dedupe_chunk_error(mock_sentry, mock_dedupe_images, mock_get_redis, dedup_set_with_job): """Test dedupe_chunk handles exceptions correctly.""" ds = dedup_set_with_job + + redis_data = { + "chunk:new:0": json.dumps({"file1.jpg": [1.0]}), + "chunk:new:1": json.dumps({"file2.jpg": [2.0]}), + "ignored:key": json.dumps([["ignored1", "ignored2"]]), + } + redis_conn = Mock() + redis_conn.get.side_effect = lambda key: redis_data[key] + mock_get_redis.return_value = redis_conn + with pytest.raises(Exception, match="mock error"): - dedupe_chunk(["file1.jpg"], [], {"deduplication_set_id": ds.pk}) + dedupe_chunk.run( + "chunk:new:0", + "chunk:new:1", + "ignored:key", + {"deduplication_set_id": ds.pk, "deduplicate": {}}, + ) ds.refresh_from_db() assert ds.state == DeduplicationSet.State.FAILED mock_sentry.capture_exception.assert_called_once() @pytest.mark.django_db +@patch("hope_dedup_engine.apps.faces.celery_tasks.get_redis_connection") @patch("hope_dedup_engine.apps.faces.celery_tasks.DeduplicationSet.objects.get") -def test_callback_findings_error_on_update(mock_get_ds, dedup_set_with_job, mocker): +def test_callback_findings_error_on_update(mock_get_ds, mock_get_redis, dedup_set_with_job, mocker): """Test callback_findings handles exceptions during update_findings.""" ds = dedup_set_with_job mock_get_ds.return_value = ds mocker.patch.object(ds, "update_findings", side_effect=Exception("DB Error")) + redis_conn = Mock() + mock_get_redis.return_value = redis_conn + + context = {"keys_to_delete": ["key1", "key2"]} + config = {"deduplication_set_id": ds.pk} + with pytest.raises(Exception, match="DB Error"): - callback_findings([], {"deduplication_set_id": ds.pk}) + callback_findings.run([], context=context, config=config) ds.refresh_from_db() assert 
ds.state == DeduplicationSet.State.FAILED + redis_conn.delete.assert_called_once_with(*context["keys_to_delete"]) @pytest.mark.django_db +@patch("hope_dedup_engine.apps.faces.celery_tasks.get_redis_connection") @patch("hope_dedup_engine.apps.faces.celery_tasks.DeduplicationSet.objects.get") @patch("hope_dedup_engine.apps.faces.celery_tasks.send_notification") -def test_callback_findings_success(mock_send_notification, mock_get_ds, dedup_set_with_job, mocker): +def test_callback_findings_success(mock_send_notification, mock_get_ds, mock_get_redis, dedup_set_with_job, mocker): """Test callback_findings aggregates results and updates the dataset.""" ds = dedup_set_with_job + ImageFactory.create_batch(3, deduplication_set=ds) mock_get_ds.return_value = ds - mocker.patch.object(ds.image_set, "all", return_value=[1, 2, 3]) mocker.patch.object(ds, "update_findings") + + redis_conn = Mock() + mock_get_redis.return_value = redis_conn + results = [ [("file1.jpg", "file2.jpg", 0.99, 1)], [("file2.jpg", "file1.jpg", 0.99, 1)], # Duplicate pair ] + context = {"keys_to_delete": ["key1", "key2"]} + config = {"deduplication_set_id": ds.pk, "deduplicate": {"threshold": 0.9}} - result = callback_findings(results, {"deduplication_set_id": ds.pk}) + result = callback_findings.run(results, context=context, config=config) ds.refresh_from_db() ds.update_findings.assert_called_once_with([("file1.jpg", "file2.jpg", 0.99, 1)]) assert ds.state == DeduplicationSet.State.READY - mock_send_notification.assert_called_once() + mock_send_notification.assert_called_once_with(ds.notification_url) + redis_conn.delete.assert_called_once_with(*context["keys_to_delete"]) assert result["Findings"] == 1 + assert result["Files"] == 3 + assert result["Config"] == config["deduplicate"] @pytest.mark.django_db +@patch("hope_dedup_engine.apps.faces.celery_tasks.get_redis_connection") @patch("hope_dedup_engine.apps.faces.celery_tasks.DeduplicationSet.objects.get") @patch("hope_dedup_engine.apps.faces.celery_tasks.deduplicate_dataset.delay") -def test_callback_encodings_success(mock_delay, mock_get_ds, dedup_set_with_job): +def test_callback_encodings_success(mock_delay, mock_get_ds, mock_get_redis, dedup_set_with_job, mocker): """Test callback_encodings triggers the next step in the deduplication process.""" - mock_get_ds.return_value = dedup_set_with_job - config = {"deduplication_set_id": dedup_set_with_job.pk} - result = callback_encodings([], config) - assert result == {"Encoded": True} - mock_delay.assert_called_once_with(config) + ds = dedup_set_with_job + mock_get_ds.return_value = ds + mocker.patch.object( + ds, + "get_encodings", + return_value={ + "new1.jpg": [1], + "new2.jpg": [2], + "existing1.jpg": [3], + }, + ) + mocker.patch.object(ds, "get_ignored_pairs", return_value={("existing1.jpg", "existing2.jpg")}) + + redis_conn = Mock() + mock_get_redis.return_value = redis_conn + + config = {"deduplication_set_id": ds.pk} + results = [["new1.jpg", "new2.jpg"]] + + response = callback_encodings.run(results, config=config) + + assert response == {"Encoded": True} + + assert redis_conn.set.call_count == 1 + len(results[0]) + 1 # ignored pairs + new chunks + existing chunk + called_kwargs = mock_delay.call_args.kwargs + assert called_kwargs["config"] == config + assert called_kwargs["ignored_pairs_key"] == f"hde:dedup:{ds.pk}:ignored_pairs" + assert len(called_kwargs["new_chunk_keys"]) == len(results[0]) + assert all(key.startswith(f"hde:dedup:{ds.pk}:chunk:new:") for key in called_kwargs["new_chunk_keys"]) + assert 
len(called_kwargs["existing_chunk_keys"]) == 1 + assert all(key.startswith(f"hde:dedup:{ds.pk}:chunk:existing:") for key in called_kwargs["existing_chunk_keys"]) @pytest.mark.django_db @patch("hope_dedup_engine.apps.faces.celery_tasks.DeduplicationSet.objects.get") @patch("hope_dedup_engine.apps.faces.celery_tasks.chord") -def test_deduplicate_dataset_success(mock_chord, mock_get_ds, dedup_set_with_job, mocker): +def test_deduplicate_dataset_success(mock_chord, mock_get_ds, dedup_set_with_job): """Test deduplicate_dataset creates a chord of deduplication tasks.""" ds = dedup_set_with_job mock_get_ds.return_value = ds - mocker.patch.object(ds, "get_encodings", return_value={"f1": [1], "f2": [2], "f3": [3]}) - result = deduplicate_dataset({"deduplication_set_id": ds.pk}) - assert result["chunks"] == 1 + chord_callable = Mock(return_value="sig-123") + mock_chord.return_value = chord_callable + + config = {"deduplication_set_id": ds.pk} + new_chunk_keys = ["new:0"] + existing_chunk_keys: list[str] = [] + + result = deduplicate_dataset.run( + config=config, + new_chunk_keys=new_chunk_keys, + existing_chunk_keys=existing_chunk_keys, + ignored_pairs_key="ignored", + ) + + assert result["tasks"] == 1 + assert result["deduplication_set"] == str(ds) + assert result["chord_id"] == "sig-123" mock_chord.assert_called_once() + header = mock_chord.call_args[0][0] + assert len(header) == 1 + chord_callable.assert_called_once() @pytest.mark.django_db @patch("hope_dedup_engine.apps.faces.celery_tasks.DeduplicationSet.objects.get") @patch("hope_dedup_engine.apps.faces.celery_tasks.chord") -def test_deduplicate_dataset_multiple_chunks(mock_chord, mock_get_ds, dedup_set_with_job, mocker, monkeypatch): - """Test deduplicate_dataset with enough encodings to create multiple chunks.""" - chunks = 3 - chunk_size = 2 - filenames_count = 2 * chunk_size + 1 - - class _P(IntEnum): - DEDUPE = chunk_size - - monkeypatch.setattr("hope_dedup_engine.apps.faces.celery_tasks.ChunkPurpose", _P) +def test_deduplicate_dataset_multiple_chunks(mock_chord, mock_get_ds, dedup_set_with_job): + """Test deduplicate_dataset with multiple new and existing chunks.""" ds = dedup_set_with_job mock_get_ds.return_value = ds - encodings = {f"f{i}": [i] for i in range(filenames_count)} - mocker.patch.object(ds, "get_encodings", return_value=encodings) - - result = deduplicate_dataset({"deduplication_set_id": ds.pk}) + chord_callable = Mock(return_value="sig-456") + mock_chord.return_value = chord_callable + + config = {"deduplication_set_id": ds.pk} + new_chunk_keys = ["new:0", "new:1", "new:2"] + existing_chunk_keys = ["existing:0", "existing:1"] + + result = deduplicate_dataset.run( + config=config, + new_chunk_keys=new_chunk_keys, + existing_chunk_keys=existing_chunk_keys, + ignored_pairs_key="ignored", + ) - assert result["chunks"] == chunks + expected_task_count = len(new_chunk_keys) * (len(new_chunk_keys) + 1) // 2 + len(new_chunk_keys) * len( + existing_chunk_keys + ) + assert result["tasks"] == expected_task_count + assert result["chord_id"] == "sig-456" mock_chord.assert_called_once() header = mock_chord.call_args[0][0] - assert len(header) == chunk_size * chunks + assert len(header) == expected_task_count + chord_callable.assert_called_once() @patch("hope_dedup_engine.apps.faces.celery_tasks.FileSyncManager") @@ -253,7 +350,7 @@ def test_sync_dnn_files_success(mock_fsm, settings): """Test sync_dnn_files successfully syncs all required files.""" settings.DNN_FILES = {"model": {"filename": "f1.dat", "sources": {"azure": "b1"}}} 
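+    # NOTE: calling task.run(...) below executes the task body in-process,
+    # without a broker, bypassing Celery's __call__/delay machinery.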
mock_fsm.return_value.downloader.sync.return_value = True - assert sync_dnn_files(force=True) is True + assert sync_dnn_files.run(force=True) is True mock_fsm.return_value.downloader.sync.assert_called_once_with("f1.dat", "b1", force=True) @@ -263,7 +360,7 @@ def test_sync_dnn_files_error(mock_fsm, mock_update_state, settings): """Test sync_dnn_files handles exceptions and updates task state.""" settings.DNN_FILES = {"model": {"filename": "f1.dat", "sources": {"azure": "b1"}}} with pytest.raises(Exception, match="FSM Error"): - sync_dnn_files() + sync_dnn_files.run() mock_update_state.assert_called_once_with( state=states.FAILURE, diff --git a/tests/apps/faces/test_utils.py b/tests/apps/faces/test_utils.py index 30554fc0..b7adfbdc 100644 --- a/tests/apps/faces/test_utils.py +++ b/tests/apps/faces/test_utils.py @@ -1,33 +1,7 @@ -from unittest.mock import patch - import pytest from hope_dedup_engine.apps.api.models import Image -from hope_dedup_engine.apps.faces.utils import is_facial_error, report_long_execution - - -@pytest.mark.parametrize( - ("time_side_effect", "should_be_called"), - [ - ([0, 1], False), - ([0, 6], True), - ], -) -@patch("hope_dedup_engine.apps.faces.utils.sentry_sdk") -def test_report_long_execution(mock_sentry, time_side_effect, should_be_called, settings): - """Test that `report_long_execution` only sends a message to Sentry for long executions.""" - settings.DEFAULT_THRESHOLD_SECONDS = 5 - with patch("time.time", side_effect=time_side_effect): - with report_long_execution("Test Task"): - pass - - if should_be_called: - mock_sentry.capture_message.assert_called_once() - message = mock_sentry.capture_message.call_args.args[0] - assert "Execution took" in message - assert "seconds: Test Task" in message - else: - mock_sentry.capture_message.assert_not_called() +from hope_dedup_engine.apps.faces.utils import is_facial_error @pytest.mark.parametrize( diff --git a/tests/conftest.py b/tests/conftest.py index 3dc18048..34dd6373 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,15 +1,40 @@ +from __future__ import annotations + import os import sys from pathlib import Path +from typing import TYPE_CHECKING import django from django.conf import settings +from django.contrib.auth import get_user_model from django.core.management import CommandError, call_command +from django.test import Client import pytest import responses from constance import config +if TYPE_CHECKING: + from rest_framework.test import APIClient + from hope_dedup_engine.apps.api.models import ( + DeduplicationSet, + HDEToken, + Image, + Finding, + IgnoredFilenamePair, + IgnoredReferencePkPair, + ) + from hope_dedup_engine.apps.security.models import System, User + from testutils.factories.api import ( + DedupJobFactory, + FindingFactory, + IgnoredFilenamePairFactory, + IgnoredReferencePkPairFactory, + ImageFactory, + ) + + here = Path(__file__).parent sys.path.insert(0, str(here / "../src")) sys.path.insert(0, str(here / "extras")) @@ -89,3 +114,119 @@ def complex_deduplication_data(): "ignored_pairs": {("f1.jpg", "f5.jpg")}, "dedupe_threshold": 0.9, } + + +@pytest.fixture +def admin_user(db): + user_model = get_user_model() + return user_model.objects.create_superuser(username="admin", password="admin", email="admin@example.com") + + +@pytest.fixture +def client(admin_user): + client = Client() + client.force_login(admin_user) + return client + + +@pytest.fixture +def system() -> "System": + from testutils.factories.user import SystemFactory # noqa: PLC0415 + + return SystemFactory() + + 
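+# NOTE: the factory-backed fixtures in this module import their factories
+# inside the fixture body (hence the PLC0415 suppressions), presumably so
+# that collecting conftest.py never imports Django models before settings
+# are configured; the TYPE_CHECKING block above serves type checkers only.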
+@pytest.fixture +def user() -> "User": + from testutils.factories.user import UserFactory # noqa: PLC0415 + + return UserFactory() + + +@pytest.fixture +def hdetoken(user: "User", system: "System") -> "HDEToken": + from testutils.factories.api import HDETokenFactory # noqa: PLC0415 + + return HDETokenFactory(user=user, system=system) + + +@pytest.fixture +def deduplication_set(system: "System") -> "DeduplicationSet": + from testutils.factories.api import DeduplicationSetFactory # noqa: PLC0415 + + return DeduplicationSetFactory(system=system) + + +@pytest.fixture +def api_client(hdetoken: "HDEToken") -> "APIClient": + from api.utils import create_api_client # noqa: PLC0415 + + return create_api_client(hdetoken) + + +@pytest.fixture +def dedup_job_factory() -> type[DedupJobFactory]: + """Provides the DedupJobFactory class to tests.""" + from testutils.factories.api import DedupJobFactory # noqa: PLC0415 + + return DedupJobFactory + + +@pytest.fixture +def image_factory() -> type[ImageFactory]: + """Provides the ImageFactory class to tests.""" + from testutils.factories.api import ImageFactory # noqa: PLC0415 + + return ImageFactory + + +@pytest.fixture +def finding_factory() -> type[FindingFactory]: + """Provides the FindingFactory class to tests.""" + from testutils.factories.api import FindingFactory # noqa: PLC0415 + + return FindingFactory + + +@pytest.fixture +def ignored_filename_pair_factory() -> type[IgnoredFilenamePairFactory]: + """Provides the IgnoredFilenamePairFactory class to tests.""" + from testutils.factories.api import IgnoredFilenamePairFactory # noqa: PLC0415 + + return IgnoredFilenamePairFactory + + +@pytest.fixture +def ignored_reference_pk_pair_factory() -> type[IgnoredReferencePkPairFactory]: + """Provides the IgnoredReferencePkPairFactory class to tests.""" + from testutils.factories.api import IgnoredReferencePkPairFactory # noqa: PLC0415 + + return IgnoredReferencePkPairFactory + + +@pytest.fixture +def image(image_factory: type[ImageFactory], deduplication_set: "DeduplicationSet") -> "Image": + """Provides an Image instance linked to a deduplication_set.""" + return image_factory(deduplication_set=deduplication_set) + + +@pytest.fixture +def finding(finding_factory: type[FindingFactory], deduplication_set: "DeduplicationSet") -> "Finding": + """Provides a Finding instance linked to a deduplication_set.""" + return finding_factory(deduplication_set=deduplication_set) + + +@pytest.fixture +def ignored_filename_pair( + ignored_filename_pair_factory: type[IgnoredFilenamePairFactory], deduplication_set: "DeduplicationSet" +) -> "IgnoredFilenamePair": + """Provides an IgnoredFilenamePair instance linked to a deduplication_set.""" + return ignored_filename_pair_factory(deduplication_set=deduplication_set) + + +@pytest.fixture +def ignored_reference_pk_pair( + ignored_reference_pk_pair_factory: type[IgnoredReferencePkPairFactory], deduplication_set: "DeduplicationSet" +) -> "IgnoredReferencePkPair": + """Provides an IgnoredReferencePkPair instance linked to a deduplication_set.""" + return ignored_reference_pk_pair_factory(deduplication_set=deduplication_set) diff --git a/tests/extras/testutils/factories/__init__.py b/tests/extras/testutils/factories/__init__.py index 5d6ec693..db5c037e 100644 --- a/tests/extras/testutils/factories/__init__.py +++ b/tests/extras/testutils/factories/__init__.py @@ -20,6 +20,7 @@ UserFactory, ) from .userrole import UserRole, UserRoleFactory # noqa +from .api import DedupJobFactory, DeduplicationSetFactory, HDETokenFactory # noqa for _, 
name, _ in pkgutil.iter_modules([str(Path(__file__).parent)]): importlib.import_module(f".{name}", __package__) diff --git a/tests/extras/testutils/factories/api.py b/tests/extras/testutils/factories/api.py index ec96245c..e51b73d6 100644 --- a/tests/extras/testutils/factories/api.py +++ b/tests/extras/testutils/factories/api.py @@ -2,6 +2,7 @@ from factory import Factory, SubFactory, Sequence, fuzzy, lazy_attribute, Trait from factory.django import DjangoModelFactory +from pytest_factoryboy import LazyFixture from testutils.factories import SystemFactory, UserFactory from hope_dedup_engine.apps.api.deduplication.config import ( @@ -50,7 +51,7 @@ class Meta: class ImageFactory(DjangoModelFactory): deduplication_set = SubFactory(DeduplicationSetFactory) - filename = fuzzy.FuzzyText() + filename = LazyFixture("random_image_filename") reference_pk = fuzzy.FuzzyText() class Meta: diff --git a/tests/faces/conftest.py b/tests/faces/conftest.py index 3533cdb5..ca2f4005 100644 --- a/tests/faces/conftest.py +++ b/tests/faces/conftest.py @@ -5,9 +5,7 @@ import numpy as np import pytest from PIL import Image -from django.contrib.auth import get_user_model from django.core.files.storage import FileSystemStorage -from django.test import Client from docker import from_env from freezegun import freeze_time from pytest_mock import MockerFixture @@ -174,16 +172,3 @@ def mock_file_sync_manager(): mock_downloader = MagicMock() mock_manager_instance.downloader = mock_downloader yield mock_manager_instance - - -@pytest.fixture -def admin_user(db): - user_model = get_user_model() - return user_model.objects.create_superuser(username="admin", password="admin", email="admin@example.com") - - -@pytest.fixture -def client(admin_user): - client = Client() - client.force_login(admin_user) - return client diff --git a/tests/utils/images/af1.jpg b/tests/utils/images/af1.jpg new file mode 100644 index 00000000..b09f18a2 Binary files /dev/null and b/tests/utils/images/af1.jpg differ diff --git a/tests/utils/images/af2.jpg b/tests/utils/images/af2.jpg new file mode 100644 index 00000000..b09f18a2 Binary files /dev/null and b/tests/utils/images/af2.jpg differ diff --git a/tests/utils/images/yv1.jpg b/tests/utils/images/yv1.jpg new file mode 100644 index 00000000..1ca62e84 Binary files /dev/null and b/tests/utils/images/yv1.jpg differ diff --git a/tests/utils/images/yv2.jpg b/tests/utils/images/yv2.jpg new file mode 100644 index 00000000..c074c327 Binary files /dev/null and b/tests/utils/images/yv2.jpg differ diff --git a/uv.lock b/uv.lock index 07e8bab6..3b92fc4c 100644 --- a/uv.lock +++ b/uv.lock @@ -682,6 +682,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/34/e1/988fa6ded7275bb11e373ccd4b708af477f12027d3ee86b7cb5fc5779412/django_picklefield-3.3-py3-none-any.whl", hash = "sha256:d6f6fd94a17177fe0d16b0b452a9860b8a1da97b6e70633ab53ade4975f1ce9a", size = 9565, upload-time = "2025-03-13T03:34:11.209Z" }, ] +[[package]] +name = "django-redis" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "django" }, + { name = "redis" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/08/53/dbcfa1e528e0d6c39947092625b2c89274b5d88f14d357cee53c4d6dbbd4/django_redis-6.0.0.tar.gz", hash = "sha256:2d9cb12a20424a4c4dde082c6122f486628bae2d9c2bee4c0126a4de7fda00dd", size = 56904, upload-time = "2025-06-17T18:15:46.376Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/7e/79/055dfcc508cfe9f439d9f453741188d633efa9eab90fc78a67b0ab50b137/django_redis-6.0.0-py3-none-any.whl", hash = "sha256:20bf0063a8abee567eb5f77f375143c32810c8700c0674ced34737f8de4e36c0", size = 33687, upload-time = "2025-06-17T18:15:34.165Z" }, +] + [[package]] name = "django-regex" version = "0.5.0" @@ -808,6 +821,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, ] +[[package]] +name = "dotenv" +version = "0.9.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dotenv" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/b7/545d2c10c1fc15e48653c91efde329a790f2eecfbbf2bd16003b5db2bab0/dotenv-0.9.9-py2.py3-none-any.whl", hash = "sha256:29cf74a087b31dafdb5a446b6d7e11cbce8ed2741540e2339c69fbef92c94ce9", size = 1892, upload-time = "2025-02-19T22:15:01.647Z" }, +] + [[package]] name = "drf-nested-routers" version = "0.94.2" @@ -1199,11 +1223,13 @@ dependencies = [ { name = "django-extensions" }, { name = "django-filter" }, { name = "django-flags" }, + { name = "django-redis" }, { name = "django-regex" }, { name = "django-smart-env" }, { name = "django-storages", extra = ["azure"] }, { name = "django-svelte-jsoneditor" }, { name = "djangorestframework" }, + { name = "dotenv" }, { name = "drf-nested-routers" }, { name = "drf-spectacular", extra = ["sidecar"] }, { name = "flower" }, @@ -1211,6 +1237,7 @@ dependencies = [ { name = "numpy" }, { name = "psycopg", extra = ["binary"] }, { name = "psycopg-pool" }, + { name = "pyzstd" }, { name = "requests" }, { name = "sentry-sdk", extra = ["celery", "django"] }, { name = "setuptools" }, @@ -1291,11 +1318,13 @@ requires-dist = [ { name = "django-extensions" }, { name = "django-filter" }, { name = "django-flags" }, + { name = "django-redis", specifier = ">=6.0.0" }, { name = "django-regex" }, { name = "django-smart-env" }, { name = "django-storages", extras = ["azure"] }, { name = "django-svelte-jsoneditor" }, { name = "djangorestframework" }, + { name = "dotenv", specifier = ">=0.9.9" }, { name = "drf-nested-routers" }, { name = "drf-spectacular", extras = ["sidecar"] }, { name = "flower" }, @@ -1309,6 +1338,7 @@ requires-dist = [ { name = "numpy", specifier = "<2" }, { name = "psycopg", extras = ["binary"] }, { name = "psycopg-pool" }, + { name = "pyzstd", specifier = ">=0.17.0" }, { name = "requests" }, { name = "sentry-sdk", extras = ["celery", "django"] }, { name = "setuptools" }, @@ -2555,6 +2585,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "python-dotenv" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/b0/4bc07ccd3572a2f9df7e6782f52b0c6c90dcbb803ac4a167702d7d0dfe1e/python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab", size = 41978, upload-time = "2025-06-24T04:21:07.341Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/5f/ed/539768cf28c661b5b068d66d96a2f155c4971a5d55684a514c1a0e0dec2f/python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc", size = 20556, upload-time = "2025-06-24T04:21:06.073Z" }, +] + [[package]] name = "python3-openid" version = "3.2.0" @@ -2615,6 +2654,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04", size = 4722, upload-time = "2025-05-13T15:23:59.629Z" }, ] +[[package]] +name = "pyzstd" +version = "0.17.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8f/a2/54d860ccbd07e3c67e4d0321d1c29fc7963ac82cf801a078debfc4ef7c15/pyzstd-0.17.0.tar.gz", hash = "sha256:d84271f8baa66c419204c1dd115a4dec8b266f8a2921da21b81764fa208c1db6", size = 1212160, upload-time = "2025-05-10T14:14:49.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/50/7fa47d0a13301b1ce20972aa0beb019c97f7ee8b0658d7ec66727b5967f9/pyzstd-0.17.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2ac330fc4f64f97a411b6f3fc179d2fe3050b86b79140e75a9a6dd9d6d82087f", size = 379056, upload-time = "2025-05-10T14:13:17.091Z" }, + { url = "https://files.pythonhosted.org/packages/9d/f2/67b03b1fa4e2a0b05e147cc30ac6d271d3d11017b47b30084cb4699451f4/pyzstd-0.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:725180c0c4eb2e643b7048ebfb45ddf43585b740535907f70ff6088f5eda5096", size = 298381, upload-time = "2025-05-10T14:13:18.812Z" }, + { url = "https://files.pythonhosted.org/packages/01/8b/807ff0a13cf3790fe5de85e18e10c22b96d92107d2ce88699cefd3f890cb/pyzstd-0.17.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c20fe0a60019685fa1f7137cb284f09e3f64680a503d9c0d50be4dd0a3dc5ec", size = 443770, upload-time = "2025-05-10T14:13:20.495Z" }, + { url = "https://files.pythonhosted.org/packages/f0/88/832d8d8147691ee37736a89ea39eaf94ceac5f24a6ce2be316ff5276a1f8/pyzstd-0.17.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d97f7aaadc3b6e2f8e51bfa6aa203ead9c579db36d66602382534afaf296d0db", size = 391167, upload-time = "2025-05-10T14:13:22.236Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a5/2e09bee398dfb0d94ca43f3655552a8770a6269881dc4710b8f29c7f71aa/pyzstd-0.17.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:42dcb34c5759b59721997036ff2d94210515d3ef47a9de84814f1c51a1e07e8a", size = 478960, upload-time = "2025-05-10T14:13:23.584Z" }, + { url = "https://files.pythonhosted.org/packages/da/b5/1f3b778ad1ccc395161fab7a3bf0dfbd85232234b6657c93213ed1ceda7e/pyzstd-0.17.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6bf05e18be6f6c003c7129e2878cffd76fcbebda4e7ebd7774e34ae140426cbf", size = 421891, upload-time = "2025-05-10T14:13:25.417Z" }, + { url = "https://files.pythonhosted.org/packages/83/c4/6bfb4725f4f38e9fe9735697060364fb36ee67546e7e8d78135044889619/pyzstd-0.17.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f7c3a5144aa4fbccf37c30411f6b1db4c0f2cb6ad4df470b37929bffe6ca0", size = 413608, upload-time = "2025-05-10T14:13:26.75Z" }, + { url = 
"https://files.pythonhosted.org/packages/95/a2/c48b543e3a482e758b648ea025b94efb1abe1f4859c5185ff02c29596035/pyzstd-0.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9efd4007f8369fd0890701a4fc77952a0a8c4cb3bd30f362a78a1adfb3c53c12", size = 416429, upload-time = "2025-05-10T14:13:28.096Z" }, + { url = "https://files.pythonhosted.org/packages/5c/62/2d039ee4dbc8116ca1f2a2729b88a1368f076f5dadad463f165993f7afa8/pyzstd-0.17.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5f8add139b5fd23b95daa844ca13118197f85bd35ce7507e92fcdce66286cc34", size = 446671, upload-time = "2025-05-10T14:13:29.772Z" }, + { url = "https://files.pythonhosted.org/packages/be/ec/9ec9f0957cf5b842c751103a2b75ecb0a73cf3d99fac57e0436aab6748e0/pyzstd-0.17.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:259a60e8ce9460367dcb4b34d8b66e44ca3d8c9c30d53ed59ae7037622b3bfc7", size = 520290, upload-time = "2025-05-10T14:13:31.585Z" }, + { url = "https://files.pythonhosted.org/packages/cc/42/2e2f4bb641c2a9ab693c31feebcffa1d7c24e946d8dde424bba371e4fcce/pyzstd-0.17.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:86011a93cc3455c5d2e35988feacffbf2fa106812a48e17eb32c2a52d25a95b3", size = 563785, upload-time = "2025-05-10T14:13:32.971Z" }, + { url = "https://files.pythonhosted.org/packages/4d/e4/25e198d382faa4d322f617d7a5ff82af4dc65749a10d90f1423af2d194f6/pyzstd-0.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:425c31bc3de80313054e600398e4f1bd229ee61327896d5d015e2cd0283c9012", size = 433390, upload-time = "2025-05-10T14:13:34.668Z" }, + { url = "https://files.pythonhosted.org/packages/ad/7c/1ab970f5404ace9d343a36a86f1bd0fcf2dc1adf1ef8886394cf0a58bd9e/pyzstd-0.17.0-cp312-cp312-win32.whl", hash = "sha256:7c4b88183bb36eb2cebbc0352e6e9fe8e2d594f15859ae1ef13b63ebc58be158", size = 220291, upload-time = "2025-05-10T14:13:36.005Z" }, + { url = "https://files.pythonhosted.org/packages/b2/52/d35bf3e4f0676a74359fccef015eabe3ceaba95da4ac2212f8be4dde16de/pyzstd-0.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:3c31947e0120468342d74e0fa936d43f7e1dad66a2262f939735715aa6c730e8", size = 246451, upload-time = "2025-05-10T14:13:37.712Z" }, + { url = "https://files.pythonhosted.org/packages/34/da/a44705fe44dd87e0f09861b062f93ebb114365640dbdd62cbe80da9b8306/pyzstd-0.17.0-cp312-cp312-win_arm64.whl", hash = "sha256:1d0346418abcef11507356a31bef5470520f6a5a786d4e2c69109408361b1020", size = 222967, upload-time = "2025-05-10T14:13:38.94Z" }, +] + [[package]] name = "redis" version = "5.2.1"