diff --git a/config/settings/test.py b/config/settings/test.py index d7eaa130..80374d2a 100644 --- a/config/settings/test.py +++ b/config/settings/test.py @@ -30,3 +30,7 @@ TEMPLATES[0]["OPTIONS"]["debug"] = True # type: ignore # noqa F405 # Your stuff... # ------------------------------------------------------------------------------ + + +CELERY_TASK_ALWAYS_EAGER = True # Executes tasks immediately instead of sending to the queue +CELERY_TASK_EAGER_PROPAGATES = True # Raises exceptions in the main thread for easier debugging diff --git a/sde_collections/migrations/0076_deltaresolvedtitle_status_and_more_squashed_0082_alter_deltaresolvedtitle_status.py b/sde_collections/migrations/0076_deltaresolvedtitle_status_and_more_squashed_0082_alter_deltaresolvedtitle_status.py new file mode 100644 index 00000000..d3e2dc3d --- /dev/null +++ b/sde_collections/migrations/0076_deltaresolvedtitle_status_and_more_squashed_0082_alter_deltaresolvedtitle_status.py @@ -0,0 +1,128 @@ +# Generated by Django 4.2.9 on 2025-02-24 22:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0075_alter_collection_reindexing_status_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="deltaresolvedtitle", + name="status", + field=models.CharField( + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("resolved", "Resolved"), + ("failed", "Failed"), + ], + default="pending", + max_length=20, + ), + ), + migrations.AddField( + model_name="deltaresolvedtitle", + name="updated_at", + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name="deltaresolvedtitleerror", + name="updated_at", + field=models.DateTimeField(auto_now=True), + ), + migrations.AddIndex( + model_name="deltaresolvedtitle", + index=models.Index(fields=["status", "created_at"], name="sde_collect_status_42dc80_idx"), + ), + migrations.AlterField( + model_name="deltaresolvedtitle", + name="status", + field=models.CharField( + blank=True, + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("resolved", "Resolved"), + ("failed", "Failed"), + ], + max_length=20, + null=True, + ), + ), + migrations.AlterField( + model_name="deltaresolvedtitle", + name="status", + field=models.CharField( + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("resolved", "Resolved"), + ("failed", "Failed"), + ], + default="", + max_length=20, + null=True, + ), + ), + migrations.AlterField( + model_name="deltaresolvedtitle", + name="status", + field=models.CharField( + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("resolved", "Resolved"), + ("failed", "Failed"), + ], + default="", + max_length=20, + ), + ), + migrations.AlterField( + model_name="deltaresolvedtitle", + name="status", + field=models.CharField( + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("resolved", "Resolved"), + ("failed", "Failed"), + ], + default="", + max_length=20, + null=True, + ), + ), + migrations.AlterField( + model_name="deltaresolvedtitle", + name="status", + field=models.CharField( + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("resolved", "Resolved"), + ("failed", "Failed"), + ], + default="", + max_length=20, + ), + ), + migrations.AlterField( + model_name="deltaresolvedtitle", + name="status", + field=models.CharField( + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("resolved", "Resolved"), + ("failed", "Failed"), + ], + max_length=20, + null=True, + ), + ), + ] diff --git a/sde_collections/models/delta_patterns.py b/sde_collections/models/delta_patterns.py index 61c8e9ea..8b116a53 100644 --- a/sde_collections/models/delta_patterns.py +++ b/sde_collections/models/delta_patterns.py @@ -3,7 +3,7 @@ from django.apps import apps from django.core.exceptions import ValidationError -from django.db import models +from django.db import models, transaction from ..utils.title_resolver import ( is_valid_xpath, @@ -476,81 +476,21 @@ def generate_title_for_url(self, url_obj) -> tuple[str, str | None]: def apply(self) -> None: """ - Apply the title pattern to matching URLs: + Queue title pattern resolution for matching URLs: 1. Find new Curated URLs that match but weren't previously affected 2. Create Delta URLs only where the generated title differs - 3. Update all matching Delta URLs with new titles + 3. Queue background tasks for title resolution 4. Track title resolution status and errors """ - DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") - DeltaResolvedTitle = apps.get_model("sde_collections", "DeltaResolvedTitle") - DeltaResolvedTitleError = apps.get_model("sde_collections", "DeltaResolvedTitleError") - - # Get newly matching Curated URLs - matching_curated_urls = self.get_matching_curated_urls() - previously_unaffected_curated = matching_curated_urls.exclude( - id__in=self.curated_urls.values_list("id", flat=True) - ) - - # Process each previously unaffected curated URL - for curated_url in previously_unaffected_curated: - if not self.is_most_distinctive_pattern(curated_url): - continue - - new_title, error = self.generate_title_for_url(curated_url) - - if error: - DeltaResolvedTitleError.objects.update_or_create( - delta_url=curated_url, defaults={"title_pattern": self, "error_string": error} # lookup field - ) - continue - - # Skip if the generated title matches existing or if Delta already exists - if ( - curated_url.generated_title == new_title - or DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists() - ): - continue - - # Create new Delta URL with the new title - fields = { - field.name: getattr(curated_url, field.name) - for field in curated_url._meta.fields - if field.name not in ["id", "collection"] - } - fields["generated_title"] = new_title - fields["to_delete"] = False - fields["collection"] = self.collection - - delta_url = DeltaUrl.objects.create(**fields) - # Record successful title resolution - DeltaResolvedTitle.objects.create(title_pattern=self, delta_url=delta_url, resolved_title=new_title) + # Inserting here to avoid circular import issue + from ..tasks import process_title_resolutions - # Update titles for all matching Delta URLs - for delta_url in self.get_matching_delta_urls(): - if not self.is_most_distinctive_pattern(delta_url): - continue + def queue_task(): + process_title_resolutions.delay(self.id) - new_title, error = self.generate_title_for_url(delta_url) - - if error: - DeltaResolvedTitleError.objects.update_or_create( - delta_url=delta_url, defaults={"title_pattern": self, "error_string": error} # lookup field - ) - continue - - # Update title and record resolution - key change here - DeltaResolvedTitle.objects.update_or_create( - delta_url=delta_url, # Only use delta_url for lookup - defaults={"title_pattern": self, "resolved_title": new_title}, - ) - - delta_url.generated_title = new_title - delta_url.save() - - # Update pattern relationships - self.update_affected_delta_urls_list() + # Queue the background task only after the transaction commits (i.e, after apply() method) + transaction.on_commit(queue_task) def unapply(self) -> None: """ @@ -670,24 +610,45 @@ class DeltaResolvedTitleBase(models.Model): title_pattern = models.ForeignKey(DeltaTitlePattern, on_delete=models.CASCADE) delta_url = models.OneToOneField("sde_collections.DeltaUrl", on_delete=models.CASCADE) created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) class Meta: abstract = True class DeltaResolvedTitle(DeltaResolvedTitleBase): + class Status(models.TextChoices): + PENDING = "pending", "Pending" + PROCESSING = "processing", "Processing" + RESOLVED = "resolved", "Resolved" + FAILED = "failed", "Failed" + resolved_title = models.CharField(blank=True, default="") + status = models.CharField(max_length=20, choices=Status.choices, null=True) class Meta: verbose_name = "Resolved Title" verbose_name_plural = "Resolved Titles" + indexes = [ + models.Index(fields=["status", "created_at"]), + ] def save(self, *args, **kwargs): - # Finds the linked delta URL and deletes DeltaResolvedTitleError objects linked to it - DeltaResolvedTitleError.objects.filter(delta_url=self.delta_url).delete() + if self.status == self.Status.RESOLVED: + # Finds the linked delta URL and deletes DeltaResolvedTitleError objects linked to it + DeltaResolvedTitleError.objects.filter(delta_url=self.delta_url).delete() super().save(*args, **kwargs) class DeltaResolvedTitleError(DeltaResolvedTitleBase): error_string = models.TextField(null=False, blank=False) http_status_code = models.IntegerField(null=True, blank=True) + + def save(self, *args, **kwargs): + # When saving an error, update the related DeltaResolvedTitle status + DeltaResolvedTitle.objects.update_or_create( + delta_url=self.delta_url, + title_pattern=self.title_pattern, + defaults={"status": DeltaResolvedTitle.Status.FAILED, "resolved_title": ""}, + ) + super().save(*args, **kwargs) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 86605124..67c583c7 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -1,4 +1,5 @@ import json +import logging import os import shutil @@ -7,7 +8,7 @@ from django.conf import settings from django.core import management from django.core.management.commands import loaddata -from django.db import transaction +from django.db import IntegrityError, transaction from config import celery_app from sde_collections.models.collection_choice_fields import ( @@ -19,6 +20,8 @@ from .sinequa_api import Api from .utils.github_helper import GitHubHandler +logger = logging.getLogger(__name__) + def _get_data_to_import(collection, server_name): # ignore these because they are API collections and don't have URLs @@ -148,11 +151,115 @@ def pull_latest_collection_metadata_from_github(): s3_client.upload_file(FILENAME, s3_bucket_name, s3_key) -@celery_app.task() -def resolve_title_pattern(title_pattern_id): - TitlePattern = apps.get_model("sde_collections", "TitlePattern") - title_pattern = TitlePattern.objects.get(id=title_pattern_id) - title_pattern.apply() +@celery_app.task(name="sde_collections.tasks.process_title_resolutions", soft_time_limit=10000) +def process_title_resolutions(pattern_id: int) -> None: + """Background task to process and resolve title patterns""" + + DeltaTitlePattern = apps.get_model("sde_collections", "DeltaTitlePattern") + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + DeltaResolvedTitle = apps.get_model("sde_collections", "DeltaResolvedTitle") + DeltaResolvedTitleError = apps.get_model("sde_collections", "DeltaResolvedTitleError") + + pattern = DeltaTitlePattern.objects.get(id=pattern_id) + + # Process curated URLs + matching_curated_urls = pattern.get_matching_curated_urls() + previously_unaffected_curated = matching_curated_urls.exclude( + id__in=pattern.curated_urls.values_list("id", flat=True) + ) + + for curated_url in previously_unaffected_curated: + if not pattern.is_most_distinctive_pattern(curated_url): + continue + + # Generate new title + new_title, error = pattern.generate_title_for_url(curated_url) + + if error: + DeltaResolvedTitleError.objects.update_or_create( + delta_url=curated_url, defaults={"title_pattern": pattern, "error_string": error} # lookup field + ) + logger.error(f"Title resolution FAILED for CuratedURL {curated_url.id}: {error}") + continue + + # Skip if the generated title matches existing or if Delta already exists + if ( + curated_url.generated_title == new_title + or DeltaUrl.objects.filter(url=curated_url.url, collection=pattern.collection).exists() + ): + continue + + # Create Delta URL with the new title + fields = { + field.name: getattr(curated_url, field.name) + for field in curated_url._meta.fields + if field.name not in ["id", "collection"] + } + fields["generated_title"] = new_title + fields["to_delete"] = False + fields["collection"] = pattern.collection + + delta_url = DeltaUrl.objects.create(**fields) + + DeltaResolvedTitle.objects.create( + title_pattern=pattern, + delta_url=delta_url, + resolved_title=new_title, + status=DeltaResolvedTitle.Status.RESOLVED, + ) + + # Process delta URLs + # Set PENDING status initially to all the matching URLs + for delta_url in pattern.get_matching_delta_urls(): + if not pattern.is_most_distinctive_pattern(delta_url): + continue + try: + resolution, created = DeltaResolvedTitle.objects.update_or_create( + delta_url=delta_url, # lookup field + defaults={"title_pattern": pattern, "status": DeltaResolvedTitle.Status.PENDING}, + ) + except IntegrityError as e: + logger.error(f"IntegrityError for delta_url {delta_url.id}: {str(e)}") + continue + + for delta_url in pattern.get_matching_delta_urls(): + if not pattern.is_most_distinctive_pattern(delta_url): + continue + + try: + resolution, created = DeltaResolvedTitle.objects.update_or_create( + delta_url=delta_url, # lookup field + defaults={"title_pattern": pattern, "status": DeltaResolvedTitle.Status.PROCESSING}, + ) + + # Generate new title + new_title, error = pattern.generate_title_for_url(delta_url) + + if error: + DeltaResolvedTitleError.objects.update_or_create( + delta_url=delta_url, defaults={"title_pattern": pattern, "error_string": error} # lookup field + ) + resolution.status = DeltaResolvedTitle.Status.FAILED + resolution.save() + logger.error(f"Title resolution FAILED for DeltaURL {delta_url.id}: {error}") + continue + + delta_url.generated_title = new_title + delta_url.save() + resolution.resolved_title = new_title + resolution.status = DeltaResolvedTitle.Status.RESOLVED + resolution.save() + + except Exception as e: + logger.error(f"Error processing delta URL {delta_url.id}: {str(e)}") + DeltaResolvedTitleError.objects.update_or_create( + delta_url=delta_url, defaults={"title_pattern": pattern, "error_string": str(e)} # lookup field + ) + resolution.status = DeltaResolvedTitle.Status.FAILED + resolution.save() + + # Update relationships + pattern.update_affected_delta_urls_list() @celery_app.task(soft_time_limit=600) diff --git a/sde_collections/tests/conftest.py b/sde_collections/tests/conftest.py new file mode 100644 index 00000000..c6ce35ab --- /dev/null +++ b/sde_collections/tests/conftest.py @@ -0,0 +1,7 @@ +import pytest + + +@pytest.fixture(autouse=True) +def _use_transactional_db(transactional_db): + """Enable transaction rollback for all tests""" + pass diff --git a/sde_collections/tests/test_title_pattern_unapply.py b/sde_collections/tests/test_title_pattern_unapply.py index db8ed7e5..1a63cfce 100644 --- a/sde_collections/tests/test_title_pattern_unapply.py +++ b/sde_collections/tests/test_title_pattern_unapply.py @@ -1,6 +1,6 @@ # docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_pattern_unapply.py -from django.test import TestCase +from django.test import TransactionTestCase from sde_collections.models.delta_patterns import ( DeltaResolvedTitle, @@ -12,7 +12,7 @@ from .factories import CollectionFactory, DumpUrlFactory -class TestTitlePatternUnapplyLogic(TestCase): +class TestTitlePatternUnapplyLogic(TransactionTestCase): """Test complete lifecycle of title pattern application and removal.""" def setUp(self): diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 9ee77759..33bfba87 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -67,4 +67,9 @@ name="candidate-url-api", ), path("titles-and-errors/", views.TitlesAndErrorsView.as_view(), name="titles-and-errors-list"), + path( + "api/title-patterns//status/", + views.TitlePatternStatusView.as_view(), + name="title-pattern-status", + ), ] diff --git a/sde_collections/views.py b/sde_collections/views.py index fb268170..7864b08f 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -4,6 +4,7 @@ from django.contrib.auth import get_user_model from django.contrib.auth.mixins import LoginRequiredMixin from django.db import models +from django.db.models import Count from django.shortcuts import get_object_or_404, redirect, render from django.urls import reverse from django.utils import timezone @@ -450,6 +451,34 @@ def get_queryset(self): return super().get_queryset().order_by("match_pattern") +class TitlePatternStatusView(APIView): + def get(self, request, pattern_id): + pattern = DeltaTitlePattern.objects.get(id=pattern_id) + + # Get counts for each status + if DeltaResolvedTitle.objects.filter(title_pattern=pattern).exists(): + status_counts = ( + DeltaResolvedTitle.objects.filter(title_pattern=pattern) + .values("status") + .annotate(count=Count("status")) + ) + + # Initialize counts + result = {"pending": 0, "processing": 0, "resolved": 0, "failed": 0, "total": 0} + + # Update counts + for item in status_counts: + status_type = item["status"].lower() + count = item["count"] + result[status_type] = count + result["total"] += count + + return Response(result) + + else: + return Response({"error": "DeltaResolvedTitleNotFound"}) + + class DocumentTypePatternViewSet(CollectionFilterMixin, viewsets.ModelViewSet): queryset = DeltaDocumentTypePattern.objects.all() serializer_class = DocumentTypePatternSerializer diff --git a/sde_indexing_helper/static/js/delta_url_list.js b/sde_indexing_helper/static/js/delta_url_list.js index 33e7850d..1dade57b 100644 --- a/sde_indexing_helper/static/js/delta_url_list.js +++ b/sde_indexing_helper/static/js/delta_url_list.js @@ -1724,6 +1724,7 @@ function postTitlePatterns( csrfmiddlewaretoken: csrftoken }, success: function (data) { + toastr.success("Background title resolution started for pattern: " + data.match_pattern); $('#delta_urls_table').DataTable().ajax.reload(null, false); $('#title_patterns_table').DataTable().ajax.reload(null, false); if (currentTab === "") { //Only add a notification if we are on the first tab @@ -1734,6 +1735,7 @@ function postTitlePatterns( `` ); } + pollTitleResolutionStatus(data.id, data.match_pattern); }, error: function (xhr, status, error) { var errorMessage = xhr.responseText; @@ -1749,6 +1751,63 @@ function postTitlePatterns( }); } +function pollTitleResolutionStatus(patternId, match_pattern) { + const POLL_INTERVAL = 2000; // 2 seconds + const MAX_POLLS = 150; // 5 minutes = (5 * 60 * 1000) / 2000 = 150 polls + let pollCount = 0; + + let pollInterval = setInterval(async () => { + try { + // Check if we've exceeded max polls + if (pollCount >= MAX_POLLS) { + clearInterval(pollInterval); + toastr.warning(`Title Resolution timed out after 5 minutes for pattern: ${match_pattern}`); + return; + } + + pollCount++; + const response = await fetch(`/api/title-patterns/${patternId}/status/`); + const data = await response.json(); + + if (data.error && data.error === "DeltaResolvedTitleNotFound") { + toastr.error("No URLs were affected by this pattern."); + clearInterval(pollInterval); + } + + // Check resolution status and show appropriate message + else if (data.total > 0) { + if(data.pending === 0 && data.processing === 0) { + clearInterval(pollInterval); + + // All successful + if (data.failed === 0) { + toastr.success( + `Title Resolution completed successfully for pattern: ${match_pattern}\n` + + `[ ${data.resolved} URLs processed ]` + ); + } + + // Some failed + else { + toastr.warning( + `Title Resolution completed with some issues for pattern: ${match_pattern}\n` + + `[ ${data.resolved} succeeded, ${data.failed} failed ]` + ); + } + + // Refresh tables + $('#delta_urls_table').DataTable().ajax.reload(null, false); + $('#title_patterns_table').DataTable().ajax.reload(null, false); + } } + + } catch (error) { + console.error('Error polling status:', error); + clearInterval(pollInterval); + } + }, POLL_INTERVAL); +} + + function postVisited(url) { $.ajax({ url: url,