Commit 7348a76

update the code for deleting duplicates and migrating collections
1 parent f8c2329

2 files changed (+136, -76 lines)

2 files changed

+136
-76
lines changed
Lines changed: 100 additions & 26 deletions

@@ -1,35 +1,109 @@
-from django.db.models import Count
+import time
+
+from django.db.models import Count, Min
 
 from sde_collections.models.candidate_url import CandidateURL
 from sde_collections.models.collection import Collection
+from sde_collections.models.collection_choice_fields import WorkflowStatusChoices
+
+
+def is_priority_collection(collection):
+    priority_statuses = {
+        WorkflowStatusChoices.CURATED,
+        WorkflowStatusChoices.QUALITY_FIXED,
+        WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED,
+        WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED,
+        WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK,
+        WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK,
+        WorkflowStatusChoices.QUALITY_CHECK_FAILED,
+        WorkflowStatusChoices.QUALITY_CHECK_MINOR,
+        WorkflowStatusChoices.QUALITY_CHECK_PERFECT,
+        WorkflowStatusChoices.PROD_PERFECT,
+        WorkflowStatusChoices.PROD_MINOR,
+        WorkflowStatusChoices.PROD_MAJOR,
+    }
+    return collection.workflow_status in priority_statuses
+
 
+def deduplicate_candidate_urls():
+    start_time = time.time()
 
-def remove_duplicate_urls(collection_name):
-    """
-    Removes duplicate CandidateURL entries for a given collection name.
-
-    Args:
-    - collection_name: The name of the collection for which to remove duplicate URLs.
-    """
-    try:
-        collection = Collection.objects.get(name=collection_name)
-    except Collection.DoesNotExist:
-        print(f"Collection with name '{collection_name}' does not exist.")
-        return
-
-    duplicate_urls = (
-        CandidateURL.objects.filter(collection=collection)
-        .values("url")
-        .annotate(url_count=Count("id"))
-        .filter(url_count__gt=1)
+    # Keep the existing collection preprocessing
+    collection_counts = {
+        c["id"]: c["url_count"]
+        for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count")
+    }
+    collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()}
+
+    # Phase 1: Intra-collection duplicates (keep this part the same)
+    intra_dupes = (
+        CandidateURL.objects.values("collection_id", "url")
+        .annotate(count=Count("id"), min_id=Min("id"))
+        .filter(count__gt=1)
     )
 
-    for entry in duplicate_urls:
-        duplicate_entries = CandidateURL.objects.filter(collection=collection, url=entry["url"]).order_by("id")
+    intra_ids_to_delete = []
+    for dupe in intra_dupes:
+        dupe_ids = set(
+            CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"])
+            .exclude(id=dupe["min_id"])
+            .values_list("id", flat=True)
+        )
+        intra_ids_to_delete.extend(dupe_ids)
+
+    CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete()
+
+    # Phase 2: Modified Cross-collection duplicates
+    cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1)
+
+    cross_ids_to_delete = []
+    for dupe in cross_dupes:
+        # Get all instances of this URL with their relevant data
+        instances = list(CandidateURL.objects.filter(url=dupe["url"]).order_by("id").values("id", "collection_id"))
 
-        duplicates_to_delete = duplicate_entries.exclude(id=duplicate_entries.first().id)
-        count_deleted = duplicates_to_delete.count()
-        duplicates_to_delete.delete()
-        print(f"Deleted {count_deleted} duplicate entries for URL '{entry['url']}'.")
+        while len(instances) > 1:  # Process until we only have one instance left
+            # Create comparison data for each instance
+            instance_data = [
+                {
+                    "id": inst["id"],
+                    "collection_id": inst["collection_id"],
+                    "is_priority": collection_status[inst["collection_id"]],
+                    "url_count": collection_counts[inst["collection_id"]],
+                }
+                for inst in instances
+            ]
 
-    print("Completed deleting duplicated URLs...")
+            # Find the instance to keep based on the new rules
+            def get_instance_to_delete(instances_list):
+                # First, separate by priority
+                priority_instances = [i for i in instances_list if i["is_priority"]]
+                non_priority_instances = [i for i in instances_list if not i["is_priority"]]
+
+                # If we have both priority and non-priority, delete from non-priority
+                if priority_instances and non_priority_instances:
+                    return non_priority_instances[0]
+
+                # If all instances are of same priority type, compare url counts
+                working_list = priority_instances if priority_instances else non_priority_instances
+                min_count = min(i["url_count"] for i in working_list)
+                lowest_count_instances = [i for i in working_list if i["url_count"] == min_count]
+
+                # If multiple instances have the same count, take the one with lowest ID
+                return min(lowest_count_instances, key=lambda x: x["id"])
+
+            # Get the instance to delete
+            instance_to_delete = get_instance_to_delete(instance_data)
+
+            # Add it to our delete list and remove from instances
+            cross_ids_to_delete.append(instance_to_delete["id"])
+            instances = [inst for inst in instances if inst["id"] != instance_to_delete["id"]]
+
+    CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete()
+
+    elapsed_time = time.time() - start_time
+    action = "Deleted"
+    print(
+        f"{action} {len(intra_ids_to_delete)} intra-collection and {len(cross_ids_to_delete)} "
+        f"cross-collection duplicates (total: {len(intra_ids_to_delete) + len(cross_ids_to_delete)}) "
+        f"in {elapsed_time:.2f} seconds"
+    )
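The Phase 2 selection rule above can be exercised in isolation. Below is a minimal standalone sketch (plain Python, no Django required) using hypothetical instance data; get_instance_to_delete mirrors the helper in the diff, and the loop reproduces the repeated-deletion pass, so the survivor is the duplicate whose collection is a priority collection with the most URLs.

# Standalone sketch of the Phase 2 keep/delete rule; the instance dicts
# below are hypothetical illustration data, not taken from the commit.

def get_instance_to_delete(instances_list):
    # First, separate by priority
    priority_instances = [i for i in instances_list if i["is_priority"]]
    non_priority_instances = [i for i in instances_list if not i["is_priority"]]

    # If both kinds are present, delete from a non-priority collection first
    if priority_instances and non_priority_instances:
        return non_priority_instances[0]

    # Same priority class: delete from the collection with the fewest URLs
    working_list = priority_instances if priority_instances else non_priority_instances
    min_count = min(i["url_count"] for i in working_list)
    lowest_count_instances = [i for i in working_list if i["url_count"] == min_count]

    # Ties on count are broken by deleting the lowest ID
    return min(lowest_count_instances, key=lambda x: x["id"])

instances = [
    {"id": 1, "collection_id": 10, "is_priority": False, "url_count": 500},
    {"id": 2, "collection_id": 20, "is_priority": True, "url_count": 50},
    {"id": 3, "collection_id": 30, "is_priority": True, "url_count": 900},
]

while len(instances) > 1:
    victim = get_instance_to_delete(instances)
    print(f"delete id={victim['id']}")  # deletes id=1 (non-priority), then id=2 (fewer URLs)
    instances = [i for i in instances if i["id"] != victim["id"]]

print(f"keep id={instances[0]['id']}")  # keeps id=3: priority, largest collection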

sde_collections/management/commands/migrate_urls_and_patterns.py

Lines changed: 36 additions & 50 deletions

@@ -14,7 +14,7 @@
     DeltaIncludePattern,
     DeltaTitlePattern,
 )
-from sde_collections.models.delta_url import CuratedUrl, DeltaUrl
+from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
 from sde_collections.models.pattern import (
     DivisionPattern,
     DocumentTypePattern,
@@ -50,6 +50,7 @@ def handle(self, *args, **kwargs):
 
         # Step 1: Clear all Delta instances
         start_time = time.time()
+        DumpUrl.objects.all().delete()
         CuratedUrl.objects.all().delete()
         DeltaUrl.objects.all().delete()
         DeltaExcludePattern.objects.all().delete()
@@ -59,22 +60,26 @@ def handle(self, *args, **kwargs):
         DeltaDivisionPattern.objects.all().delete()
         self.stdout.write(f"Cleared all Delta instances in {time.time() - start_time:.2f} seconds.")
 
-        # Step 2: Get collections with Candidate URLs
+        # Step 2: Get collections ordered by URL count
         start_time = time.time()
-        all_collections_with_urls = Collection.objects.annotate(url_count=Count("candidate_urls")).filter(
-            url_count__gt=0
-        )
-        self.stdout.write(f"Collected collections with URLs in {time.time() - start_time:.2f} seconds.")
+        total_collections = Collection.objects.count()
+        collections = Collection.objects.annotate(url_count=Count("candidate_urls")).order_by("url_count")
+        self.stdout.write(f"Retrieved and ordered collections in {time.time() - start_time:.2f} seconds.")
 
-        # Step 3: Migrate all CandidateURLs to DeltaUrl
-        start_time = time.time()
         # Set to track URLs globally across all collections
         global_unique_urls = set()
 
-        for collection in all_collections_with_urls:
+        # Process each collection individually
+        for index, collection in enumerate(collections):
+            collection_start_time = time.time()
+            self.stdout.write(
+                f"\nProcessing collection: {collection} with {collection.url_count} URLs ({index + 1}/{total_collections})"  # noqa
+            )
+
+            # Step 3: Migrate CandidateURLs to DeltaUrl for this collection
+            urls_start_time = time.time()
             delta_urls = []
 
-            # Filter CandidateURL objects, ensuring each URL is globally unique
             for candidate_url in CandidateURL.objects.filter(collection=collection):
                 if candidate_url.url not in global_unique_urls:
                     global_unique_urls.add(candidate_url.url)
@@ -93,69 +98,50 @@ def handle(self, *args, **kwargs):
 
             # Bulk create the unique DeltaUrl instances for this collection
             DeltaUrl.objects.bulk_create(delta_urls)
+            self.stdout.write(
+                f"Migrated {len(delta_urls)} URLs to DeltaUrl in {time.time() - urls_start_time:.2f} seconds"
+            )
 
-        self.stdout.write(f"Migrated CandidateURLs to DeltaUrl in {time.time() - start_time:.2f} seconds.")
-
-        # Step 4: Migrate Patterns
-        start_time = time.time()
-
-        pattern_start_time = time.time()
-        self.migrate_patterns(ExcludePattern)
-        self.stdout.write(f"ExcludePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            # Step 4: Migrate Patterns for this collection
+            patterns_start_time = time.time()
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(IncludePattern)
-        self.stdout.write(f"IncludePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            for pattern_model in [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern]:
+                self.migrate_patterns_for_collection(pattern_model, collection)
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(TitlePattern)
-        self.stdout.write(f"TitlePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            self.stdout.write(f"Pattern migration completed in {time.time() - patterns_start_time:.2f} seconds")
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(DocumentTypePattern)
-        self.stdout.write(f"DocumentTypePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            # Step 5: Promote to CuratedUrl if applicable
+            if collection.workflow_status in STATUSES_TO_MIGRATE:
+                promote_start_time = time.time()
+                collection.promote_to_curated()
+                self.stdout.write(f"Promoted to CuratedUrl in {time.time() - promote_start_time:.2f} seconds")
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(DivisionPattern)
-        self.stdout.write(f"DivisionPattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
-
-        self.stdout.write(f"Total patterns migration completed in {time.time() - start_time:.2f} seconds.")
-
-        # Step 5: Promote DeltaUrls to CuratedUrl
-        start_time = time.time()
-        all_curated_collections_with_urls = all_collections_with_urls.filter(workflow_status__in=STATUSES_TO_MIGRATE)
-        self.stdout.write(
-            f"""Migrating URLs for {all_curated_collections_with_urls.count()} collections
-            with CURATED or higher status..."""
-        )
-        for collection in all_curated_collections_with_urls:
-            collection.promote_to_curated()
-        self.stdout.write(f"Promotion to CuratedUrl completed in {time.time() - start_time:.2f} seconds.")
+            self.stdout.write(
+                f"Total processing time for collection: {time.time() - collection_start_time:.2f} seconds\n"
+                f"--------------------"
+            )
 
         # Log the total time for the process
         self.stdout.write(f"Total migration process completed in {time.time() - overall_start_time:.2f} seconds.")
 
-    def migrate_patterns(self, non_delta_model):
-        """Migrate patterns from a non-delta model to the corresponding delta model."""
+    def migrate_patterns_for_collection(self, non_delta_model, collection):
+        """Migrate patterns from a non-delta model to the corresponding delta model for a specific collection."""
         # Determine the delta model name and fetch the model class
        delta_model_name = "Delta" + non_delta_model.__name__
        delta_model = apps.get_model(non_delta_model._meta.app_label, delta_model_name)
 
-        self.stdout.write(f"Migrating patterns from {non_delta_model.__name__} to {delta_model_name}...")
-
         # Get all field names from both models except 'id' (primary key)
         non_delta_fields = {field.name for field in non_delta_model._meta.fields if field.name != "id"}
         delta_fields = {field.name for field in delta_model._meta.fields if field.name != "id"}
 
         # Find shared fields
         shared_fields = non_delta_fields.intersection(delta_fields)
 
-        for pattern in non_delta_model.objects.all():
+        # Only process patterns for the current collection
+        for pattern in non_delta_model.objects.filter(collection=collection):
             # Build the dictionary of shared fields to copy
             delta_fields_data = {field: getattr(pattern, field) for field in shared_fields}
 
             # Create an instance of the delta model and save it to call the custom save() method
             delta_instance = delta_model(**delta_fields_data)
             delta_instance.save()  # Explicitly call save() to trigger custom logic
-
-        self.stdout.write(f"Migration completed for {non_delta_model.__name__} to {delta_model_name}.")

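Because this file lives under management/commands/, Django exposes it as a management command named after the module, so it can be run from the project root with: python manage.py migrate_urls_and_patterns. A minimal sketch of the programmatic equivalent, assuming Django settings are already configured (e.g. from a shell, a test, or another command):

# Programmatic invocation via Django's management API; call_command resolves
# the command by the file name under management/commands/.
from django.core.management import call_command

call_command("migrate_urls_and_patterns")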