+import time
+
+from django.core.management.base import BaseCommand
+from django.db.models import Count, Min
+
+from sde_collections.models.candidate_url import CandidateURL
+from sde_collections.models.collection import Collection
+from sde_collections.models.collection_choice_fields import WorkflowStatusChoices
+
+
+class Command(BaseCommand):
+    help = "Deduplicate CandidateURLs"
+
+    def handle(self, *args, **kwargs):
+        deduplicate_candidate_urls()
+
+
+def is_priority_collection(collection):
+    priority_statuses = {
+        WorkflowStatusChoices.CURATED,
+        WorkflowStatusChoices.QUALITY_FIXED,
+        WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED,
+        WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED,
+        WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK,
+        WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK,
+        WorkflowStatusChoices.QUALITY_CHECK_FAILED,
+        WorkflowStatusChoices.QUALITY_CHECK_MINOR,
+        WorkflowStatusChoices.QUALITY_CHECK_PERFECT,
+        WorkflowStatusChoices.PROD_PERFECT,
+        WorkflowStatusChoices.PROD_MINOR,
+        WorkflowStatusChoices.PROD_MAJOR,
+    }
+    return collection.workflow_status in priority_statuses
+
+
+def deduplicate_candidate_urls():
+    start_time = time.time()
+
+    # Precompute per-collection URL counts and priority status once, so the
+    # per-URL loops below never need extra database lookups for collection data.
+    collection_counts = {
+        c["id"]: c["url_count"]
+        for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count")
+    }
+
+    collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()}
+
+    # Phase 1: Intra-collection duplicates (keep the lowest-id copy of each URL within a collection)
+    intra_dupes = (
+        CandidateURL.objects.values("collection_id", "url")
+        .annotate(count=Count("id"), min_id=Min("id"))
+        .filter(count__gt=1)
+    )
+
+    intra_ids_to_delete = []
+    for dupe in intra_dupes:
+        dupe_ids = set(
+            CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"])
+            .exclude(id=dupe["min_id"])
+            .values_list("id", flat=True)
+        )
+        intra_ids_to_delete.extend(dupe_ids)
+
+    CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete()
+
+    # Phase 2: Cross-collection duplicates
+    cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1)
+
+    cross_ids_to_delete = []
+    for dupe in cross_dupes:
+        instances = list(CandidateURL.objects.filter(url=dupe["url"]).values("id", "collection_id"))
+
+        priority_instances = [i for i in instances if collection_status[i["collection_id"]]]
+        non_priority_instances = [i for i in instances if not collection_status[i["collection_id"]]]
+
+        # Keep the copy that lives in a priority collection when one exists;
+        # among candidates, prefer the collection with the fewest CandidateURLs.
+        if priority_instances:
+            keep_instance = min(priority_instances, key=lambda x: collection_counts[x["collection_id"]])
+        else:
+            keep_instance = min(non_priority_instances, key=lambda x: collection_counts[x["collection_id"]])
+
+        delete_ids = [i["id"] for i in instances if i["id"] != keep_instance["id"]]
+        cross_ids_to_delete.extend(delete_ids)
+
+    CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete()
+
+    elapsed_time = time.time() - start_time
+    action = "Deleted"
+    print(
+        f"{action} {len(intra_ids_to_delete)} intra-collection and {len(cross_ids_to_delete)} cross-collection duplicates (total: {len(intra_ids_to_delete) + len(cross_ids_to_delete)}) in {elapsed_time:.2f} seconds"  # noqa
+    )
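
For reference, a quick way to exercise and sanity-check the command (a sketch, not part of this PR; the file path and command name below are assumed from the module contents since the diff header is not shown): assuming the file is saved as sde_collections/management/commands/deduplicate_candidate_urls.py, it would be run with "python manage.py deduplicate_candidate_urls", and the outcome can be spot-checked from "python manage.py shell":

# Hypothetical post-run check, run in `python manage.py shell`: after deduplication,
# no URL should appear more than once anywhere in CandidateURL.
from django.db.models import Count

from sde_collections.models.candidate_url import CandidateURL

remaining_dupes = (
    CandidateURL.objects.values("url")
    .annotate(count=Count("id"))
    .filter(count__gt=1)
    .count()
)
print(f"URLs still duplicated: {remaining_dupes}")  # expected to be 0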