
Commit 85ff6e5

add management command to deduplicate urls
1 parent 808c1ed commit 85ff6e5
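
Django derives a management command's name from its module file name, and the new file's path is not shown in this extract. Assuming a hypothetical location such as sde_collections/management/commands/deduplicate_candidate_urls.py, the command would be run as:

    python manage.py deduplicate_candidate_urls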

1 file changed: 88 additions, 0 deletions

@@ -0,0 +1,88 @@
import time

from django.core.management.base import BaseCommand
from django.db.models import Count, Min

from sde_collections.models.candidate_url import CandidateURL
from sde_collections.models.collection import Collection
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices


class Command(BaseCommand):
    help = "Deduplicate CandidateURLs"

    def handle(self, *args, **kwargs):
        deduplicate_candidate_urls()


def is_priority_collection(collection):
    priority_statuses = {
        WorkflowStatusChoices.CURATED,
        WorkflowStatusChoices.QUALITY_FIXED,
        WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED,
        WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED,
        WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK,
        WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK,
        WorkflowStatusChoices.QUALITY_CHECK_FAILED,
        WorkflowStatusChoices.QUALITY_CHECK_MINOR,
        WorkflowStatusChoices.QUALITY_CHECK_PERFECT,
        WorkflowStatusChoices.PROD_PERFECT,
        WorkflowStatusChoices.PROD_MINOR,
        WorkflowStatusChoices.PROD_MAJOR,
    }
    return collection.workflow_status in priority_statuses


def deduplicate_candidate_urls():
    start_time = time.time()

    collection_counts = {
        c["id"]: c["url_count"]
        for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count")
    }

    collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()}

    # Phase 1: Intra-collection duplicates
    intra_dupes = (
        CandidateURL.objects.values("collection_id", "url")
        .annotate(count=Count("id"), min_id=Min("id"))
        .filter(count__gt=1)
    )

    intra_ids_to_delete = []
    for dupe in intra_dupes:
        dupe_ids = set(
            CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"])
            .exclude(id=dupe["min_id"])
            .values_list("id", flat=True)
        )
        intra_ids_to_delete.extend(dupe_ids)

    CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete()

    # Phase 2: Cross-collection duplicates
    cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1)

    cross_ids_to_delete = []
    for dupe in cross_dupes:
        instances = list(CandidateURL.objects.filter(url=dupe["url"]).values("id", "collection_id"))

        priority_instances = [i for i in instances if collection_status[i["collection_id"]]]
        non_priority_instances = [i for i in instances if not collection_status[i["collection_id"]]]

        if priority_instances:
            keep_instance = min(priority_instances, key=lambda x: collection_counts[x["collection_id"]])
        else:
            keep_instance = min(non_priority_instances, key=lambda x: collection_counts[x["collection_id"]])

        delete_ids = [i["id"] for i in instances if i["id"] != keep_instance["id"]]
        cross_ids_to_delete.extend(delete_ids)

    CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete()

    elapsed_time = time.time() - start_time
    action = "Deleted"
    print(
        f"{action} {len(intra_ids_to_delete)} intra-collection and {len(cross_ids_to_delete)} cross-collection duplicates (total: {len(intra_ids_to_delete) + len(cross_ids_to_delete)}) in {elapsed_time:.2f} seconds"  # noqa
    )
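
A minimal sketch of the Phase 2 keep rule, using hypothetical in-memory data in place of the ORM (the ids, counts, and statuses below are invented for illustration): copies in priority collections win, and ties are broken by keeping the copy whose collection has the fewest candidate URLs.

# Hypothetical stand-ins for the ORM lookups above.
collection_counts = {1: 500, 2: 120, 3: 40}       # collection id -> candidate URL count
collection_status = {1: True, 2: True, 3: False}  # collection id -> is_priority_collection()

# Three copies of the same URL across three collections.
instances = [
    {"id": 10, "collection_id": 1},
    {"id": 11, "collection_id": 2},
    {"id": 12, "collection_id": 3},
]

priority_instances = [i for i in instances if collection_status[i["collection_id"]]]
# Fall back to all (non-priority) instances when no priority copy exists.
pool = priority_instances or instances

# Keep the copy from the smallest collection in the chosen pool.
keep_instance = min(pool, key=lambda x: collection_counts[x["collection_id"]])
assert keep_instance["id"] == 11  # collection 2 is priority and smaller than collection 1

delete_ids = [i["id"] for i in instances if i["id"] != keep_instance["id"]]
assert delete_ids == [10, 12]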
