Commit 7348a76

update the code for deleting duplicates and migrating collections
1 parent f8c2329

2 files changed (+136, -76 lines)

2 files changed

+136
-76
lines changed
Lines changed: 100 additions & 26 deletions

@@ -1,35 +1,109 @@
-from django.db.models import Count
+import time
+
+from django.db.models import Count, Min
 
 from sde_collections.models.candidate_url import CandidateURL
 from sde_collections.models.collection import Collection
+from sde_collections.models.collection_choice_fields import WorkflowStatusChoices
+
+
+def is_priority_collection(collection):
+    priority_statuses = {
+        WorkflowStatusChoices.CURATED,
+        WorkflowStatusChoices.QUALITY_FIXED,
+        WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED,
+        WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED,
+        WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK,
+        WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK,
+        WorkflowStatusChoices.QUALITY_CHECK_FAILED,
+        WorkflowStatusChoices.QUALITY_CHECK_MINOR,
+        WorkflowStatusChoices.QUALITY_CHECK_PERFECT,
+        WorkflowStatusChoices.PROD_PERFECT,
+        WorkflowStatusChoices.PROD_MINOR,
+        WorkflowStatusChoices.PROD_MAJOR,
+    }
+    return collection.workflow_status in priority_statuses
+
 
+def deduplicate_candidate_urls():
+    start_time = time.time()
 
-def remove_duplicate_urls(collection_name):
-    """
-    Removes duplicate CandidateURL entries for a given collection name.
-
-    Args:
-    - collection_name: The name of the collection for which to remove duplicate URLs.
-    """
-    try:
-        collection = Collection.objects.get(name=collection_name)
-    except Collection.DoesNotExist:
-        print(f"Collection with name '{collection_name}' does not exist.")
-        return
-
-    duplicate_urls = (
-        CandidateURL.objects.filter(collection=collection)
-        .values("url")
-        .annotate(url_count=Count("id"))
-        .filter(url_count__gt=1)
+    # Keep the existing collection preprocessing
+    collection_counts = {
+        c["id"]: c["url_count"]
+        for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count")
+    }
+    collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()}
+
+    # Phase 1: Intra-collection duplicates (keep this part the same)
+    intra_dupes = (
+        CandidateURL.objects.values("collection_id", "url")
+        .annotate(count=Count("id"), min_id=Min("id"))
+        .filter(count__gt=1)
     )
 
-    for entry in duplicate_urls:
-        duplicate_entries = CandidateURL.objects.filter(collection=collection, url=entry["url"]).order_by("id")
+    intra_ids_to_delete = []
+    for dupe in intra_dupes:
+        dupe_ids = set(
+            CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"])
+            .exclude(id=dupe["min_id"])
+            .values_list("id", flat=True)
+        )
+        intra_ids_to_delete.extend(dupe_ids)
+
+    CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete()
+
+    # Phase 2: Modified Cross-collection duplicates
+    cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1)
+
+    cross_ids_to_delete = []
+    for dupe in cross_dupes:
+        # Get all instances of this URL with their relevant data
+        instances = list(CandidateURL.objects.filter(url=dupe["url"]).order_by("id").values("id", "collection_id"))
 
-        duplicates_to_delete = duplicate_entries.exclude(id=duplicate_entries.first().id)
-        count_deleted = duplicates_to_delete.count()
-        duplicates_to_delete.delete()
-        print(f"Deleted {count_deleted} duplicate entries for URL '{entry['url']}'.")
+        while len(instances) > 1:  # Process until we only have one instance left
+            # Create comparison data for each instance
+            instance_data = [
+                {
+                    "id": inst["id"],
+                    "collection_id": inst["collection_id"],
+                    "is_priority": collection_status[inst["collection_id"]],
+                    "url_count": collection_counts[inst["collection_id"]],
+                }
+                for inst in instances
+            ]
 
-    print("Completed deleting duplicated URLs...")
+            # Find the instance to keep based on the new rules
+            def get_instance_to_delete(instances_list):
+                # First, separate by priority
+                priority_instances = [i for i in instances_list if i["is_priority"]]
+                non_priority_instances = [i for i in instances_list if not i["is_priority"]]
+
+                # If we have both priority and non-priority, delete from non-priority
+                if priority_instances and non_priority_instances:
+                    return non_priority_instances[0]
+
+                # If all instances are of same priority type, compare url counts
+                working_list = priority_instances if priority_instances else non_priority_instances
+                min_count = min(i["url_count"] for i in working_list)
+                lowest_count_instances = [i for i in working_list if i["url_count"] == min_count]
+
+                # If multiple instances have the same count, take the one with lowest ID
+                return min(lowest_count_instances, key=lambda x: x["id"])
+
+            # Get the instance to delete
+            instance_to_delete = get_instance_to_delete(instance_data)
+
+            # Add it to our delete list and remove from instances
+            cross_ids_to_delete.append(instance_to_delete["id"])
+            instances = [inst for inst in instances if inst["id"] != instance_to_delete["id"]]
+
+    CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete()
+
+    elapsed_time = time.time() - start_time
+    action = "Deleted"
+    print(
+        f"{action} {len(intra_ids_to_delete)} intra-collection and {len(cross_ids_to_delete)} "
+        f"cross-collection duplicates (total: {len(intra_ids_to_delete) + len(cross_ids_to_delete)}) "
+        f"in {elapsed_time:.2f} seconds"
+    )
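The Phase 2 selection rule above can be exercised in isolation. Below is a minimal standalone sketch (plain Python, no Django required) using hypothetical instance data; get_instance_to_delete mirrors the helper in the diff, and the loop reproduces the repeated-deletion pass, so the survivor is the duplicate whose collection is a priority collection with the most URLs.

# Standalone sketch of the Phase 2 keep/delete rule; the instance dicts
# below are hypothetical illustration data, not taken from the commit.

def get_instance_to_delete(instances_list):
    # First, separate by priority
    priority_instances = [i for i in instances_list if i["is_priority"]]
    non_priority_instances = [i for i in instances_list if not i["is_priority"]]

    # If both kinds are present, delete from a non-priority collection first
    if priority_instances and non_priority_instances:
        return non_priority_instances[0]

    # Same priority class: delete from the collection with the fewest URLs
    working_list = priority_instances if priority_instances else non_priority_instances
    min_count = min(i["url_count"] for i in working_list)
    lowest_count_instances = [i for i in working_list if i["url_count"] == min_count]

    # Ties on count are broken by deleting the lowest ID
    return min(lowest_count_instances, key=lambda x: x["id"])

instances = [
    {"id": 1, "collection_id": 10, "is_priority": False, "url_count": 500},
    {"id": 2, "collection_id": 20, "is_priority": True, "url_count": 50},
    {"id": 3, "collection_id": 30, "is_priority": True, "url_count": 900},
]

while len(instances) > 1:
    victim = get_instance_to_delete(instances)
    print(f"delete id={victim['id']}")  # deletes id=1 (non-priority), then id=2 (fewer URLs)
    instances = [i for i in instances if i["id"] != victim["id"]]

print(f"keep id={instances[0]['id']}")  # keeps id=3: priority, largest collection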

sde_collections/management/commands/migrate_urls_and_patterns.py

Lines changed: 36 additions & 50 deletions

@@ -14,7 +14,7 @@
     DeltaIncludePattern,
     DeltaTitlePattern,
 )
-from sde_collections.models.delta_url import CuratedUrl, DeltaUrl
+from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
 from sde_collections.models.pattern import (
     DivisionPattern,
     DocumentTypePattern,
@@ -50,6 +50,7 @@ def handle(self, *args, **kwargs):
 
         # Step 1: Clear all Delta instances
         start_time = time.time()
+        DumpUrl.objects.all().delete()
         CuratedUrl.objects.all().delete()
         DeltaUrl.objects.all().delete()
         DeltaExcludePattern.objects.all().delete()
@@ -59,22 +60,26 @@ def handle(self, *args, **kwargs):
         DeltaDivisionPattern.objects.all().delete()
         self.stdout.write(f"Cleared all Delta instances in {time.time() - start_time:.2f} seconds.")
 
-        # Step 2: Get collections with Candidate URLs
+        # Step 2: Get collections ordered by URL count
         start_time = time.time()
-        all_collections_with_urls = Collection.objects.annotate(url_count=Count("candidate_urls")).filter(
-            url_count__gt=0
-        )
-        self.stdout.write(f"Collected collections with URLs in {time.time() - start_time:.2f} seconds.")
+        total_collections = Collection.objects.count()
+        collections = Collection.objects.annotate(url_count=Count("candidate_urls")).order_by("url_count")
+        self.stdout.write(f"Retrieved and ordered collections in {time.time() - start_time:.2f} seconds.")
 
-        # Step 3: Migrate all CandidateURLs to DeltaUrl
-        start_time = time.time()
         # Set to track URLs globally across all collections
         global_unique_urls = set()
 
-        for collection in all_collections_with_urls:
+        # Process each collection individually
+        for index, collection in enumerate(collections):
+            collection_start_time = time.time()
+            self.stdout.write(
+                f"\nProcessing collection: {collection} with {collection.url_count} URLs ({index + 1}/{total_collections})"  # noqa
+            )
+
+            # Step 3: Migrate CandidateURLs to DeltaUrl for this collection
+            urls_start_time = time.time()
             delta_urls = []
 
-            # Filter CandidateURL objects, ensuring each URL is globally unique
             for candidate_url in CandidateURL.objects.filter(collection=collection):
                 if candidate_url.url not in global_unique_urls:
                     global_unique_urls.add(candidate_url.url)
@@ -93,69 +98,50 @@ def handle(self, *args, **kwargs):
 
             # Bulk create the unique DeltaUrl instances for this collection
             DeltaUrl.objects.bulk_create(delta_urls)
+            self.stdout.write(
+                f"Migrated {len(delta_urls)} URLs to DeltaUrl in {time.time() - urls_start_time:.2f} seconds"
+            )
 
-        self.stdout.write(f"Migrated CandidateURLs to DeltaUrl in {time.time() - start_time:.2f} seconds.")
-
-        # Step 4: Migrate Patterns
-        start_time = time.time()
-
-        pattern_start_time = time.time()
-        self.migrate_patterns(ExcludePattern)
-        self.stdout.write(f"ExcludePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            # Step 4: Migrate Patterns for this collection
+            patterns_start_time = time.time()
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(IncludePattern)
-        self.stdout.write(f"IncludePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            for pattern_model in [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern]:
+                self.migrate_patterns_for_collection(pattern_model, collection)
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(TitlePattern)
-        self.stdout.write(f"TitlePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            self.stdout.write(f"Pattern migration completed in {time.time() - patterns_start_time:.2f} seconds")
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(DocumentTypePattern)
-        self.stdout.write(f"DocumentTypePattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
+            # Step 5: Promote to CuratedUrl if applicable
+            if collection.workflow_status in STATUSES_TO_MIGRATE:
+                promote_start_time = time.time()
+                collection.promote_to_curated()
+                self.stdout.write(f"Promoted to CuratedUrl in {time.time() - promote_start_time:.2f} seconds")
 
-        pattern_start_time = time.time()
-        self.migrate_patterns(DivisionPattern)
-        self.stdout.write(f"DivisionPattern migration completed in {time.time() - pattern_start_time:.2f} seconds.")
-
-        self.stdout.write(f"Total patterns migration completed in {time.time() - start_time:.2f} seconds.")
-
-        # Step 5: Promote DeltaUrls to CuratedUrl
-        start_time = time.time()
-        all_curated_collections_with_urls = all_collections_with_urls.filter(workflow_status__in=STATUSES_TO_MIGRATE)
-        self.stdout.write(
-            f"""Migrating URLs for {all_curated_collections_with_urls.count()} collections
-            with CURATED or higher status..."""
-        )
-        for collection in all_curated_collections_with_urls:
-            collection.promote_to_curated()
-        self.stdout.write(f"Promotion to CuratedUrl completed in {time.time() - start_time:.2f} seconds.")
+            self.stdout.write(
+                f"Total processing time for collection: {time.time() - collection_start_time:.2f} seconds\n"
+                f"--------------------"
+            )
 
         # Log the total time for the process
         self.stdout.write(f"Total migration process completed in {time.time() - overall_start_time:.2f} seconds.")
 
-    def migrate_patterns(self, non_delta_model):
-        """Migrate patterns from a non-delta model to the corresponding delta model."""
+    def migrate_patterns_for_collection(self, non_delta_model, collection):
+        """Migrate patterns from a non-delta model to the corresponding delta model for a specific collection."""
         # Determine the delta model name and fetch the model class
        delta_model_name = "Delta" + non_delta_model.__name__
        delta_model = apps.get_model(non_delta_model._meta.app_label, delta_model_name)
 
-        self.stdout.write(f"Migrating patterns from {non_delta_model.__name__} to {delta_model_name}...")
-
         # Get all field names from both models except 'id' (primary key)
         non_delta_fields = {field.name for field in non_delta_model._meta.fields if field.name != "id"}
         delta_fields = {field.name for field in delta_model._meta.fields if field.name != "id"}
 
         # Find shared fields
         shared_fields = non_delta_fields.intersection(delta_fields)
 
-        for pattern in non_delta_model.objects.all():
+        # Only process patterns for the current collection
+        for pattern in non_delta_model.objects.filter(collection=collection):
             # Build the dictionary of shared fields to copy
             delta_fields_data = {field: getattr(pattern, field) for field in shared_fields}
 
             # Create an instance of the delta model and save it to call the custom save() method
             delta_instance = delta_model(**delta_fields_data)
             delta_instance.save()  # Explicitly call save() to trigger custom logic
-
-        self.stdout.write(f"Migration completed for {non_delta_model.__name__} to {delta_model_name}.")

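Because this file lives under management/commands/, Django exposes it as a management command named after the module, so it can be run from the project root with: python manage.py migrate_urls_and_patterns. A minimal sketch of the programmatic equivalent, assuming Django settings are already configured (e.g. from a shell, a test, or another command):

# Programmatic invocation via Django's management API; call_command resolves
# the command by the file name under management/commands/.
from django.core.management import call_command

call_command("migrate_urls_and_patterns")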