Skip to content

Commit e285697

Browse files
authored
Merge pull request #1109 from NASA-IMPACT/1105-improve-pattern-application-and-exclusion-management
1105 improve pattern application and exclusion management
2 parents 1ea5168 + 4c7834f commit e285697

27 files changed

+3219
-551
lines changed

sde_collections/admin.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55

66
from sde_collections.models.delta_patterns import (
77
DeltaDivisionPattern,
8+
DeltaResolvedTitle,
89
DeltaTitlePattern,
910
)
1011

1112
from .models.candidate_url import CandidateURL, ResolvedTitle
1213
from .models.collection import Collection, WorkflowHistory
13-
from .models.delta_url import CuratedUrl, DeltaResolvedTitle, DeltaUrl, DumpUrl
14+
from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
1415
from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
1516
from .tasks import fetch_and_replace_full_text, import_candidate_urls_from_api
1617

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# docker-compose -f local.yml run --rm django python manage.py deduplicate_patterns
2+
# docker-compose -f production.yml run --rm django python manage.py deduplicate_patterns
3+
4+
from collections import defaultdict
5+
6+
from django.core.management.base import BaseCommand
7+
from django.db.models import Count
8+
9+
from sde_collections.models.pattern import (
10+
DivisionPattern,
11+
DocumentTypePattern,
12+
ExcludePattern,
13+
IncludePattern,
14+
TitlePattern,
15+
)
16+
17+
18+
class Command(BaseCommand):
    """Management command that removes duplicate patterns within each collection.

    Two patterns are considered duplicates when they belong to the same model,
    the same collection and share the same ``match_pattern``.
    """

    help = "Remove duplicate patterns within collections for all pattern types"

    def handle(self, *args, **kwargs):
        """Delete all but one pattern from every duplicate group and print a summary.

        For each pattern model, every (collection, match_pattern) pair that
        occurs more than once is reduced to a single row: the one with the
        lowest ``id``. A per-model deletion count and a grand total are written
        to ``self.stdout``. Positional and keyword arguments are not used.
        """
        pattern_models = [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern]

        deletion_counts = defaultdict(int)

        for model in pattern_models:
            # Every (collection, match_pattern) pair that appears more than once.
            duplicate_groups = (
                model.objects.values("collection", "match_pattern")
                .annotate(pattern_count=Count("id"))
                .filter(pattern_count__gt=1)
            )

            for group in duplicate_groups:
                # Explicit ordering makes the slice below deterministic: without it
                # the database is free to return the rows in any order, so the
                # surviving pattern would be arbitrary.
                patterns = model.objects.filter(
                    collection_id=group["collection"],
                    match_pattern=group["match_pattern"],
                ).order_by("id")

                # Keep the first (lowest id) pattern; delete the rest one instance
                # at a time, as the original code did.
                for pattern in patterns[1:]:
                    pattern.delete()
                    deletion_counts[model.__name__] += 1

        # Print final summary (only models that had deletions appear).
        for model_name, count in deletion_counts.items():
            self.stdout.write(f"{model_name}: {count}")
        self.stdout.write(f"Total: {sum(deletion_counts.values())}")
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import time
2+
3+
from django.core.management.base import BaseCommand
4+
from django.db.models import Count, Min
5+
6+
from sde_collections.models.candidate_url import CandidateURL
7+
from sde_collections.models.collection import Collection
8+
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices
9+
10+
11+
class Command(BaseCommand):
    """Management command wrapper around ``deduplicate_candidate_urls``."""

    help = "Deduplicate CandidateURLs"

    def handle(self, *args, **kwargs):
        """Run the deduplication routine; command arguments are not used."""
        deduplicate_candidate_urls()
16+
17+
18+
def is_priority_collection(collection):
    """Return True when ``collection.workflow_status`` is one of the priority statuses.

    The priority statuses are the ``WorkflowStatusChoices`` members listed
    in the membership test below.
    """
    status = WorkflowStatusChoices
    return collection.workflow_status in {
        status.CURATED,
        status.QUALITY_FIXED,
        status.SECRET_DEPLOYMENT_STARTED,
        status.SECRET_DEPLOYMENT_FAILED,
        status.READY_FOR_LRM_QUALITY_CHECK,
        status.READY_FOR_FINAL_QUALITY_CHECK,
        status.QUALITY_CHECK_FAILED,
        status.QUALITY_CHECK_MINOR,
        status.QUALITY_CHECK_PERFECT,
        status.PROD_PERFECT,
        status.PROD_MINOR,
        status.PROD_MAJOR,
    }
34+
35+
36+
def deduplicate_candidate_urls():
    """Delete duplicate CandidateURL rows, first within and then across collections.

    Phase 1 (intra-collection): for every (collection, url) pair that occurs
    more than once, the row with the lowest ``id`` is kept and the others are
    deleted.

    Phase 2 (cross-collection): for every url that still occurs in more than
    one row, exactly one row is kept. Rows whose collection is a priority
    collection (see ``is_priority_collection``) are preferred; among the
    preferred rows the one whose collection has the fewest candidate URLs
    wins, and ties are broken by the lowest row ``id``.

    A one-line summary with the deletion counts and elapsed time is printed.
    Returns None.
    """
    start_time = time.time()

    # Candidate-URL count per collection, captured before anything is deleted.
    collection_counts = {
        c["id"]: c["url_count"]
        for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count")
    }

    # Whether each collection is in a priority workflow status.
    collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()}

    # Phase 1: Intra-collection duplicates
    intra_dupes = (
        CandidateURL.objects.values("collection_id", "url")
        .annotate(count=Count("id"), min_id=Min("id"))
        .filter(count__gt=1)
    )

    intra_ids_to_delete = []
    for dupe in intra_dupes:
        # Everything in the group except the lowest-id row is a duplicate.
        intra_ids_to_delete.extend(
            CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"])
            .exclude(id=dupe["min_id"])
            .values_list("id", flat=True)
        )

    CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete()

    # Phase 2: Cross-collection duplicates
    # This queryset is lazy, so it is evaluated after the phase 1 deletion.
    cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1)

    cross_ids_to_delete = []
    for dupe in cross_dupes:
        # Ordering by id makes the min() tie-break below deterministic: min()
        # returns the first minimal element, so without an ordering the kept
        # row would depend on the order the database happened to return.
        instances = list(
            CandidateURL.objects.filter(url=dupe["url"]).order_by("id").values("id", "collection_id")
        )

        priority_instances = [i for i in instances if collection_status[i["collection_id"]]]

        # When no instance is in a priority collection, every instance is
        # non-priority, so the full list is the fallback pool.
        candidates = priority_instances or instances
        keep_instance = min(candidates, key=lambda x: collection_counts[x["collection_id"]])

        cross_ids_to_delete.extend(i["id"] for i in instances if i["id"] != keep_instance["id"])

    CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete()

    elapsed_time = time.time() - start_time
    intra_total = len(intra_ids_to_delete)
    cross_total = len(cross_ids_to_delete)
    print(
        f"Deleted {intra_total} intra-collection and {cross_total} cross-collection duplicates "
        f"(total: {intra_total + cross_total}) in {elapsed_time:.2f} seconds"
    )

sde_collections/management/commands/migrate_urls_and_patterns.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def handle(self, *args, **kwargs):
8787
visited=candidate_url.visited,
8888
document_type=candidate_url.document_type,
8989
division=candidate_url.division,
90-
delete=False,
90+
to_delete=False,
9191
)
9292
)
9393

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Generated by Django 4.2.9 on 2024-11-23 17:44
2+
3+
from django.db import migrations, models
4+
import sde_collections.models.delta_patterns
5+
6+
7+
# Lowercased names of the delta pattern models this migration touches, in the
# order their operations are emitted.
_DELTA_PATTERN_MODELS = (
    "deltadivisionpattern",
    "deltadocumenttypepattern",
    "deltaexcludepattern",
    "deltaincludepattern",
    "deltatitlepattern",
)


def _match_pattern_field():
    """Build a fresh ``match_pattern`` CharField for one AlterField operation."""
    return models.CharField(
        help_text="This pattern is compared against the URL of all documents in the collection",
        verbose_name="Pattern",
    )


class Migration(migrations.Migration):
    """Clear ``unique_together`` on the delta pattern models and redefine their pattern fields.

    Operations, in order: one AlterUniqueTogether (to the empty set) per model,
    one AlterField on ``match_pattern`` per model, and a final AlterField on
    ``deltatitlepattern.title_pattern``.
    """

    dependencies = [
        ("sde_collections", "0065_rename_delete_deltaurl_to_delete_and_more"),
    ]

    operations = [
        *[
            migrations.AlterUniqueTogether(name=model_name, unique_together=set())
            for model_name in _DELTA_PATTERN_MODELS
        ],
        *[
            migrations.AlterField(
                model_name=model_name,
                name="match_pattern",
                field=_match_pattern_field(),
            )
            for model_name in _DELTA_PATTERN_MODELS
        ],
        migrations.AlterField(
            model_name="deltatitlepattern",
            name="title_pattern",
            field=models.CharField(
                help_text="Pattern for the new title. Support exact replacement or sinequa-valid code",
                validators=[sde_collections.models.delta_patterns.validate_title_pattern],
                verbose_name="Title Pattern",
            ),
        ),
    ]
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Generated by Django 4.2.9 on 2024-11-23 18:14
2+
3+
from django.db import migrations
4+
5+
6+
# (lowercased model name, singular verbose name) for each delta pattern model,
# in the order their operations are emitted. The plural verbose name is the
# singular one with an "s" appended.
_DELTA_PATTERN_LABELS = (
    ("deltadivisionpattern", "Delta Division Pattern"),
    ("deltadocumenttypepattern", "Delta Document Type Pattern"),
    ("deltaexcludepattern", "Delta Exclude Pattern"),
    ("deltaincludepattern", "Delta Include Pattern"),
    ("deltatitlepattern", "Delta Title Pattern"),
)


class Migration(migrations.Migration):
    """Set model options and a (collection, match_pattern) uniqueness rule on the delta pattern models.

    Operations, in order: one AlterModelOptions per model (ordering by
    ``match_pattern`` plus verbose names), then one AlterUniqueTogether per
    model.
    """

    dependencies = [
        ("sde_collections", "0066_alter_deltadivisionpattern_unique_together_and_more"),
    ]

    operations = [
        *[
            migrations.AlterModelOptions(
                name=model_name,
                options={
                    "ordering": ["match_pattern"],
                    "verbose_name": label,
                    "verbose_name_plural": f"{label}s",
                },
            )
            for model_name, label in _DELTA_PATTERN_LABELS
        ],
        *[
            migrations.AlterUniqueTogether(
                name=model_name,
                unique_together={("collection", "match_pattern")},
            )
            for model_name, _label in _DELTA_PATTERN_LABELS
        ],
    ]

0 commit comments

Comments
 (0)