Skip to content

Commit d81534d

Browse files
committed
fix tests and refactor related_names for patterns
1 parent 73b266a commit d81534d

File tree

6 files changed

+512
-96
lines changed

6 files changed

+512
-96
lines changed
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# Generated by Django 4.2.9 on 2024-11-24 19:39
2+
3+
from django.db import migrations, models
4+
import django.db.models.deletion
5+
import sde_collections.models.delta_patterns
6+
7+
8+
# Lower-cased names of the delta pattern models whose relation fields are altered,
# in the order the operations are applied.
_DELTA_PATTERN_MODELS = (
    "deltadivisionpattern",
    "deltadocumenttypepattern",
    "deltaexcludepattern",
    "deltaincludepattern",
    "deltatitlepattern",
)

# Django substitutes "%(class)s" with the lower-cased model class name, so this
# template yields pluralised accessors such as "deltaincludepatterns".
_PLURAL_RELATED_NAME = "%(class)ss"


def _related_name_operations(model_name):
    """Return the AlterField operations for one pattern model's three relations.

    A fresh field instance is built on every call so no field object is shared
    between operations.
    """
    return [
        migrations.AlterField(
            model_name=model_name,
            name="collection",
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                related_name=_PLURAL_RELATED_NAME,
                related_query_name=_PLURAL_RELATED_NAME,
                to="sde_collections.collection",
            ),
        ),
        migrations.AlterField(
            model_name=model_name,
            name="curated_urls",
            field=models.ManyToManyField(related_name=_PLURAL_RELATED_NAME, to="sde_collections.curatedurl"),
        ),
        migrations.AlterField(
            model_name=model_name,
            name="delta_urls",
            field=models.ManyToManyField(related_name=_PLURAL_RELATED_NAME, to="sde_collections.deltaurl"),
        ),
    ]


class Migration(migrations.Migration):
    """Alter the relation fields of every delta pattern model and the title pattern field.

    For each of the five delta pattern models the ``collection``, ``curated_urls``
    and ``delta_urls`` fields are altered to use the ``%(class)ss`` related name;
    ``deltatitlepattern.title_pattern`` is then altered to carry its help text,
    validator and verbose name.
    """

    dependencies = [
        ("sde_collections", "0067_alter_deltadivisionpattern_options_and_more"),
    ]

    operations = [
        # Relation alterations, grouped per model: collection, curated_urls, delta_urls.
        *(
            operation
            for model_name in _DELTA_PATTERN_MODELS
            for operation in _related_name_operations(model_name)
        ),
        migrations.AlterField(
            model_name="deltatitlepattern",
            name="title_pattern",
            field=models.CharField(
                help_text="Pattern for the new title. Can be an exact replacement string or sinequa-valid code",
                validators=[sde_collections.models.delta_patterns.validate_title_pattern],
                verbose_name="Title Pattern",
            ),
        ),
    ]

sde_collections/models/collection.py

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,12 @@ def refresh_url_lists_for_all_patterns(self):
118118
pattern.update_affected_curated_urls_list()
119119

120120
def migrate_dump_to_delta(self):
121-
"""Main function to handle migration from DumpUrls to DeltaUrls with specific rules."""
121+
"""
122+
Migrates data from DumpUrls to DeltaUrls, preserving all fields.
123+
Creates DeltaUrls that reflect:
124+
1. Changes from DumpUrls vs CuratedUrls
125+
2. Missing URLs in DumpUrls that exist in CuratedUrls (marked for deletion)
126+
"""
122127
# Step 1: Clear existing DeltaUrls for this collection
123128
self.clear_delta_urls()
124129

@@ -146,27 +151,31 @@ def migrate_dump_to_delta(self):
146151
# Step 5: Clear DumpUrls after migration is complete
147152
self.clear_dump_urls()
148153

149-
# Step 6: Reapply patterns to DeltaUrls
150-
self.refresh_url_lists_for_all_patterns()
154+
# Step 6: Apply all patterns to DeltaUrls
155+
# self.refresh_url_lists_for_all_patterns() # TODO: I'm pretty confident we shouldn't be running this
156+
self.apply_all_patterns()
151157

152158
def create_or_update_delta_url(self, url_instance, to_delete=False):
    """
    Create or update the DeltaUrl of this collection that has the same url as
    ``url_instance``.

    Every concrete field of ``url_instance`` except ``id`` and ``collection`` is
    copied into the DeltaUrl, including when the row is being marked for deletion.

    Args:
        url_instance: Model instance to copy from (a DumpUrl or CuratedUrl,
            per the callers' usage); must expose ``_meta.fields`` and ``url``.
        to_delete: Value stored in the DeltaUrl's ``to_delete`` flag.
    """
    excluded_names = ("id", "collection")

    # Collect the source values field by field; "url" is copied too, which is
    # harmless because it equals the lookup value used below.
    delta_defaults = {}
    for model_field in url_instance._meta.fields:
        field_name = model_field.name
        if field_name in excluded_names:
            continue
        delta_defaults[field_name] = getattr(url_instance, field_name)

    # The caller's flag always wins over any value copied from the source.
    delta_defaults["to_delete"] = to_delete

    DeltaUrl.objects.update_or_create(
        collection=self,
        url=url_instance.url,
        defaults=delta_defaults,
    )
170179

171180
def promote_to_curated(self):
172181
"""
@@ -600,15 +609,32 @@ def sync_with_production_webapp(self) -> None:
600609

601610
self.save()
602611

603-
def apply_all_patterns(self) -> None:
    """Apply every delta pattern attached to this collection.

    Patterns are applied group by group in a fixed order: exclude, include,
    title, document type, division. Progress is reported through the module
    logger at DEBUG level instead of being printed to stdout. For document type
    and division patterns the number of matching delta URLs is also logged, but
    that count is only computed when DEBUG logging is enabled.
    """
    import logging

    logger = logging.getLogger(__name__)
    debug_enabled = logger.isEnabledFor(logging.DEBUG)

    # (label used in log messages, reverse accessor on self, log match count?)
    pattern_groups = (
        ("exclude", "deltaexcludepatterns", False),
        ("include", "deltaincludepatterns", False),
        ("title", "deltatitlepatterns", False),
        ("doctype", "deltadocumenttypepatterns", True),
        ("division", "deltadivisionpatterns", True),
    )

    logger.debug("Applying patterns:")
    for label, accessor, log_match_count in pattern_groups:
        # Resolve the accessor lazily so earlier groups are applied before a
        # later accessor is touched, as with sequential attribute access.
        for pattern in getattr(self, accessor).all():
            logger.debug("Applying %s pattern: %s", label, pattern.match_pattern)
            if log_match_count and debug_enabled:
                # Guarded: count() is an extra query needed only for diagnostics.
                logger.debug("Matching URLs: %d", pattern.get_matching_delta_urls().count())
            pattern.apply()
613639

614640
def save(self, *args, **kwargs):

sde_collections/models/delta_patterns.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class MatchPatternTypeChoices(models.IntegerChoices):
2424
collection = models.ForeignKey(
2525
"Collection",
2626
on_delete=models.CASCADE,
27-
related_name="%(class)s",
27+
related_name="%(class)ss", # Makes collection.deltaincludepatterns.all()
2828
related_query_name="%(class)ss",
2929
)
3030
match_pattern = models.CharField(
@@ -33,11 +33,11 @@ class MatchPatternTypeChoices(models.IntegerChoices):
3333
match_pattern_type = models.IntegerField(choices=MatchPatternTypeChoices.choices, default=1)
3434
delta_urls = models.ManyToManyField(
3535
"DeltaUrl",
36-
related_name="%(class)s_delta_urls",
36+
related_name="%(class)ss", # Makes delta_url.deltaincludepatterns.all()
3737
)
3838
curated_urls = models.ManyToManyField(
3939
"CuratedUrl",
40-
related_name="%(class)s_curated_urls",
40+
related_name="%(class)ss", # Makes curated_url.deltaincludepatterns.all()
4141
)
4242

4343
def get_regex_pattern(self) -> str:

sde_collections/tests/factories.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,25 +66,25 @@ class Meta:
6666
# division = 1
6767

6868

69-
class DeltaUrlFactory(factory.django.DjangoModelFactory):
    """Factory for DeltaUrl rows that are not flagged for deletion."""

    class Meta:
        model = DeltaUrl

    # Each DeltaUrl gets its own collection unless one is passed explicitly.
    collection = factory.SubFactory(CollectionFactory)
    url = factory.Faker("url")
    scraped_title = factory.Faker("sentence")
    to_delete = False


class CuratedUrlFactory(factory.django.DjangoModelFactory):
    """Factory for CuratedUrl rows with faked text content and fixed type codes."""

    class Meta:
        model = CuratedUrl

    # Each CuratedUrl gets its own collection unless one is passed explicitly.
    collection = factory.SubFactory(CollectionFactory)
    url = factory.Faker("url")
    scraped_title = factory.Faker("sentence")
    scraped_text = factory.Faker("paragraph")
    generated_title = factory.Faker("sentence")
    visited = factory.Faker("boolean")
    # Fixed integer codes; their meaning is defined by the model's choices.
    document_type = 1
    division = 1

0 commit comments

Comments
 (0)