Skip to content

Commit c83e1c6

Browse files
committed
Update pattern application so it does not create excess DeltaUrls
1 parent 2ffb5c2 commit c83e1c6

File tree

2 files changed

+99
-1
lines changed

2 files changed

+99
-1
lines changed

sde_collections/models/delta_patterns.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@ class MatchPatternTypeChoices(models.IntegerChoices):
4040
)
4141

4242
def matched_urls(self):
43-
"""Find all URLs matching the pattern."""
43+
"""
44+
Find all URLs matching the pattern.
45+
This does not update pattern.delta_urls or pattern.curated_urls.
46+
"""
4447
DeltaUrl = apps.get_model("sde_collections", "DeltaUrl")
4548
CuratedUrl = apps.get_model("sde_collections", "CuratedUrl")
4649

@@ -92,6 +95,10 @@ def apply(self, fields_to_copy=None, update_fields=None):
9295

9396
# Step 1: Generate or update DeltaUrls for each matching CuratedUrl
9497
for curated_url in matched_urls["matching_curated_urls"]:
98+
# Check if the curated_url is already linked to this pattern
99+
if self.curated_urls.filter(pk=curated_url.pk).exists():
100+
# Skip creating a DeltaUrl if the curated_url is already associated with this pattern
101+
continue
95102
self.generate_delta_url(curated_url, fields_to_copy)
96103

97104
# Step 2: Apply updates to fields on matching DeltaUrls

sde_collections/tests/test_delta_patterns.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
CollectionFactory,
1919
CuratedUrlFactory,
2020
DeltaUrlFactory,
21+
DumpUrlFactory,
2122
)
2223
from sde_collections.utils.title_resolver import resolve_title
2324

@@ -84,6 +85,72 @@ def test_generate_delta_url_creation_and_update(self):
8485
delta_url.refresh_from_db()
8586
assert delta_url.scraped_title == original_delta_title
8687

88+
def test_apply_creates_delta_url_if_curated_url_does_not_exist(self):
    """
    Applying a pattern when no `CuratedUrl` exists leaves exactly one matching `DeltaUrl`.

    The `DeltaUrl` is created by the factory before the pattern is applied, so the
    final check counts rows instead of testing bare existence: the test then also
    fails if `apply` generates excess `DeltaUrl`s for the same URL.
    """
    collection = CollectionFactory()
    delta_url = DeltaUrlFactory(
        collection=collection, url="https://example.com/page", scraped_title="Original Title"
    )

    # Create a pattern matching the URL
    pattern = DeltaIncludePattern.objects.create(
        collection=collection, match_pattern="https://example.com/*", match_pattern_type=2
    )

    # Precondition: nothing has been promoted, so no CuratedUrl exists for this URL
    assert not CuratedUrl.objects.filter(url=delta_url.url).exists()

    # Apply the pattern
    pattern.apply()

    # The pre-existing DeltaUrl must still be there and must not be duplicated
    assert DeltaUrl.objects.filter(url=delta_url.url).count() == 1
107+
108+
def test_apply_skips_delta_url_creation_if_curated_url_exists(self):
    """
    Re-applying a pattern after promotion must not leave a `DeltaUrl` behind for a
    URL that already has a `CuratedUrl`.
    """
    collection = CollectionFactory()
    seeded_delta = DeltaUrlFactory(
        collection=collection,
        url="https://example.com/page",
        scraped_title="Original Title",
    )

    # Pattern whose match expression covers the seeded URL
    include_pattern = DeltaIncludePattern.objects.create(
        collection=collection,
        match_pattern="https://example.com/*",
        match_pattern_type=2,
    )

    # Promote the collection's DeltaUrls, then look up the resulting CuratedUrl
    collection.promote_to_curated()
    promoted = CuratedUrl.objects.get(url=seeded_delta.url)

    # Apply the pattern again now that the CuratedUrl exists
    include_pattern.apply()

    # No DeltaUrl may exist for the curated URL after re-application
    leftover_deltas = DeltaUrl.objects.filter(url=promoted.url)
    assert not leftover_deltas.exists()
131+
132+
def test_apply_creates_delta_url_if_no_curated_url_exists(self):
    """
    With no `CuratedUrl` present, a `DeltaUrl` for the migrated URL exists after the
    pattern is applied, and its scraped title equals the originating `DumpUrl`'s.
    """
    collection = CollectionFactory()
    source_dump = DumpUrlFactory(collection=collection, url="https://example.com/page", scraped_title="New Title")

    # Migrate the DumpUrl into a DeltaUrl
    collection.migrate_dump_to_delta()

    # Pattern that matches the migrated URL, applied straight away
    include_pattern = DeltaIncludePattern.objects.create(
        collection=collection,
        match_pattern="https://example.com/*",
        match_pattern_type=2,
    )
    include_pattern.apply()

    # `.get` raises unless exactly one DeltaUrl matches the dump's URL
    migrated = DeltaUrl.objects.get(url=source_dump.url)
    assert migrated.scraped_title == source_dump.scraped_title
153+
87154
def test_apply_and_unapply_pattern(self):
88155
# if we make a new exclude pattern and it affects an old url
89156
# that wasn't previously affected, what should happen?
@@ -258,6 +325,30 @@ def test_unapply_removes_pattern_relationships(self):
258325
assert not pattern.delta_urls.filter(pk=delta_url.pk).exists()
259326
assert not pattern.curated_urls.filter(pk=curated_url.pk).exists()
260327

328+
# TODO: work on this test logic
329+
# def test_pattern_reapplication_does_not_duplicate_delta_urls(self):
330+
# """
331+
# Ensures that reapplying a pattern does not create duplicate `DeltaUrls` or affect existing `CuratedUrls`.
332+
# """
333+
# collection = CollectionFactory()
334+
# delta_url = DeltaUrlFactory(collection=collection,
335+
# url="https://example.com/page",
336+
# scraped_title="Title Before")
337+
338+
# # Promote to CuratedUrl
339+
# collection.promote_to_curated()
340+
# curated_url = CuratedUrl.objects.get(url=delta_url.url)
341+
342+
# # Apply a pattern
343+
# pattern = DeltaTitlePattern.objects.create(
344+
# collection=collection, match_pattern="https://example.com/*", match_pattern_type=2, title_patte......
345+
# )
346+
# pattern.apply()
347+
348+
# # Ensure no new `DeltaUrl` is created after reapplying the pattern
349+
# pattern.apply()
350+
# assert DeltaUrl.objects.filter(url=curated_url.url).count() == 0
351+
261352

262353
@pytest.mark.django_db
263354
class TestDeltaDocumentTypePattern:

0 commit comments

Comments
 (0)