Skip to content

Commit d87b355

Browse files
committed
implement pattern specificity across entire class
1 parent b378388 commit d87b355

File tree

3 files changed

+130
-44
lines changed

3 files changed

+130
-44
lines changed

sde_collections/models/delta_patterns.py

Lines changed: 42 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,38 @@ class MatchPatternTypeChoices(models.IntegerChoices):
4040
related_name="%(class)ss", # Makes curated_url.deltaincludepatterns.all()
4141
)
4242

43+
def get_url_match_count(self):
    """
    Count the distinct URL strings this pattern matches.

    The ``url`` values of the matching delta URLs and of the matching curated
    URLs are pooled into one set, so an address present in both is counted once.
    """
    unique_urls = set()
    # Delta URLs first, then curated URLs; the set removes any overlap.
    for fetch_matches in (self.get_matching_delta_urls, self.get_matching_curated_urls):
        unique_urls.update(fetch_matches().values_list("url", flat=True))
    return len(unique_urls)
50+
51+
def is_most_distinctive_pattern(self, url) -> bool:
    """
    Decide whether this pattern is the one that should act on ``url``.

    The pattern is compared with every other pattern of the same class and
    collection that is already related to a delta or curated URL with the same
    address. It loses only if one of them matches strictly fewer URLs; equal
    counts do not disqualify it.

    Returns True if this pattern should be applied.
    """
    own_count = self.get_url_match_count()

    # Sibling patterns: same concrete pattern class, same collection.
    siblings = self.__class__.objects.filter(collection=self.collection)

    # Keep only siblings related to this URL via delta_urls or curated_urls,
    # leaving this pattern itself out of the comparison.
    touches_url = models.Q(delta_urls__url=url.url) | models.Q(curated_urls__url=url.url)
    rivals = siblings.filter(touches_url).exclude(id=self.id).distinct()

    # A rival with a strictly smaller URL set is more specific and takes precedence.
    return not any(rival.get_url_match_count() < own_count for rival in rivals)
74+
4375
def get_regex_pattern(self) -> str:
4476
"""Convert the match pattern into a proper regex based on pattern type."""
4577
escaped_pattern = re.escape(self.match_pattern)
@@ -240,6 +272,9 @@ def apply(self) -> None:
240272

241273
# Create DeltaUrls only where field value would change
242274
for curated_url in previously_unaffected_curated:
275+
if not self.is_most_distinctive_pattern(curated_url):
276+
continue
277+
243278
if (
244279
getattr(curated_url, field) == new_value
245280
or DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists()
@@ -257,8 +292,13 @@ def apply(self) -> None:
257292

258293
DeltaUrl.objects.create(**fields)
259294

260-
# Update all matching DeltaUrls with the new field value
261-
self.get_matching_delta_urls().update(**{field: new_value})
295+
# Update all matching DeltaUrls with the new field value if this is the most distinctive pattern
296+
for delta_url in self.get_matching_delta_urls():
297+
if self.is_most_distinctive_pattern(delta_url):
298+
setattr(delta_url, field, new_value)
299+
delta_url.save()
300+
301+
# Update pattern relationships
262302
self.update_affected_delta_urls_list()
263303

264304
def unapply(self) -> None:
@@ -388,35 +428,6 @@ def generate_title_for_url(self, url_obj) -> tuple[str, str | None]:
388428
except (ValueError, ValidationError) as e:
389429
return None, str(e)
390430

391-
def get_url_match_count(self):
    """
    Return how many unique URLs this pattern matches, counting delta and
    curated URLs together (an address found in both counts once).
    """
    matched_delta = self.get_matching_delta_urls().values_list("url", flat=True)
    matched_curated = self.get_matching_curated_urls().values_list("url", flat=True)
    # Unpacking both into a single set literal removes duplicates.
    return len({*matched_delta, *matched_curated})
398-
399-
def is_most_distinctive_pattern(self, url) -> bool:
    """
    Report whether this title pattern should be applied to ``url``.

    Every other ``DeltaTitlePattern`` in the same collection whose regex
    matches the URL is compared by URL match count. This pattern is rejected
    as soon as one of them matches strictly fewer URLs.

    Returns True if this pattern should be applied.
    """
    own_count = self.get_url_match_count()

    # All other title patterns in this collection; self is excluded so it is
    # never compared against itself.
    candidates = DeltaTitlePattern.objects.filter(collection=self.collection).exclude(id=self.id)

    for candidate in candidates:
        # Skip patterns whose regex does not match this URL.
        if not re.match(candidate.get_regex_pattern(), url.url):
            continue
        # A matching pattern with a smaller URL set is more specific.
        if candidate.get_url_match_count() < own_count:
            return False

    return True
419-
420431
def apply(self) -> None:
421432
"""
422433
Apply the title pattern to matching URLs:
@@ -491,7 +502,6 @@ def apply(self) -> None:
491502

492503
# Update pattern relationships
493504
self.update_affected_delta_urls_list()
494-
self.update_affected_curated_urls_list()
495505

496506
def unapply(self) -> None:
497507
"""

sde_collections/tests/test_delta_patterns.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22

33
import pytest
44

5-
from sde_collections.models.delta_patterns import DeltaExcludePattern, DeltaTitlePattern
6-
from sde_collections.models.delta_url import (
7-
CuratedUrl,
5+
from sde_collections.models.delta_patterns import (
6+
DeltaExcludePattern,
87
DeltaResolvedTitleError,
9-
DeltaUrl,
8+
DeltaTitlePattern,
109
)
10+
from sde_collections.models.delta_url import CuratedUrl, DeltaUrl
1111
from sde_collections.tests.factories import (
1212
CollectionFactory,
1313
CuratedUrlFactory,
@@ -101,7 +101,6 @@ def test_apply_generates_delta_url_if_title_differs(self):
101101
collection=collection,
102102
url="https://example.com/page",
103103
scraped_title="Sample Title",
104-
generated_title="Old Title - Processed",
105104
)
106105

107106
# Step 2: Create a `DeltaTitlePattern` with a new title pattern
@@ -112,9 +111,6 @@ def test_apply_generates_delta_url_if_title_differs(self):
112111
title_pattern="{title} - Processed New",
113112
)
114113

115-
# Apply the pattern
116-
pattern.apply()
117-
118114
# Step 3: A new DeltaUrl should be created with the updated `generated_title`
119115
delta_url = DeltaUrl.objects.get(url=curated_url.url)
120116
expected_generated_title = resolve_title(
@@ -217,7 +213,7 @@ def test_unapply_removes_pattern_relationships(self):
217213
curated_url = CuratedUrlFactory(
218214
collection=collection, url="https://example.com/page", scraped_title="Sample Title"
219215
)
220-
delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="Sample Title")
216+
delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="New Title")
221217

222218
# Create and apply a `DeltaTitlePattern`
223219
pattern = DeltaTitlePattern.objects.create(
@@ -231,7 +227,8 @@ def test_unapply_removes_pattern_relationships(self):
231227

232228
# Ensure relationships are set
233229
assert pattern.delta_urls.filter(pk=delta_url.pk).exists()
234-
assert pattern.curated_urls.filter(pk=curated_url.pk).exists()
230+
# The curated URL should not be linked to the pattern until after promotion
231+
assert not pattern.curated_urls.filter(pk=curated_url.pk).exists()
235232

236233
# Unapply the pattern
237234
pattern.unapply()

sde_collections/tests/test_title_patterns.py renamed to sde_collections/tests/test_pattern_specificity.py

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1-
# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_patterns.py
1+
# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_pattern_specificity.py
22

33
import pytest
44

5-
from sde_collections.models.delta_patterns import DeltaResolvedTitle, DeltaTitlePattern
5+
from sde_collections.models.collection_choice_fields import DocumentTypes
6+
from sde_collections.models.delta_patterns import (
7+
DeltaDocumentTypePattern,
8+
DeltaResolvedTitle,
9+
DeltaTitlePattern,
10+
)
611
from sde_collections.models.delta_url import DeltaUrl
712
from sde_collections.tests.factories import CollectionFactory, DeltaUrlFactory
813

@@ -77,3 +82,77 @@ def test_title_pattern_multiple_resolved_titles_extended():
7782
assert url2 in general_pattern.delta_urls.all()
7883
assert url3 in general_pattern.delta_urls.all()
7984
assert url3 in specific_pattern.delta_urls.all()
85+
86+
87+
@pytest.mark.django_db
def test_field_modifying_pattern_layered_specificity():
    """Overlapping document-type patterns: the narrowest matching pattern should decide each URL's type."""
    collection = CollectionFactory()

    # Three delta URLs nested at increasing depth under /tools/, all starting as documentation.
    deep_url = DeltaUrlFactory(
        collection=collection,
        url="https://example.com/tools/analysis/v2/processor.py",
        document_type=DocumentTypes.DOCUMENTATION,
    )
    mid_url = DeltaUrlFactory(
        collection=collection,
        url="https://example.com/tools/analysis/helper.py",
        document_type=DocumentTypes.DOCUMENTATION,
    )
    top_url = DeltaUrlFactory(
        collection=collection,
        url="https://example.com/tools/simple.py",
        document_type=DocumentTypes.DOCUMENTATION,
    )

    def make_pattern(match_pattern, document_type):
        # match_pattern_type=2 is kept as in the original test
        # (presumably the wildcard/multi-URL pattern type — TODO confirm).
        return DeltaDocumentTypePattern.objects.create(
            collection=collection,
            match_pattern=match_pattern,
            document_type=document_type,
            match_pattern_type=2,
        )

    def linked_pks(pattern):
        # Primary keys of the delta URLs currently related to the pattern.
        return pattern.delta_urls.values_list("pk", flat=True)

    # Patterns are created from broadest to narrowest; the order is kept deliberately.
    broad = make_pattern("*/tools/*.py", DocumentTypes.SOFTWARETOOLS)  # expected to match all 3 URLs
    mid = make_pattern("*/tools/analysis/*.py", DocumentTypes.DATA)  # expected to match mid and deep
    narrow = make_pattern("*/analysis/v2/*.py", DocumentTypes.DOCUMENTATION)  # expected to match deep only

    # Each pattern should report the expected number of matched URLs.
    assert broad.get_url_match_count() == 3
    assert mid.get_url_match_count() == 2
    assert narrow.get_url_match_count() == 1

    # Reload the URLs so the assertions see what the patterns wrote to the database.
    deep_url.refresh_from_db()
    mid_url.refresh_from_db()
    top_url.refresh_from_db()

    # Narrowest pattern (1 match) decides the deep URL.
    assert deep_url.document_type == DocumentTypes.DOCUMENTATION, "Deep URL should use most specific pattern"

    # Mid-level pattern (2 matches) decides the middle URL.
    assert mid_url.document_type == DocumentTypes.DATA, "Mid URL should use mid-level pattern"

    # Broad pattern (3 matches) is left with only the top URL.
    assert top_url.document_type == DocumentTypes.SOFTWARETOOLS, "Top URL should use broad pattern"

    # Relationships are tracked for every matching pattern, not only the winner.
    assert deep_url.pk in linked_pks(narrow)
    assert deep_url.pk in linked_pks(mid)
    assert deep_url.pk in linked_pks(broad)

    assert mid_url.pk in linked_pks(mid)
    assert mid_url.pk in linked_pks(broad)

    assert top_url.pk in linked_pks(broad)

0 commit comments

Comments
 (0)