Skip to content

Commit 82462f7

Browse files
committed
add tiebreaker logic to pattern specificity
1 parent 81a9ff9 commit 82462f7

File tree

3 files changed

+88
-9
lines changed

3 files changed

+88
-9
lines changed

sde_collections/models/delta_patterns.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,19 @@ def get_url_match_count(self):
5050

5151
def is_most_distinctive_pattern(self, url) -> bool:
5252
"""
53-
Determine if this pattern should apply to a URL by checking if it matches
54-
the smallest number of URLs among all patterns that match this URL.
53+
Determine if this pattern should apply to a URL by checking:
54+
1. First checks if this pattern matches this URL
55+
2. If it matches the smallest number of URLs among all patterns that match this URL
56+
3. If tied for smallest number of matches, uses the longest pattern string
5557
Returns True if this pattern should be applied.
5658
"""
59+
# First check if this pattern matches the URL
60+
regex_pattern = self.get_regex_pattern()
61+
if not re.search(regex_pattern, url.url):
62+
return False
63+
5764
my_match_count = self.get_url_match_count()
65+
my_pattern_length = len(self.match_pattern)
5866

5967
# Get patterns from same type that affect this URL
6068
pattern_class = self.__class__
@@ -63,12 +71,19 @@ def is_most_distinctive_pattern(self, url) -> bool:
6371
.filter(models.Q(delta_urls__url=url.url) | models.Q(curated_urls__url=url.url))
6472
.exclude(id=self.id)
6573
.distinct()
66-
) # TODO: does this have a distinct urls, or distinct model objects.
74+
)
6775

68-
# If any matching pattern has a smaller URL set, don't apply
76+
# Use M2M relationships for checking other patterns since those are already established
6977
for pattern in matching_patterns:
70-
if pattern.get_url_match_count() < my_match_count:
78+
other_match_count = pattern.get_url_match_count()
79+
if other_match_count < my_match_count:
80+
# Other pattern matches fewer URLs - definitely not most distinctive
7181
return False
82+
if other_match_count == my_match_count:
83+
# Same match count - check pattern length
84+
if len(pattern.match_pattern) > my_pattern_length:
85+
# Other pattern is longer - not most distinctive
86+
return False
7287

7388
return True
7489

sde_collections/tests/test_delta_patterns.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,6 @@ def test_title_pattern_error_updates(self):
283283
general_pattern = DeltaTitlePattern.objects.create(
284284
collection=collection,
285285
match_pattern="*docs*",
286-
# Use a different error-causing pattern
287286
title_pattern="{invalid}", # Invalid variable name will cause error
288287
match_pattern_type=2,
289288
)
@@ -297,14 +296,17 @@ def test_title_pattern_error_updates(self):
297296
specific_pattern = DeltaTitlePattern.objects.create(
298297
collection=collection,
299298
match_pattern="*docs/specific*",
300-
# Different invalid variable
301299
title_pattern="{another_invalid}",
302300
match_pattern_type=2,
303301
)
304302

303+
# Re-fetch error to see latest state
304+
error.refresh_from_db()
305+
305306
# Error should now be from specific pattern
306-
error = url.deltaresolvedtitleerror # Should still only be one
307-
assert error.title_pattern == specific_pattern
307+
assert (
308+
error.title_pattern == specific_pattern
309+
), f"Error still associated with {error.title_pattern} instead of {specific_pattern}"
308310
assert "Variable 'another_invalid' not allowed in f-string pattern" in error.error_string
309311

310312
# Verify we still only have one error record

sde_collections/tests/test_pattern_specificity.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,65 @@ def test_field_modifying_pattern_layered_specificity():
156156
assert mid_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True)
157157

158158
assert top_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True)
159+
160+
161+
@pytest.mark.django_db
162+
def test_pattern_specificity_tiebreaker():
163+
"""Test that when patterns match the same number of URLs, longer patterns are considered more specific."""
164+
collection = CollectionFactory()
165+
166+
# Create URLs that would result in same match count for different patterns
167+
url1 = DeltaUrlFactory(
168+
collection=collection, url="https://example.com/docs/specific/item1.html", scraped_title="Title 1"
169+
)
170+
url2 = DeltaUrlFactory(
171+
collection=collection, url="https://example.com/docs/specific/item2.html", scraped_title="Title 2"
172+
)
173+
174+
# Create patterns with same match count but different lengths
175+
general_pattern = DeltaTitlePattern.objects.create(
176+
collection=collection,
177+
match_pattern="*docs*", # Shorter pattern
178+
title_pattern="{title}",
179+
match_pattern_type=2,
180+
)
181+
182+
specific_pattern = DeltaTitlePattern.objects.create(
183+
collection=collection,
184+
match_pattern="*docs/specific*", # Longer pattern
185+
title_pattern="{title} - Specific",
186+
match_pattern_type=2,
187+
)
188+
189+
# Both patterns will match both URLs (same match count)
190+
assert general_pattern.get_url_match_count() == 2
191+
assert specific_pattern.get_url_match_count() == 2
192+
193+
# But the longer pattern should be considered more specific
194+
assert general_pattern.is_most_distinctive_pattern(url1) is False
195+
assert specific_pattern.is_most_distinctive_pattern(url1) is True
196+
197+
# Check that this applies to both URLs
198+
assert general_pattern.is_most_distinctive_pattern(url2) is False
199+
assert specific_pattern.is_most_distinctive_pattern(url2) is True
200+
201+
# Create an even more specific pattern
202+
very_specific_pattern = DeltaTitlePattern.objects.create(
203+
collection=collection,
204+
match_pattern="*docs/specific/item1*", # Even longer pattern
205+
title_pattern="{title} - Very Specific",
206+
match_pattern_type=2,
207+
)
208+
209+
# It matches fewer URLs
210+
assert very_specific_pattern.get_url_match_count() == 1
211+
212+
# For URL1, the very specific pattern should win due to fewer matches
213+
assert general_pattern.is_most_distinctive_pattern(url1) is False
214+
assert specific_pattern.is_most_distinctive_pattern(url1) is False
215+
assert very_specific_pattern.is_most_distinctive_pattern(url1) is True
216+
217+
# For URL2, the middle pattern should still win since very_specific doesn't match
218+
assert general_pattern.is_most_distinctive_pattern(url2) is False
219+
assert specific_pattern.is_most_distinctive_pattern(url2) is True
220+
assert very_specific_pattern.is_most_distinctive_pattern(url2) is False

0 commit comments

Comments
 (0)