Skip to content

Commit 983e316

Browse files
authored
Merge pull request #1118 from NASA-IMPACT/1115-improve-title-processing-and-tests
1115 improve title processing and tests
2 parents a598171 + 82462f7 commit 983e316

File tree

6 files changed

+272
-14
lines changed

6 files changed

+272
-14
lines changed

sde_collections/models/delta_patterns.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
from django.db import models
77

88
from ..utils.title_resolver import (
9-
is_valid_fstring,
109
is_valid_xpath,
1110
parse_title,
1211
resolve_title,
12+
validate_fstring,
1313
)
1414
from .collection_choice_fields import Divisions, DocumentTypes
1515

@@ -50,11 +50,19 @@ def get_url_match_count(self):
5050

5151
def is_most_distinctive_pattern(self, url) -> bool:
5252
"""
53-
Determine if this pattern should apply to a URL by checking if it matches
54-
the smallest number of URLs among all patterns that match this URL.
53+
Determine if this pattern should apply to a URL by checking:
54+
1. First checks if this pattern matches this URL
55+
2. If it matches the smallest number of URLs among all patterns that match this URL
56+
3. If tied for smallest number of matches, uses the longest pattern string
5557
Returns True if this pattern should be applied.
5658
"""
59+
# First check if this pattern matches the URL
60+
regex_pattern = self.get_regex_pattern()
61+
if not re.search(regex_pattern, url.url):
62+
return False
63+
5764
my_match_count = self.get_url_match_count()
65+
my_pattern_length = len(self.match_pattern)
5866

5967
# Get patterns from same type that affect this URL
6068
pattern_class = self.__class__
@@ -63,12 +71,19 @@ def is_most_distinctive_pattern(self, url) -> bool:
6371
.filter(models.Q(delta_urls__url=url.url) | models.Q(curated_urls__url=url.url))
6472
.exclude(id=self.id)
6573
.distinct()
66-
) # TODO: does this have a distinct urls, or distinct model objects.
74+
)
6775

68-
# If any matching pattern has a smaller URL set, don't apply
76+
# Use M2M relationships for checking other patterns since those are already established
6977
for pattern in matching_patterns:
70-
if pattern.get_url_match_count() < my_match_count:
78+
other_match_count = pattern.get_url_match_count()
79+
if other_match_count < my_match_count:
80+
# Other pattern matches fewer URLs - definitely not most distinctive
7181
return False
82+
if other_match_count == my_match_count:
83+
# Same match count - check pattern length
84+
if len(pattern.match_pattern) > my_pattern_length:
85+
# Other pattern is longer - not most distinctive
86+
return False
7287

7388
return True
7489

@@ -398,7 +413,7 @@ def validate_title_pattern(title_pattern_string: str) -> None:
398413
raise ValidationError(f"Invalid xpath: {element_value}")
399414
elif element_type == "brace":
400415
try:
401-
is_valid_fstring(element_value)
416+
validate_fstring(element_value)
402417
except ValueError as e:
403418
raise ValidationError(str(e))
404419

@@ -454,8 +469,9 @@ def apply(self) -> None:
454469
new_title, error = self.generate_title_for_url(curated_url)
455470

456471
if error:
457-
# Log error and continue to next URL
458-
DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=curated_url, error_string=error)
472+
DeltaResolvedTitleError.objects.update_or_create(
473+
delta_url=curated_url, defaults={"title_pattern": self, "error_string": error} # lookup field
474+
)
459475
continue
460476

461477
# Skip if the generated title matches existing or if Delta already exists
@@ -488,7 +504,9 @@ def apply(self) -> None:
488504
new_title, error = self.generate_title_for_url(delta_url)
489505

490506
if error:
491-
DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=delta_url, error_string=error)
507+
DeltaResolvedTitleError.objects.update_or_create(
508+
delta_url=delta_url, defaults={"title_pattern": self, "error_string": error} # lookup field
509+
)
492510
continue
493511

494512
# Update title and record resolution - key change here

sde_collections/models/pattern.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
from django.db import models
66

77
from ..utils.title_resolver import (
8-
is_valid_fstring,
98
is_valid_xpath,
109
parse_title,
1110
resolve_title,
11+
validate_fstring,
1212
)
1313
from .collection_choice_fields import Divisions, DocumentTypes
1414

@@ -146,7 +146,7 @@ def validate_title_pattern(title_pattern_string):
146146
raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") # noqa: E231
147147
elif element_type == "brace":
148148
try:
149-
is_valid_fstring(element_value)
149+
validate_fstring(element_value)
150150
except ValueError as e:
151151
raise ValidationError(str(e))
152152

sde_collections/tests/test_delta_patterns.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,3 +266,48 @@ def test_pattern_reapplication_does_not_duplicate_delta_urls(self):
266266
# Ensure no new `DeltaUrl` is created after reapplying the pattern
267267
pattern.apply()
268268
assert DeltaUrl.objects.filter(url=curated_url.url).count() == 0
269+
270+
@pytest.mark.django_db
271+
def test_title_pattern_error_updates(self):
272+
"""
273+
Test that when a more specific pattern creates an error,
274+
it updates rather than duplicates the error record.
275+
"""
276+
# Create a collection and URL
277+
collection = CollectionFactory()
278+
url = DeltaUrlFactory(
279+
collection=collection, url="https://example.com/docs/specific/item.html", scraped_title="Original Title"
280+
)
281+
282+
# Create a general pattern first
283+
general_pattern = DeltaTitlePattern.objects.create(
284+
collection=collection,
285+
match_pattern="*docs*",
286+
title_pattern="{invalid}", # Invalid variable name will cause error
287+
match_pattern_type=2,
288+
)
289+
290+
# Verify initial error state
291+
error = url.deltaresolvedtitleerror
292+
assert error.title_pattern == general_pattern
293+
assert "Variable 'invalid' not allowed in f-string pattern" in error.error_string
294+
295+
# Create a more specific pattern
296+
specific_pattern = DeltaTitlePattern.objects.create(
297+
collection=collection,
298+
match_pattern="*docs/specific*",
299+
title_pattern="{another_invalid}",
300+
match_pattern_type=2,
301+
)
302+
303+
# Re-fetch error to see latest state
304+
error.refresh_from_db()
305+
306+
# Error should now be from specific pattern
307+
assert (
308+
error.title_pattern == specific_pattern
309+
), f"Error still associated with {error.title_pattern} instead of {specific_pattern}"
310+
assert "Variable 'another_invalid' not allowed in f-string pattern" in error.error_string
311+
312+
# Verify we still only have one error record
313+
assert DeltaResolvedTitleError.objects.filter(delta_url=url).count() == 1

sde_collections/tests/test_pattern_specificity.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,65 @@ def test_field_modifying_pattern_layered_specificity():
156156
assert mid_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True)
157157

158158
assert top_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True)
159+
160+
161+
@pytest.mark.django_db
162+
def test_pattern_specificity_tiebreaker():
163+
"""Test that when patterns match the same number of URLs, longer patterns are considered more specific."""
164+
collection = CollectionFactory()
165+
166+
# Create URLs that would result in same match count for different patterns
167+
url1 = DeltaUrlFactory(
168+
collection=collection, url="https://example.com/docs/specific/item1.html", scraped_title="Title 1"
169+
)
170+
url2 = DeltaUrlFactory(
171+
collection=collection, url="https://example.com/docs/specific/item2.html", scraped_title="Title 2"
172+
)
173+
174+
# Create patterns with same match count but different lengths
175+
general_pattern = DeltaTitlePattern.objects.create(
176+
collection=collection,
177+
match_pattern="*docs*", # Shorter pattern
178+
title_pattern="{title}",
179+
match_pattern_type=2,
180+
)
181+
182+
specific_pattern = DeltaTitlePattern.objects.create(
183+
collection=collection,
184+
match_pattern="*docs/specific*", # Longer pattern
185+
title_pattern="{title} - Specific",
186+
match_pattern_type=2,
187+
)
188+
189+
# Both patterns will match both URLs (same match count)
190+
assert general_pattern.get_url_match_count() == 2
191+
assert specific_pattern.get_url_match_count() == 2
192+
193+
# But the longer pattern should be considered more specific
194+
assert general_pattern.is_most_distinctive_pattern(url1) is False
195+
assert specific_pattern.is_most_distinctive_pattern(url1) is True
196+
197+
# Check that this applies to both URLs
198+
assert general_pattern.is_most_distinctive_pattern(url2) is False
199+
assert specific_pattern.is_most_distinctive_pattern(url2) is True
200+
201+
# Create an even more specific pattern
202+
very_specific_pattern = DeltaTitlePattern.objects.create(
203+
collection=collection,
204+
match_pattern="*docs/specific/item1*", # Even longer pattern
205+
title_pattern="{title} - Very Specific",
206+
match_pattern_type=2,
207+
)
208+
209+
# It matches fewer URLs
210+
assert very_specific_pattern.get_url_match_count() == 1
211+
212+
# For URL1, the very specific pattern should win due to fewer matches
213+
assert general_pattern.is_most_distinctive_pattern(url1) is False
214+
assert specific_pattern.is_most_distinctive_pattern(url1) is False
215+
assert very_specific_pattern.is_most_distinctive_pattern(url1) is True
216+
217+
# For URL2, the middle pattern should still win since very_specific doesn't match
218+
assert general_pattern.is_most_distinctive_pattern(url2) is False
219+
assert specific_pattern.is_most_distinctive_pattern(url2) is True
220+
assert very_specific_pattern.is_most_distinctive_pattern(url2) is False
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_resolution.py
2+
3+
from unittest.mock import Mock, patch
4+
5+
import pytest
6+
7+
from ..utils.title_resolver import (
8+
clean_text,
9+
is_valid_xpath,
10+
parse_title,
11+
resolve_brace,
12+
resolve_title,
13+
resolve_xpath,
14+
validate_fstring,
15+
)
16+
17+
18+
def test_parse_title():
19+
# Test basic string
20+
assert parse_title("Simple Title") == [("str", "Simple Title")]
21+
22+
# Test f-string
23+
assert parse_title("Hello {title}") == [("str", "Hello "), ("brace", "{title}")]
24+
25+
# Test xpath
26+
assert parse_title("xpath://h1") == [("xpath", "//h1")]
27+
28+
# Test complex pattern
29+
result = parse_title("xpath://h1 | {title} - {collection}")
30+
assert result == [
31+
("xpath", "//h1"),
32+
("str", " | "),
33+
("brace", "{title}"),
34+
("str", " - "),
35+
("brace", "{collection}"),
36+
]
37+
38+
39+
def test_is_valid_xpath():
40+
assert is_valid_xpath("//h1") is True
41+
assert is_valid_xpath("//div[@class='title']") is True
42+
assert is_valid_xpath("invalid xpath") is False
43+
assert is_valid_xpath("//h1[") is False
44+
45+
46+
def test_validate_fstring():
47+
# Valid cases - should not raise
48+
validate_fstring("{title}")
49+
validate_fstring("{url}")
50+
validate_fstring("{collection}")
51+
52+
# Invalid cases
53+
with pytest.raises(ValueError):
54+
validate_fstring("{invalid_var}")
55+
with pytest.raises(ValueError):
56+
validate_fstring("{title.upper()}")
57+
with pytest.raises(ValueError):
58+
validate_fstring("{len(title)}")
59+
60+
61+
def test_resolve_brace():
62+
context = {"title": "Test Title", "url": "https://example.com", "collection": "Test Collection"}
63+
64+
assert resolve_brace("{title}", context) == "Test Title"
65+
assert resolve_brace("{title} - {collection}", context) == "Test Title - Test Collection"
66+
67+
with pytest.raises(ValueError):
68+
resolve_brace("{invalid}", context)
69+
70+
71+
def test_clean_text():
72+
# Test whitespace handling
73+
assert clean_text(" Title \n With\tSpaces ") == "Title With Spaces"
74+
75+
# Test HTML entities
76+
assert clean_text("Title &amp; More") == "Title & More"
77+
78+
# Test unicode normalization
79+
assert clean_text("Café") == "Cafe"
80+
81+
82+
@patch("requests.get")
83+
def test_resolve_xpath(mock_get):
84+
mock_response = Mock()
85+
mock_response.ok = True
86+
mock_response.content = b"""
87+
<html>
88+
<body>
89+
<h1>Test Title</h1>
90+
<div class="content">Inner Content</div>
91+
</body>
92+
</html>
93+
"""
94+
mock_get.return_value = mock_response
95+
96+
# Test basic xpath
97+
assert resolve_xpath("//h1", "https://example.com") == "Test Title"
98+
assert resolve_xpath("//div[@class='content']", "https://example.com") == "Inner Content"
99+
100+
# Test error cases
101+
mock_response.ok = False
102+
with pytest.raises(ValueError):
103+
resolve_xpath("//h1", "https://example.com")
104+
105+
mock_response.ok = True
106+
with pytest.raises(ValueError):
107+
resolve_xpath("//nonexistent", "https://example.com")
108+
109+
110+
@patch("requests.get")
111+
def test_resolve_title(mock_get):
112+
mock_response = Mock()
113+
mock_response.ok = True
114+
mock_response.content = b"""
115+
<html>
116+
<body>
117+
<h1>Dynamic Content</h1>
118+
</body>
119+
</html>
120+
"""
121+
mock_get.return_value = mock_response
122+
123+
context = {"title": "Original Title", "url": "https://example.com", "collection": "Test Collection"}
124+
125+
# Test combination of xpath and f-string
126+
pattern = "xpath://h1 | {title} - {collection}"
127+
assert resolve_title(pattern, context) == "Dynamic Content | Original Title - Test Collection"
128+
129+
# Test simple f-string
130+
assert resolve_title("{title} ({collection})", context) == "Original Title (Test Collection)"
131+
132+
# Test plain string
133+
assert resolve_title("Static Title", context) == "Static Title"

sde_collections/utils/title_resolver.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def is_valid_xpath(xpath: str) -> bool:
1717
return False
1818

1919

20-
def is_valid_fstring(pattern: str) -> bool:
20+
def validate_fstring(pattern: str) -> bool:
2121
context = {
2222
"url": "",
2323
"title": "",
@@ -53,7 +53,7 @@ def resolve_brace(pattern: str, context: dict[str, Any]) -> str:
5353
"""Safely interpolates the variables in an f-string pattern using the provided context."""
5454
parsed = ast.parse(f"f'''{pattern}'''", mode="eval")
5555

56-
is_valid_fstring(pattern) # Refactor this
56+
validate_fstring(pattern)
5757

5858
compiled = compile(parsed, "<string>", "eval")
5959
return str(eval(compiled, {}, context))

0 commit comments

Comments
 (0)