Skip to content

Commit b378388

Browse files
committed
implement smallest set rule application for titles
1 parent 3e0f399 commit b378388

File tree

2 files changed

+117
-2
lines changed

2 files changed

+117
-2
lines changed

sde_collections/models/delta_patterns.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,35 @@ def generate_title_for_url(self, url_obj) -> tuple[str, str | None]:
388388
except (ValueError, ValidationError) as e:
389389
return None, str(e)
390390

391+
def get_url_match_count(self):
    """
    Count the distinct URL strings matched by this pattern.

    The ``url`` values of the matching delta URLs and of the matching curated
    URLs are pooled into one set, so an address that appears in both groups
    is only counted once.
    """
    matched_addresses = set(self.get_matching_delta_urls().values_list("url", flat=True))
    matched_addresses.update(self.get_matching_curated_urls().values_list("url", flat=True))
    return len(matched_addresses)
398+
399+
def is_most_distinctive_pattern(self, url) -> bool:
    """
    Report whether this pattern is the narrowest one that matches ``url``.

    "Narrowest" means that no other title pattern in the same collection both
    matches the URL and covers strictly fewer URLs than this pattern does.
    A rival with an equal match count does not displace this pattern.

    ``url`` is a URL object whose address is read from its ``url`` attribute.
    Returns True if this pattern should be applied.
    """
    own_count = self.get_url_match_count()

    # Every other title pattern of this collection is a potential rival;
    # this pattern itself is left out of the comparison.
    rivals = DeltaTitlePattern.objects.filter(collection=self.collection).exclude(id=self.id)

    # A rival only wins when its regex matches this URL AND it covers fewer
    # URLs overall. The count is evaluated lazily, only for matching rivals.
    outranked = any(
        re.match(rival.get_regex_pattern(), url.url) and rival.get_url_match_count() < own_count
        for rival in rivals
    )
    return not outranked
419+
391420
def apply(self) -> None:
392421
"""
393422
Apply the title pattern to matching URLs:
@@ -408,6 +437,9 @@ def apply(self) -> None:
408437

409438
# Process each previously unaffected curated URL
410439
for curated_url in previously_unaffected_curated:
440+
if not self.is_most_distinctive_pattern(curated_url):
441+
continue
442+
411443
new_title, error = self.generate_title_for_url(curated_url)
412444

413445
if error:
@@ -439,15 +471,19 @@ def apply(self) -> None:
439471

440472
# Update titles for all matching Delta URLs
441473
for delta_url in self.get_matching_delta_urls():
474+
if not self.is_most_distinctive_pattern(delta_url):
475+
continue
476+
442477
new_title, error = self.generate_title_for_url(delta_url)
443478

444479
if error:
445480
DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=delta_url, error_string=error)
446481
continue
447482

448-
# Update title and record resolution
483+
# Update title and record resolution - key change here
449484
DeltaResolvedTitle.objects.update_or_create(
450-
title_pattern=self, delta_url=delta_url, defaults={"resolved_title": new_title}
485+
delta_url=delta_url, # Only use delta_url for lookup
486+
defaults={"title_pattern": self, "resolved_title": new_title},
451487
)
452488

453489
delta_url.generated_title = new_title
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_patterns.py
2+
3+
import pytest
4+
5+
from sde_collections.models.delta_patterns import DeltaResolvedTitle, DeltaTitlePattern
6+
from sde_collections.models.delta_url import DeltaUrl
7+
from sde_collections.tests.factories import CollectionFactory, DeltaUrlFactory
8+
9+
10+
@pytest.mark.django_db
def test_title_pattern_multiple_resolved_titles_extended():
    """Check that the pattern matching the fewest URLs decides each URL's title."""
    collection = CollectionFactory()
    docs_title = "Original Title - Docs"
    html_title = "Original Title - HTML"

    def make_delta_url(address):
        # All fixture URLs share the same collection and scraped title.
        return DeltaUrlFactory(collection=collection, url=address, scraped_title="Original Title")

    def stored_title(delta_url):
        # Re-read from the database so changes made by pattern application are visible.
        return DeltaUrl.objects.get(pk=delta_url.pk).generated_title

    # Three URLs under /docs; only the last one also sits under /docs/pdfs.
    url1 = make_delta_url("https://example.com/docs/item.html")
    url2 = make_delta_url("https://example.com/docs/item2.html")
    url3 = make_delta_url("https://example.com/docs/pdfs/item1.html")
    every_url = [url1, url2, url3]

    # Broad pattern: expected to match all three URLs.
    general_pattern = DeltaTitlePattern.objects.create(
        collection=collection,
        match_pattern="*docs*",
        title_pattern="{title} - Docs",
        match_pattern_type=2,
    )

    # The assertions below expect the pattern to have been applied on creation.
    assert general_pattern.get_url_match_count() == 3
    for delta_url in every_url:
        assert stored_title(delta_url) == docs_title

    # One resolved-title record per URL, all owned by the broad pattern.
    assert DeltaResolvedTitle.objects.count() == 3
    for delta_url in every_url:
        record = DeltaResolvedTitle.objects.get(delta_url=delta_url)
        assert record.title_pattern == general_pattern
        assert record.resolved_title == docs_title

    # Narrow pattern: expected to match only the /docs/pdfs URL.
    specific_pattern = DeltaTitlePattern.objects.create(
        collection=collection, match_pattern="*docs/pdfs*", title_pattern="{title} - HTML", match_pattern_type=2
    )

    assert specific_pattern.get_url_match_count() == 1
    assert general_pattern.get_url_match_count() == 3

    # Only the URL matched by the narrow pattern gets a new title.
    assert stored_title(url1) == docs_title
    assert stored_title(url2) == docs_title
    assert stored_title(url3) == html_title

    # Still exactly one resolved-title record per URL.
    assert DeltaResolvedTitle.objects.count() == 3

    # The two plain /docs URLs keep their record from the broad pattern.
    for delta_url in (url1, url2):
        record = DeltaResolvedTitle.objects.get(delta_url=delta_url)
        assert record.title_pattern == general_pattern
        assert record.resolved_title == docs_title

    # The /docs/pdfs URL's record now belongs to the narrow pattern.
    pdf_record = DeltaResolvedTitle.objects.get(delta_url=url3)
    assert pdf_record.title_pattern == specific_pattern
    assert pdf_record.resolved_title == html_title

    # Pattern-to-URL relationships: the broad pattern keeps all three URLs,
    # and the narrow pattern is also linked to the /docs/pdfs URL.
    for delta_url in every_url:
        assert delta_url in general_pattern.delta_urls.all()
    assert url3 in specific_pattern.delta_urls.all()

0 commit comments

Comments
 (0)