Skip to content

Commit b563f1e

Browse files
committed
rewrite apply logic
1 parent 4c07dc5 commit b563f1e

File tree

2 files changed

+99
-134
lines changed

2 files changed

+99
-134
lines changed

sde_collections/models/delta_patterns.py

Lines changed: 70 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from django.core.exceptions import ValidationError
55
from django.db import models
66

7+
from sde_collections.models.delta_url import DeltaUrl
8+
79
from ..utils.title_resolver import (
810
is_valid_fstring,
911
is_valid_xpath,
@@ -42,42 +44,63 @@ class MatchPatternTypeChoices(models.IntegerChoices):
4244
def matched_urls(self):
4345
"""Find all the urls matching the pattern."""
4446
escaped_match_pattern = re.escape(self.match_pattern)
45-
if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL:
46-
regex_pattern = f"{escaped_match_pattern}$"
47-
elif self.match_pattern_type == self.MatchPatternTypeChoices.MULTI_URL_PATTERN:
48-
regex_pattern = escaped_match_pattern.replace(r"\*", ".*") # allow * wildcards
49-
else:
50-
raise NotImplementedError
51-
52-
# Filter both DeltaUrls and CuratedUrls
53-
matching_delta_urls = self.delta_urls.filter(url__regex=regex_pattern)
54-
matching_curated_urls = self.curated_urls.filter(url__regex=regex_pattern)
55-
47+
regex_pattern = (
48+
f"{escaped_match_pattern}$"
49+
if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL
50+
else escaped_match_pattern.replace(r"\*", ".*")
51+
)
5652
return {
57-
"matching_delta_urls": matching_delta_urls,
58-
"matching_curated_urls": matching_curated_urls,
53+
"matching_delta_urls": self.delta_urls.filter(url__regex=regex_pattern),
54+
"matching_curated_urls": self.curated_urls.filter(url__regex=regex_pattern),
5955
}
6056

61-
def _process_match_pattern(self) -> str:
57+
def generate_delta_url(self, curated_url, fields_to_copy=None):
6258
"""
63-
Multi-Url patterns need a star at the beginning and at the end
64-
Individual Url Patterns need a star at the beginning
59+
Generates or updates a DeltaUrl based on a CuratedUrl.
60+
Only specified fields are copied if fields_to_copy is provided.
6561
"""
66-
# we don't trust the bracketing stars from the system, so we remove any
67-
processed_pattern = self.match_pattern.strip().strip("*").strip()
68-
if not processed_pattern.startswith("http"):
69-
# if it doesn't begin with http, it must need a star at the beginning
70-
processed_pattern = f"*{processed_pattern}"
71-
if self.match_pattern_type == BaseMatchPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN:
72-
# all multi urls should have a star at the end, but individuals should not
73-
processed_pattern = f"{processed_pattern}*"
74-
return processed_pattern
75-
76-
def apply(self):
77-
raise NotImplementedError
62+
delta_url, created = DeltaUrl.objects.get_or_create(
63+
collection=self.collection,
64+
url=curated_url.url,
65+
defaults={field: getattr(curated_url, field) for field in (fields_to_copy or [])},
66+
)
67+
if not created and fields_to_copy:
68+
# Update only if certain fields are missing in DeltaUrl
69+
# in the current codebase, this is only executed for scraped_title, but this
70+
# can be extended to other fields as well, if we add a pattern that requires it
71+
for field in fields_to_copy:
72+
if getattr(delta_url, field, None) in [None, ""]:
73+
setattr(delta_url, field, getattr(curated_url, field))
74+
delta_url.save()
75+
76+
def apply(self, fields_to_copy=None, update_fields=None):
77+
matched_urls = self.matched_urls()
78+
79+
# Iterate over matched CuratedUrls to create or update DeltaUrls as needed
80+
for curated_url in matched_urls["matching_curated_urls"]:
81+
self.generate_delta_url(curated_url, fields_to_copy)
82+
83+
# Apply any updates to DeltaUrls based on update_fields
84+
if update_fields:
85+
for field, value in update_fields.items():
86+
matched_urls["matching_delta_urls"].update(**{field: value})
87+
88+
# Populate through tables for DeltaUrl and CuratedUrl relationships
89+
for field_name, url_ids in {
90+
"delta_urls": matched_urls["matching_delta_urls"].values_list("id", flat=True),
91+
"curated_urls": matched_urls["matching_curated_urls"].values_list("id", flat=True),
92+
}.items():
93+
through_model = getattr(self, field_name).through
94+
bulk_data = [
95+
through_model(**{f"{field_name[:-1]}_id": url_id, f"{self.__class__.__name__.lower()}_id": self.id})
96+
for url_id in url_ids
97+
]
98+
through_model.objects.bulk_create(bulk_data, ignore_conflicts=True)
7899

79100
def unapply(self):
80-
raise NotImplementedError
101+
"""Default unapply behavior."""
102+
self.delta_urls.clear()
103+
self.curated_urls.clear()
81104

82105
def save(self, *args, **kwargs):
83106
"""Save the pattern and apply it."""
@@ -101,31 +124,8 @@ def __str__(self):
101124
class DeltaExcludePattern(BaseMatchPattern):
102125
reason = models.TextField("Reason for excluding", default="", blank=True)
103126

104-
def apply(self) -> None:
105-
matched_urls = self.matched_urls()
106-
107-
# Define a mapping of model attributes to their related URL fields
108-
url_mappings = {
109-
"delta_urls": matched_urls["matching_delta_urls"].values_list("id", flat=True),
110-
"curated_urls": matched_urls["matching_curated_urls"].values_list("id", flat=True),
111-
}
112-
113-
for field_name, url_ids in url_mappings.items():
114-
through_model = getattr(self, field_name).through # Access the through model dynamically
115-
bulk_data = [
116-
through_model(**{f"{field_name[:-1]}_id": url_id, "deltaexcludepattern_id": self.id})
117-
for url_id in url_ids
118-
]
119-
through_model.objects.bulk_create(bulk_data)
120-
121-
def unapply(self) -> None:
122-
# this is the new, suggested code
123-
# self.delta_urls.clear()
124-
# self.curated_urls.clear()
125-
# this is the old code
126-
# need to study later and decide which is better
127-
"Unapplies automatically by deleting include pattern through objects in a cascade"
128-
return
127+
# No need to override `apply`—we use the base class logic as-is.
128+
# This pattern's functionality is handled by the `excluded` annotation in the manager.
129129

130130
class Meta:
131131
verbose_name = "Exclude Pattern"
@@ -134,26 +134,7 @@ class Meta:
134134

135135

136136
class DeltaIncludePattern(BaseMatchPattern):
137-
def apply(self) -> None:
138-
matched_urls = self.matched_urls()
139-
140-
# Define a mapping of model attributes to their related URL fields
141-
url_mappings = {
142-
"delta_urls": matched_urls["matching_delta_urls"].values_list("id", flat=True),
143-
"curated_urls": matched_urls["matching_curated_urls"].values_list("id", flat=True),
144-
}
145-
146-
for field_name, url_ids in url_mappings.items():
147-
through_model = getattr(self, field_name).through # Access the through model dynamically
148-
bulk_data = [
149-
through_model(**{f"{field_name[:-1]}_id": url_id, "deltaincludepattern_id": self.id})
150-
for url_id in url_ids
151-
]
152-
through_model.objects.bulk_create(bulk_data)
153-
154-
def unapply(self) -> None:
155-
"Unapplies automatically by deleting includepattern through objects in a cascade"
156-
return
137+
# No additional logic needed for `apply`—using base class functionality.
157138

158139
class Meta:
159140
verbose_name = "Include Pattern"
@@ -186,12 +167,13 @@ class DeltaTitlePattern(BaseMatchPattern):
186167
)
187168

188169
def apply(self) -> None:
189-
matched = self.matched_urls() # Now returns separate QuerySets for delta and curated URLs
190-
updated_urls = []
170+
# Use `fields_to_copy` to copy `scraped_title` for any matching curated URLs.
171+
super().apply(fields_to_copy=["scraped_title"])
172+
173+
matched = self.matched_urls() # Separate QuerySets for delta and curated URLs
191174
ResolvedTitle = apps.get_model("sde_collections", "ResolvedTitle")
192175
ResolvedTitleError = apps.get_model("sde_collections", "ResolvedTitleError")
193176

194-
# Process both DeltaUrls and CuratedUrls
195177
for url_obj in matched["matching_delta_urls"] | matched["matching_curated_urls"]:
196178
context = {
197179
"url": url_obj.url,
@@ -201,16 +183,15 @@ def apply(self) -> None:
201183
try:
202184
generated_title = resolve_title(self.title_pattern, context)
203185

204-
# Remove any existing resolved title for this URL
186+
# Remove existing resolved title entries for this URL
205187
ResolvedTitle.objects.filter(url=url_obj).delete()
206188

207-
# Create new resolved title entry
189+
# Create a new resolved title entry
208190
ResolvedTitle.objects.create(title_pattern=self, url=url_obj, resolved_title=generated_title)
209191

210-
# Update generated title and save it to the DeltaUrl or CuratedUrl
192+
# Update generated title and save it to DeltaUrl or CuratedUrl
211193
url_obj.generated_title = generated_title
212194
url_obj.save()
213-
updated_urls.append(url_obj)
214195

215196
except (ValueError, ValidationError) as e:
216197
message = str(e)
@@ -225,30 +206,14 @@ def apply(self) -> None:
225206

226207
resolved_title_error.save()
227208

228-
# Associate pattern with both delta and curated URLs
229-
for field_name, urls in {
230-
"delta_urls": matched["matching_delta_urls"],
231-
"curated_urls": matched["matching_curated_urls"],
232-
}.items():
233-
through_model = getattr(self, field_name).through
234-
pattern_url_associations = [
235-
through_model(deltatitlepattern_id=self.id, **{f"{field_name[:-1]}_id": url.id}) for url in urls
236-
]
237-
through_model.objects.bulk_create(pattern_url_associations, ignore_conflicts=True)
238-
239209
def unapply(self) -> None:
240210
"""Clears generated titles and dissociates URLs from the pattern."""
241-
for url_obj in self.delta_urls.all() | self.curated_urls.all():
211+
for url_obj in self.delta_urls.all():
242212
url_obj.generated_title = ""
243213
url_obj.save()
244214
self.delta_urls.clear()
245215
self.curated_urls.clear()
246216

247-
def delete(self, *args, **kwargs):
248-
"""Ensures unapply is called before deletion."""
249-
self.unapply()
250-
super().delete(*args, **kwargs)
251-
252217
class Meta:
253218
verbose_name = "Title Pattern"
254219
verbose_name_plural = "Title Patterns"
@@ -258,27 +223,13 @@ class Meta:
258223
class DeltaDocumentTypePattern(BaseMatchPattern):
259224
document_type = models.IntegerField(choices=DocumentTypes.choices)
260225

226+
# We use `update_fields` in the base apply method to set `document_type`.
261227
def apply(self) -> None:
262-
matched = self.matched_urls()
263-
# Apply the document type to both DeltaUrls and CuratedUrls
264-
for field_name, urls in {
265-
"delta_urls": matched["matching_delta_urls"],
266-
"curated_urls": matched["matching_curated_urls"],
267-
}.items():
268-
urls.update(document_type=self.document_type) # Update the document type for matched URLs
269-
# Bulk create associations in the through table
270-
through_model = getattr(self, field_name).through
271-
pattern_url_associations = [
272-
through_model(**{f"{field_name[:-1]}_id": url.id, "deltadocumenttypepattern_id": self.id})
273-
for url in urls
274-
]
275-
through_model.objects.bulk_create(pattern_url_associations, ignore_conflicts=True)
228+
super().apply(update_fields={"document_type": self.document_type})
276229

277230
def unapply(self) -> None:
278231
"""Clear document type from associated delta and curated URLs."""
279-
for url_obj in self.delta_urls.all() | self.curated_urls.all():
280-
url_obj.document_type = None
281-
url_obj.save()
232+
self.delta_urls.update(document_type=None)
282233
self.delta_urls.clear()
283234
self.curated_urls.clear()
284235

@@ -291,28 +242,14 @@ class Meta:
291242
class DeltaDivisionPattern(BaseMatchPattern):
292243
division = models.IntegerField(choices=Divisions.choices)
293244

245+
# We use `update_fields` in the base apply method to set `division`.
294246
def apply(self) -> None:
295-
matched = self.matched_urls()
296-
# Apply the division to both DeltaUrls and CuratedUrls
297-
for field_name, urls in {
298-
"delta_urls": matched["matching_delta_urls"],
299-
"curated_urls": matched["matching_curated_urls"],
300-
}.items():
301-
urls.update(division=self.division) # Update the division for matched URLs
302-
# Bulk create associations in the through table
303-
through_model = getattr(self, field_name).through
304-
pattern_url_associations = [
305-
through_model(**{f"{field_name[:-1]}_id": url.id, "deltadivisionpattern_id": self.id}) for url in urls
306-
]
307-
through_model.objects.bulk_create(pattern_url_associations, ignore_conflicts=True)
247+
super().apply(update_fields={"division": self.division})
308248

309249
def unapply(self) -> None:
310250
"""Clear division from associated delta and curated URLs."""
311-
for url_obj in self.delta_urls.all() | self.curated_urls.all():
312-
url_obj.division = None
313-
url_obj.save()
314-
self.delta_urls.clear()
315-
self.curated_urls.clear()
251+
# TODO: need to double-check this logic for complicated cases
252+
self.delta_urls.update(division=None)
316253

317254
class Meta:
318255
verbose_name = "Division Pattern"

sde_collections/models/delta_url.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from django.db import models
55

66
from .collection_choice_fields import Divisions, DocumentTypes
7-
from .delta_patterns import DeltaExcludePattern
7+
from .delta_patterns import DeltaExcludePattern, DeltaTitlePattern
88

99

1010
class DeltaUrlQuerySet(models.QuerySet):
@@ -108,3 +108,31 @@ class CuratedUrl(BaseUrl):
108108
"""Urls that are curated and ready for production"""
109109

110110
objects = CuratedUrlManager()
111+
112+
113+
class DeltaResolvedTitleBase(models.Model):
114+
# TODO: need to understand this logic and whether we need to have these match to CuratedUrls as well
115+
title_pattern = models.ForeignKey(DeltaTitlePattern, on_delete=models.CASCADE)
116+
delta_url = models.OneToOneField(DeltaUrl, on_delete=models.CASCADE)
117+
created_at = models.DateTimeField(auto_now_add=True)
118+
119+
class Meta:
120+
abstract = True
121+
122+
123+
class DeltaResolvedTitle(DeltaResolvedTitleBase):
124+
resolved_title = models.CharField(blank=True, default="")
125+
126+
class Meta:
127+
verbose_name = "Resolved Title"
128+
verbose_name_plural = "Resolved Titles"
129+
130+
def save(self, *args, **kwargs):
131+
# Finds the linked delta URL and deletes DeltaResolvedTitleError objects linked to it
132+
DeltaResolvedTitleError.objects.filter(delta_url=self.delta_url).delete()
133+
super().save(*args, **kwargs)
134+
135+
136+
class DeltaResolvedTitleError(DeltaResolvedTitleBase):
137+
error_string = models.TextField(null=False, blank=False)
138+
http_status_code = models.IntegerField(null=True, blank=True)

0 commit comments

Comments
 (0)