44from django .core .exceptions import ValidationError
55from django .db import models
66
7+ from sde_collections .models .delta_url import DeltaUrl
8+
79from ..utils .title_resolver import (
810 is_valid_fstring ,
911 is_valid_xpath ,
@@ -42,42 +44,63 @@ class MatchPatternTypeChoices(models.IntegerChoices):
def matched_urls(self):
    """Return the delta and curated URLs whose `url` matches this pattern.

    The stored `match_pattern` is regex-escaped first, so every character
    is matched literally. For an individual-URL pattern a trailing `$` is
    appended; for any other pattern type each `*` in the original pattern
    acts as a wildcard (`.*`).

    Returns:
        dict with two entries, "matching_delta_urls" and
        "matching_curated_urls", each the result of filtering the
        corresponding relation with `url__regex`.
    """
    pattern = re.escape(self.match_pattern)
    if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL:
        # Anchor the end so the pattern cannot match a longer URL.
        pattern = f"{pattern}$"
    else:
        # re.escape turned each "*" into "\*"; turn it back into a wildcard.
        pattern = pattern.replace(r"\*", ".*")

    url_filter = {"url__regex": pattern}
    return {
        "matching_delta_urls": self.delta_urls.filter(**url_filter),
        "matching_curated_urls": self.curated_urls.filter(**url_filter),
    }
6056
61- def _process_match_pattern (self ) -> str :
def generate_delta_url(self, curated_url, fields_to_copy=None):
    """Create, or back-fill, the DeltaUrl that mirrors `curated_url`.

    A DeltaUrl is looked up by this pattern's collection and the curated
    URL's `url`. If none exists it is created, seeded with the values of
    `fields_to_copy` read from `curated_url`. If one already exists, each
    field in `fields_to_copy` that is currently None or "" on the DeltaUrl
    is filled from `curated_url`, and the DeltaUrl is saved.

    Args:
        curated_url: object providing `url` and every attribute named in
            `fields_to_copy`.
        fields_to_copy: optional iterable of attribute names to copy; when
            empty or None nothing is copied and an existing DeltaUrl is
            left untouched.
    """
    lookup = {"collection": self.collection, "url": curated_url.url}
    initial_values = {name: getattr(curated_url, name) for name in (fields_to_copy or [])}
    delta_url, created = DeltaUrl.objects.get_or_create(defaults=initial_values, **lookup)

    if created or not fields_to_copy:
        return

    # Existing row: only fill gaps, never overwrite a value already present.
    # Currently used for scraped_title only, but works for any field name.
    for name in fields_to_copy:
        if getattr(delta_url, name, None) in (None, ""):
            setattr(delta_url, name, getattr(curated_url, name))
    delta_url.save()
75+
def apply(self, fields_to_copy=None, update_fields=None):
    """Apply this pattern to every URL it matches.

    Steps, in order:
      1. For each matching curated URL, call `generate_delta_url` so a
         DeltaUrl exists (copying `fields_to_copy` where given).
      2. If `update_fields` is given, write those values onto the
         matching delta URLs.
      3. Insert rows into the `delta_urls` and `curated_urls` through
         tables linking this pattern to the matching URLs.

    Args:
        fields_to_copy: optional list of attribute names forwarded to
            `generate_delta_url`.
        update_fields: optional mapping of field name -> value to set on
            the matching delta URLs.
    """
    matched_urls = self.matched_urls()

    for curated_url in matched_urls["matching_curated_urls"]:
        self.generate_delta_url(curated_url, fields_to_copy)

    if update_fields:
        # Set all fields in one UPDATE instead of one query per field.
        matched_urls["matching_delta_urls"].update(**update_fields)

    # Through-table column for the pattern side, e.g. "deltatitlepattern_id".
    pattern_fk = f"{self.__class__.__name__.lower()}_id"
    for field_name, matching in (
        ("delta_urls", matched_urls["matching_delta_urls"]),
        ("curated_urls", matched_urls["matching_curated_urls"]),
    ):
        through_model = getattr(self, field_name).through
        # "delta_urls" -> "delta_url_id", "curated_urls" -> "curated_url_id"
        url_fk = f"{field_name[:-1]}_id"
        links = [
            through_model(**{url_fk: url_id, pattern_fk: self.id})
            for url_id in matching.values_list("id", flat=True)
        ]
        # ignore_conflicts lets apply() be re-run without failing on links
        # that already exist.
        through_model.objects.bulk_create(links, ignore_conflicts=True)
7899
def unapply(self):
    """Default unapply: detach this pattern from all linked URLs.

    Clears the `delta_urls` relation first, then `curated_urls`. No
    fields on the URLs themselves are modified here.
    """
    for relation_name in ("delta_urls", "curated_urls"):
        getattr(self, relation_name).clear()
81104
82105 def save (self , * args , ** kwargs ):
83106 """Save the pattern and apply it."""
@@ -101,31 +124,8 @@ def __str__(self):
101124class DeltaExcludePattern (BaseMatchPattern ):
102125 reason = models .TextField ("Reason for excluding" , default = "" , blank = True )
103126
104- def apply (self ) -> None :
105- matched_urls = self .matched_urls ()
106-
107- # Define a mapping of model attributes to their related URL fields
108- url_mappings = {
109- "delta_urls" : matched_urls ["matching_delta_urls" ].values_list ("id" , flat = True ),
110- "curated_urls" : matched_urls ["matching_curated_urls" ].values_list ("id" , flat = True ),
111- }
112-
113- for field_name , url_ids in url_mappings .items ():
114- through_model = getattr (self , field_name ).through # Access the through model dynamically
115- bulk_data = [
116- through_model (** {f"{ field_name [:- 1 ]} _id" : url_id , "deltaexcludepattern_id" : self .id })
117- for url_id in url_ids
118- ]
119- through_model .objects .bulk_create (bulk_data )
120-
121- def unapply (self ) -> None :
122- # this is the new, suggested code
123- # self.delta_urls.clear()
124- # self.curated_urls.clear()
125- # this is the old code
126- # need to study later and decide which is better
127- "Unapplies automatically by deleting include pattern through objects in a cascade"
128- return
127+ # No need to override `apply`—we use the base class logic as-is.
128+ # This pattern's functionality is handled by the `excluded` annotation in the manager.
129129
130130 class Meta :
131131 verbose_name = "Exclude Pattern"
@@ -134,26 +134,7 @@ class Meta:
134134
135135
136136class DeltaIncludePattern (BaseMatchPattern ):
137- def apply (self ) -> None :
138- matched_urls = self .matched_urls ()
139-
140- # Define a mapping of model attributes to their related URL fields
141- url_mappings = {
142- "delta_urls" : matched_urls ["matching_delta_urls" ].values_list ("id" , flat = True ),
143- "curated_urls" : matched_urls ["matching_curated_urls" ].values_list ("id" , flat = True ),
144- }
145-
146- for field_name , url_ids in url_mappings .items ():
147- through_model = getattr (self , field_name ).through # Access the through model dynamically
148- bulk_data = [
149- through_model (** {f"{ field_name [:- 1 ]} _id" : url_id , "deltaincludepattern_id" : self .id })
150- for url_id in url_ids
151- ]
152- through_model .objects .bulk_create (bulk_data )
153-
154- def unapply (self ) -> None :
155- "Unapplies automatically by deleting includepattern through objects in a cascade"
156- return
137+ # No additional logic needed for `apply`—using base class functionality.
157138
158139 class Meta :
159140 verbose_name = "Include Pattern"
@@ -186,12 +167,13 @@ class DeltaTitlePattern(BaseMatchPattern):
186167 )
187168
188169 def apply (self ) -> None :
189- matched = self .matched_urls () # Now returns separate QuerySets for delta and curated URLs
190- updated_urls = []
170+ # Use `fields_to_copy` to copy `scraped_title` for any matching curated URLs.
171+ super ().apply (fields_to_copy = ["scraped_title" ])
172+
173+ matched = self .matched_urls () # Separate QuerySets for delta and curated URLs
191174 ResolvedTitle = apps .get_model ("sde_collections" , "ResolvedTitle" )
192175 ResolvedTitleError = apps .get_model ("sde_collections" , "ResolvedTitleError" )
193176
194- # Process both DeltaUrls and CuratedUrls
195177 for url_obj in matched ["matching_delta_urls" ] | matched ["matching_curated_urls" ]:
196178 context = {
197179 "url" : url_obj .url ,
@@ -201,16 +183,15 @@ def apply(self) -> None:
201183 try :
202184 generated_title = resolve_title (self .title_pattern , context )
203185
204- # Remove any existing resolved title for this URL
186+ # Remove existing resolved title entries for this URL
205187 ResolvedTitle .objects .filter (url = url_obj ).delete ()
206188
207- # Create new resolved title entry
189+ # Create a new resolved title entry
208190 ResolvedTitle .objects .create (title_pattern = self , url = url_obj , resolved_title = generated_title )
209191
210- # Update generated title and save it to the DeltaUrl or CuratedUrl
192+ # Update generated title and save it to DeltaUrl or CuratedUrl
211193 url_obj .generated_title = generated_title
212194 url_obj .save ()
213- updated_urls .append (url_obj )
214195
215196 except (ValueError , ValidationError ) as e :
216197 message = str (e )
@@ -225,30 +206,14 @@ def apply(self) -> None:
225206
226207 resolved_title_error .save ()
227208
228- # Associate pattern with both delta and curated URLs
229- for field_name , urls in {
230- "delta_urls" : matched ["matching_delta_urls" ],
231- "curated_urls" : matched ["matching_curated_urls" ],
232- }.items ():
233- through_model = getattr (self , field_name ).through
234- pattern_url_associations = [
235- through_model (deltatitlepattern_id = self .id , ** {f"{ field_name [:- 1 ]} _id" : url .id }) for url in urls
236- ]
237- through_model .objects .bulk_create (pattern_url_associations , ignore_conflicts = True )
238-
def unapply(self) -> None:
    """Blank the generated titles and detach this pattern from its URLs.

    `generated_title` is reset to "" and saved on each linked delta URL
    only; curated URLs are left unmodified. Afterwards both the
    `delta_urls` and `curated_urls` relations are cleared.
    """
    for delta_url in self.delta_urls.all():
        delta_url.generated_title = ""
        delta_url.save()

    for relation_name in ("delta_urls", "curated_urls"):
        getattr(self, relation_name).clear()
246216
247- def delete (self , * args , ** kwargs ):
248- """Ensures unapply is called before deletion."""
249- self .unapply ()
250- super ().delete (* args , ** kwargs )
251-
252217 class Meta :
253218 verbose_name = "Title Pattern"
254219 verbose_name_plural = "Title Patterns"
@@ -258,27 +223,13 @@ class Meta:
258223class DeltaDocumentTypePattern (BaseMatchPattern ):
259224 document_type = models .IntegerField (choices = DocumentTypes .choices )
260225
226+ # We use `update_fields` in the base apply method to set `document_type`.
def apply(self) -> None:
    """Apply this pattern via the base-class `apply`.

    The pattern's `document_type` is passed as `update_fields`; all
    matching and linking work is delegated to the base-class `apply`.
    """
    field_values = {"document_type": self.document_type}
    super().apply(update_fields=field_values)
276229
def unapply(self) -> None:
    """Reset `document_type` on linked delta URLs, then detach all URLs.

    Only the linked delta URLs have `document_type` set back to None;
    curated URLs are dissociated from the pattern but not modified.
    """
    self.delta_urls.update(**{"document_type": None})
    for relation_name in ("delta_urls", "curated_urls"):
        getattr(self, relation_name).clear()
284235
@@ -291,28 +242,14 @@ class Meta:
291242class DeltaDivisionPattern (BaseMatchPattern ):
292243 division = models .IntegerField (choices = Divisions .choices )
293244
245+ # We use `update_fields` in the base apply method to set `division`.
def apply(self) -> None:
    """Apply this pattern via the base-class `apply`.

    The pattern's `division` is passed as `update_fields`; all matching
    and linking work is delegated to the base-class `apply`.
    """
    field_values = {"division": self.division}
    super().apply(update_fields=field_values)
308248
def unapply(self) -> None:
    """Reset `division` on linked delta URLs, then detach all URLs.

    Only the linked delta URLs have `division` set back to None; curated
    URLs are dissociated from the pattern but not modified.
    """
    # TODO: need to double check this logic for complicated cases
    self.delta_urls.update(division=None)
    # Drop the pattern <-> URL links as well, matching the base-class
    # unapply and DeltaDocumentTypePattern.unapply. Without this, URLs
    # that matched before stay linked to the pattern after it is unapplied.
    # NOTE(review): confirm this is the desired behavior when several
    # division patterns overlap on the same URL.
    self.delta_urls.clear()
    self.curated_urls.clear()
316253
317254 class Meta :
318255 verbose_name = "Division Pattern"
0 commit comments