@@ -156,3 +156,65 @@ def test_field_modifying_pattern_layered_specificity():
156
156
assert mid_tool .pk in broad_pattern .delta_urls .values_list ("pk" , flat = True )
157
157
158
158
assert top_tool .pk in broad_pattern .delta_urls .values_list ("pk" , flat = True )
159
+
160
+
161
@pytest.mark.django_db
def test_pattern_specificity_tiebreaker():
    """Check how competing title patterns are ranked for a single URL.

    Two scenarios are asserted:

    1. Tie on match count: two patterns each report a match count of 2; the
       test expects the one with the longer ``match_pattern`` string to be
       reported as the most distinctive pattern for both URLs.
    2. Different match counts: a third pattern reports a match count of 1;
       the test expects it to be the most distinctive for the first URL,
       while the longer of the first two patterns stays the most distinctive
       for the second URL.
    """
    collection = CollectionFactory()

    # Two sibling URLs under the same path, so that broad wildcard patterns
    # cover both of them.
    first_url = DeltaUrlFactory(
        collection=collection,
        url="https://example.com/docs/specific/item1.html",
        scraped_title="Title 1",
    )
    second_url = DeltaUrlFactory(
        collection=collection,
        url="https://example.com/docs/specific/item2.html",
        scraped_title="Title 2",
    )

    def make_pattern(match_pattern, title_pattern):
        # NOTE(review): match_pattern_type=2 is presumably the wildcard /
        # multi-URL pattern type -- confirm against the model's choices.
        return DeltaTitlePattern.objects.create(
            collection=collection,
            match_pattern=match_pattern,
            title_pattern=title_pattern,
            match_pattern_type=2,
        )

    # --- Scenario 1: identical match counts, different pattern lengths ---
    broad = make_pattern("*docs*", "{title}")
    narrow = make_pattern("*docs/specific*", "{title} - Specific")

    assert broad.get_url_match_count() == 2
    assert narrow.get_url_match_count() == 2

    # With the counts tied, the longer pattern is expected to win for every URL.
    for delta_url in (first_url, second_url):
        assert broad.is_most_distinctive_pattern(delta_url) is False
        assert narrow.is_most_distinctive_pattern(delta_url) is True

    # --- Scenario 2: a pattern with a strictly smaller match count ---
    narrowest = make_pattern("*docs/specific/item1*", "{title} - Very Specific")

    assert narrowest.get_url_match_count() == 1

    # First URL: the pattern with the fewest matches is expected to take over.
    assert broad.is_most_distinctive_pattern(first_url) is False
    assert narrow.is_most_distinctive_pattern(first_url) is False
    assert narrowest.is_most_distinctive_pattern(first_url) is True

    # Second URL: the test expects the newest pattern not to be distinctive
    # here, so the length tiebreaker between the first two still decides.
    assert broad.is_most_distinctive_pattern(second_url) is False
    assert narrow.is_most_distinctive_pattern(second_url) is True
    assert narrowest.is_most_distinctive_pattern(second_url) is False
0 commit comments