|
1 | 1 | # docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_promote_collection.py
|
2 | 2 | import pytest
|
3 | 3 |
|
| 4 | +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes |
4 | 5 | from sde_collections.models.delta_patterns import (
|
| 6 | + DeltaDivisionPattern, |
| 7 | + DeltaDocumentTypePattern, |
5 | 8 | DeltaExcludePattern,
|
6 | 9 | DeltaIncludePattern,
|
7 | 10 | DeltaTitlePattern,
|
8 | 11 | )
|
9 |
| -from sde_collections.models.delta_url import CuratedUrl, DeltaUrl |
| 12 | +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl |
10 | 13 | from sde_collections.tests.factories import CollectionFactory
|
11 | 14 |
|
12 | 15 |
|
@@ -208,3 +211,182 @@ def test_promotion_with_title_change():
|
208 | 211 |
|
209 | 212 | # This should trigger the same error we're seeing in production
|
210 | 213 | collection.promote_to_curated()
|
| 214 | + |
| 215 | + |
| 216 | +@pytest.mark.django_db |
| 217 | +def test_promotion_maintains_pattern_relationships_through_updates(collection): |
| 218 | + """Check that re-promoting an updated DeltaUrl keeps the CuratedUrl id and the title pattern's curated_urls unchanged.""" |
| 219 | + # Initial setup: one delta URL plus a title pattern whose match_pattern is "example.com" |
| 220 | + DeltaUrl.objects.create(collection=collection, url="https://example.com", scraped_title="Title") |
| 221 | + pattern = DeltaTitlePattern.objects.create( |
| 222 | + collection=collection, match_pattern="example.com", match_pattern_type=1, title_pattern="Pattern: {title}" |
| 223 | + ) |
| 224 | + |
| 225 | + collection.promote_to_curated() |
| 226 | + |
| 227 | + # Record the curated row's id and the pattern's curated URLs after the first promotion |
| 228 | + curated = CuratedUrl.objects.get(url="https://example.com") |
| 229 | + initial_id = curated.id |
| 230 | + initial_pattern_relations = list(pattern.curated_urls.all()) |
| 231 | + |
| 232 | + # Create a new delta for the same URL with a different scraped_title, then promote again |
| 233 | + DeltaUrl.objects.create(collection=collection, url="https://example.com", scraped_title="New Title") |
| 234 | + collection.promote_to_curated() |
| 235 | + |
| 236 | + # Verify the second promotion left the id and the pattern's curated_urls as they were |
| 237 | + curated.refresh_from_db() |
| 238 | + assert curated.id == initial_id # ID should not change |
| 239 | + assert list(pattern.curated_urls.all()) == initial_pattern_relations |
| 240 | + |
| 241 | + |
| 242 | +@pytest.mark.django_db |
| 243 | +def test_sequential_promotions_with_multiple_patterns(collection): |
| 244 | + """Check that CuratedUrl ids survive repeated promotions and that a title pattern created after a promotion is linked to curated URLs only once promotion runs again.""" |
| 245 | + # Initial setup with two URLs |
| 246 | + urls = ["https://example.com/doc", "https://example.com/guide"] |
| 247 | + for url in urls: |
| 248 | + DeltaUrl.objects.create(collection=collection, url=url, scraped_title=f"Title for {url}") |
| 249 | + |
| 250 | + # First pattern (matches "doc") and first promotion |
| 251 | + pattern1 = DeltaTitlePattern.objects.create( |
| 252 | + collection=collection, match_pattern="doc", match_pattern_type=2, title_pattern="Doc: {title}" |
| 253 | + ) |
| 254 | + |
| 255 | + collection.promote_to_curated() |
| 256 | + |
| 257 | + # Record each curated row's id after the first promotion |
| 258 | + initial_ids = {url: CuratedUrl.objects.get(url=url).id for url in urls} |
| 259 | + |
| 260 | + DeltaUrl.objects.create(collection=collection, url="https://example.com/guide", scraped_title="Updated guide") |
| 261 | + collection.promote_to_curated() |
| 262 | + |
| 263 | + pattern2 = DeltaTitlePattern.objects.create( |
| 264 | + collection=collection, match_pattern="guide", match_pattern_type=2, title_pattern="Guide: {title}" |
| 265 | + ) |
| 266 | + assert not pattern2.curated_urls.filter(url__contains="guide").exists() |
| 267 | + |
| 268 | + # Verify both curated rows kept the ids recorded after the first promotion |
| 269 | + for url in urls: |
| 270 | + curated = CuratedUrl.objects.get(url=url) |
| 271 | + assert curated.id == initial_ids[url] # IDs should be preserved |
| 272 | + |
| 273 | + collection.promote_to_curated() |
| 274 | + |
| 275 | + # Verify each pattern is linked to its matching curated URL after the final promotion |
| 276 | + assert pattern1.curated_urls.filter(url__contains="doc").exists() |
| 277 | + assert pattern2.curated_urls.filter(url__contains="guide").exists() |
| 278 | + |
| 279 | + |
| 280 | +@pytest.mark.django_db |
| 281 | +def test_promotion_with_division_changes(collection): |
| 282 | + """Check that promoted CuratedUrls carry the divisions assigned by the matching DeltaDivisionPatterns.""" |
| 283 | + # Initial setup: two delta URLs, both starting in the GENERAL division |
| 284 | + DeltaUrl.objects.create(collection=collection, url="https://example.com/astrophysics", division=Divisions.GENERAL) |
| 285 | + DeltaUrl.objects.create(collection=collection, url="https://example.com/helio", division=Divisions.GENERAL) |
| 286 | + |
| 287 | + # Create division patterns (no explicit apply() call here; presumably applied on create -- TODO confirm) |
| 288 | + DeltaDivisionPattern.objects.create( |
| 289 | + collection=collection, |
| 290 | + match_pattern="astrophysics", |
| 291 | + match_pattern_type=2, # Multi-URL pattern |
| 292 | + division=Divisions.ASTROPHYSICS, |
| 293 | + ) |
| 294 | + |
| 295 | + DeltaDivisionPattern.objects.create( |
| 296 | + collection=collection, match_pattern="helio", match_pattern_type=2, division=Divisions.HELIOPHYSICS |
| 297 | + ) |
| 298 | + |
| 299 | + # Promote and verify each curated URL ended up with its pattern's division |
| 300 | + collection.promote_to_curated() |
| 301 | + |
| 302 | + assert CuratedUrl.objects.get(url__contains="astrophysics").division == Divisions.ASTROPHYSICS |
| 303 | + assert CuratedUrl.objects.get(url__contains="helio").division == Divisions.HELIOPHYSICS |
| 304 | + |
| 305 | + |
| 306 | +@pytest.mark.django_db |
| 307 | +def test_promotion_with_document_type_changes(collection): |
| 308 | + """Check that promoted CuratedUrls carry the document types assigned by the matching DeltaDocumentTypePatterns.""" |
| 309 | + # Create two delta URLs, both starting with the DOCUMENTATION document type |
| 310 | + DeltaUrl.objects.create( |
| 311 | + collection=collection, url="https://example.com/data/set1", document_type=DocumentTypes.DOCUMENTATION |
| 312 | + ) |
| 313 | + DeltaUrl.objects.create( |
| 314 | + collection=collection, url="https://example.com/tools/tool1", document_type=DocumentTypes.DOCUMENTATION |
| 315 | + ) |
| 316 | + |
| 317 | + # Set up one pattern per target doc type and call apply() on each explicitly |
| 318 | + DeltaDocumentTypePattern.objects.create( |
| 319 | + collection=collection, match_pattern="data/*", match_pattern_type=2, document_type=DocumentTypes.DATA |
| 320 | + ).apply() |
| 321 | + |
| 322 | + DeltaDocumentTypePattern.objects.create( |
| 323 | + collection=collection, match_pattern="tools/*", match_pattern_type=2, document_type=DocumentTypes.SOFTWARETOOLS |
| 324 | + ).apply() |
| 325 | + |
| 326 | + collection.promote_to_curated() |
| 327 | + |
| 328 | + # Verify each curated URL ended up with its pattern's document type |
| 329 | + assert CuratedUrl.objects.get(url__contains="/data/").document_type == DocumentTypes.DATA |
| 330 | + assert CuratedUrl.objects.get(url__contains="/tools/").document_type == DocumentTypes.SOFTWARETOOLS |
| 331 | + |
| 332 | + |
| 333 | +@pytest.mark.django_db |
| 334 | +def test_promotion_with_multiple_metadata_changes_dump(collection): |
| 335 | + """Check that division, document type and title patterns still shape the CuratedUrl after a second dump -> delta -> curated cycle with a new scraped title.""" |
| 336 | + # Single URL used throughout; it contains both "helio" and "data" so all three patterns below target it |
| 337 | + url = "https://example.com/helio/data" |
| 338 | + |
| 339 | + # Create initial DumpUrl with GENERAL / DOCUMENTATION metadata |
| 340 | + DumpUrl.objects.create( |
| 341 | + collection=collection, |
| 342 | + url=url, |
| 343 | + division=Divisions.GENERAL, |
| 344 | + document_type=DocumentTypes.DOCUMENTATION, |
| 345 | + scraped_title="Raw Data Title", |
| 346 | + ) |
| 347 | + |
| 348 | + # Migrate DumpUrls to DeltaUrls |
| 349 | + collection.migrate_dump_to_delta() |
| 350 | + |
| 351 | + # Create division, document type and title patterns that will affect this URL |
| 352 | + DeltaDivisionPattern.objects.create( |
| 353 | + collection=collection, match_pattern="*helio*", match_pattern_type=2, division=Divisions.HELIOPHYSICS |
| 354 | + ) |
| 355 | + |
| 356 | + DeltaDocumentTypePattern.objects.create( |
| 357 | + collection=collection, match_pattern="*data*", match_pattern_type=2, document_type=DocumentTypes.DATA |
| 358 | + ) |
| 359 | + |
| 360 | + DeltaTitlePattern.objects.create( |
| 361 | + collection=collection, match_pattern="*data*", match_pattern_type=2, title_pattern="Heliophysics Data: {title}" |
| 362 | + ) |
| 363 | + |
| 364 | + # First promotion |
| 365 | + collection.promote_to_curated() |
| 366 | + |
| 367 | + # Verify the first promotion produced the pattern-derived division, document type and title |
| 368 | + curated = CuratedUrl.objects.get(url=url) |
| 369 | + assert curated.division == Divisions.HELIOPHYSICS |
| 370 | + assert curated.document_type == DocumentTypes.DATA |
| 371 | + assert curated.generated_title == "Heliophysics Data: Raw Data Title" |
| 372 | + |
| 373 | + # Create new DumpUrl with updated data to simulate a new crawl |
| 374 | + DumpUrl.objects.create( |
| 375 | + collection=collection, |
| 376 | + url=url, |
| 377 | + division=Divisions.GENERAL, # Expected to be overridden by the division pattern |
| 378 | + document_type=DocumentTypes.DOCUMENTATION, # Expected to be overridden by the document type pattern |
| 379 | + scraped_title="Updated Data Title", |
| 380 | + ) |
| 381 | + |
| 382 | + # Migrate new dump to delta |
| 383 | + collection.migrate_dump_to_delta() |
| 384 | + |
| 385 | + # Second promotion - should maintain pattern-applied metadata while updating the title |
| 386 | + collection.promote_to_curated() |
| 387 | + |
| 388 | + # Verify final state |
| 389 | + curated = CuratedUrl.objects.get(url=url) |
| 390 | + assert curated.division == Divisions.HELIOPHYSICS # Should still be preserved from pattern |
| 391 | + assert curated.document_type == DocumentTypes.DATA # Should still be preserved from pattern |
| 392 | + assert curated.generated_title == "Heliophysics Data: Updated Data Title" # Should reflect new title |
0 commit comments