|
10 | 10 | django.setup() |
11 | 11 |
|
12 | 12 | from publications.models import Publication, Source, HarvestingEvent, Schedule |
13 | | -from publications.tasks import parse_oai_xml_and_save_publications |
| 13 | +from publications.tasks import parse_oai_xml_and_save_publications, harvest_oai_endpoint |
14 | 14 | from django.contrib.auth import get_user_model |
15 | 15 |
|
16 | 16 | User = get_user_model() |
@@ -310,3 +310,233 @@ def test_real_journal_harvesting_agile_giss(self): |
310 | 310 | # Skip test if AGILE doesn't have OAI-PMH endpoint |
311 | 311 | self.skipTest(f"AGILE-GISS endpoint not available: {e}") |
312 | 312 |
|
| 313 | + |
class HarvestingErrorTests(TestCase):
    """
    Test cases for error handling during harvesting.

    These tests verify that the harvesting system properly handles:
    - Malformed XML
    - Empty responses
    - Missing required metadata
    - Invalid XML structure
    - Network/HTTP errors

    Parser-level tests feed fixture files from ``harvesting/error_cases``
    directly into ``parse_oai_xml_and_save_publications``. HTTP-level tests
    mock the endpoint with ``responses`` and run ``harvest_oai_endpoint``,
    then inspect the most recent ``HarvestingEvent`` of the source.
    """

    def setUp(self):
        """Clear existing publications and create the source shared by the parser tests."""
        Publication.objects.all().delete()
        self.source = Source.objects.create(
            url_field="http://example.com/oai",
            harvest_interval_minutes=60,
            name="Error Test Source",
        )

    # ------------------------------------------------------------------
    # Private helpers (not collected as tests: names do not start with
    # ``test``).
    # ------------------------------------------------------------------

    def _create_event(self):
        """Create an in-progress harvesting event attached to ``self.source``."""
        return HarvestingEvent.objects.create(
            source=self.source,
            status="in_progress",
        )

    def _parse_fixture(self, filename, event, **parse_kwargs):
        """Read an error-case fixture and pass its bytes to the parser.

        Extra keyword arguments (e.g. ``max_records``) are forwarded
        unchanged to ``parse_oai_xml_and_save_publications``. Any exception
        raised by the parser propagates and fails the calling test.
        """
        fixture_path = BASE_TEST_DIR / 'harvesting' / 'error_cases' / filename
        parse_oai_xml_and_save_publications(fixture_path.read_bytes(), event, **parse_kwargs)

    def _count_publications(self, event):
        """Return the number of publications linked to ``event`` via ``job``."""
        return Publication.objects.filter(job=event).count()

    def _harvest_url(self, url):
        """Create a source for ``url``, harvest it, and return its latest event.

        The caller is expected to have registered the mocked response for
        ``url`` beforehand. ``latest('started_at')`` raises ``DoesNotExist``
        if the harvest created no event, which fails the calling test.
        """
        source = Source.objects.create(
            url_field=url,
            harvest_interval_minutes=60,
        )
        harvest_oai_endpoint(source.id)
        return HarvestingEvent.objects.filter(source=source).latest('started_at')

    # ------------------------------------------------------------------
    # Parser-level error cases
    # ------------------------------------------------------------------

    def test_malformed_xml(self):
        """Test that malformed XML is handled gracefully."""
        event = self._create_event()

        # Must return without raising. Whether an error is logged is not
        # asserted here.
        self._parse_fixture('malformed_xml.xml', event)

        # No publications should be created from malformed XML
        self.assertEqual(
            self._count_publications(event), 0,
            "Malformed XML should not create publications",
        )

    def test_empty_response(self):
        """Test that empty OAI-PMH response (no records) is handled."""
        event = self._create_event()

        # Must return without raising
        self._parse_fixture('empty_response.xml', event)

        # No publications should be created from empty response
        self.assertEqual(
            self._count_publications(event), 0,
            "Empty response should create zero publications",
        )

    def test_invalid_xml_structure(self):
        """Test that non-OAI-PMH XML structure is handled."""
        event = self._create_event()

        # Must return without raising
        self._parse_fixture('invalid_xml_structure.xml', event)

        # No publications should be created from invalid structure
        self.assertEqual(
            self._count_publications(event), 0,
            "Invalid XML structure should create zero publications",
        )

    def test_missing_required_metadata(self):
        """Test that records with missing required fields are handled."""
        event = self._create_event()

        # Must return without raising - may create some publications
        self._parse_fixture('missing_metadata.xml', event)

        pubs = Publication.objects.filter(job=event)

        # At least one record (the one with title) should be created
        self.assertGreaterEqual(pubs.count(), 1, "Should create publications even with minimal metadata")

        # Title might be None/empty for some records; only type-check the
        # titles that are present.
        for pub in pubs:
            if pub.title:
                self.assertIsInstance(pub.title, str)

    def test_empty_content(self):
        """Test that empty and whitespace-only content is handled."""
        event = self._create_event()

        payloads = (
            (b"", "Empty content should create zero publications"),
            (b" \n\t ", "Whitespace-only content should create zero publications"),
        )
        for content, failure_message in payloads:
            # subTest reports each payload separately, so a failure on the
            # first does not hide the result of the second.
            with self.subTest(content=content):
                parse_oai_xml_and_save_publications(content, event)
                self.assertEqual(self._count_publications(event), 0, failure_message)

    def test_max_records_limit_with_errors(self):
        """Test that max_records works even when some records cause errors."""
        event = self._create_event()

        # Per the fixture author's note, the missing-metadata file holds
        # 2 records, one of them problematic. Limit parsing to 1 record.
        self._parse_fixture('missing_metadata.xml', event, max_records=1)

        # At most 1 publication may result from the limited run
        self.assertLessEqual(
            self._count_publications(event), 1,
            "Should respect max_records limit even with errors",
        )

    # ------------------------------------------------------------------
    # HTTP-level error cases (endpoint mocked with ``responses``)
    # ------------------------------------------------------------------

    @responses.activate
    def test_http_404_error(self):
        """Test that HTTP 404 errors are handled properly."""
        responses.add(
            responses.GET,
            'http://example.com/oai-404',
            status=404,
            body='Not Found',
        )

        # harvest_oai_endpoint should handle the error itself
        event = self._harvest_url("http://example.com/oai-404")

        self.assertEqual(event.status, 'failed', "Event should be marked as failed for 404 error")

    @responses.activate
    def test_http_500_error(self):
        """Test that HTTP 500 errors are handled properly."""
        responses.add(
            responses.GET,
            'http://example.com/oai-500',
            status=500,
            body='Internal Server Error',
        )

        # harvest_oai_endpoint should handle the error itself
        event = self._harvest_url("http://example.com/oai-500")

        self.assertEqual(event.status, 'failed', "Event should be marked as failed for 500 error")

    @responses.activate
    def test_network_timeout(self):
        """Test that network timeouts are handled properly."""
        from requests.exceptions import Timeout

        # Passing an exception instance as the body makes the mocked
        # request raise it instead of returning a response.
        responses.add(
            responses.GET,
            'http://example.com/oai-timeout',
            body=Timeout('Connection timeout'),
        )

        # harvest_oai_endpoint should handle the timeout itself
        event = self._harvest_url("http://example.com/oai-timeout")

        self.assertEqual(event.status, 'failed', "Event should be marked as failed for timeout")

    @responses.activate
    def test_invalid_xml_in_http_response(self):
        """Test that invalid XML in HTTP response is handled."""
        responses.add(
            responses.GET,
            'http://example.com/oai-invalid',
            status=200,
            body='This is not XML at all',
            content_type='text/xml',
        )

        event = self._harvest_url("http://example.com/oai-invalid")

        # A successful HTTP exchange with an unparsable body is expected to
        # complete (not fail) while saving nothing.
        self.assertEqual(event.status, 'completed', "Event should complete even with invalid XML")
        self.assertEqual(
            self._count_publications(event), 0,
            "Invalid XML should create zero publications",
        )
0 commit comments