|
1 | 1 | import json |
2 | 2 | import pathlib |
| 3 | +from copy import deepcopy |
3 | 4 |
|
4 | 5 | import pytest |
5 | 6 |
|
|
11 | 12 | Resource, |
12 | 13 | Schema, |
13 | 14 | fields, |
| 15 | + platform, |
14 | 16 | ) |
15 | 17 |
|
16 | 18 | # General |
@@ -302,7 +304,10 @@ def test_validate_package_using_detector_schema_sync_issue_847(): |
302 | 304 | Resource( |
303 | 305 | data=[["f1"], ["v1"], ["v2"], ["v3"]], |
304 | 306 | schema=Schema( |
305 | | - fields=[fields.StringField(name="f1"), fields.StringField(name="f2")], |
| 307 | + fields=[ |
| 308 | + fields.StringField(name="f1"), |
| 309 | + fields.StringField(name="f2"), |
| 310 | + ], |
306 | 311 | ), |
307 | 312 | ), |
308 | 313 | ] |
@@ -362,3 +367,313 @@ def test_package_licenses_required_path_or_name_issue_1290(): |
362 | 367 | descriptor = {"resources": [], "licenses": [{"title": "title"}]} |
363 | 368 | report = Package.validate_descriptor(descriptor) |
364 | 369 | assert report.errors[0].note.count('license requires "path" or "name"') |
| 370 | + |
| 371 | + |
def test_package_validate_with_skip_errors():
    """Check that ``Checklist(skip_errors=...)`` filters the reported error types.

    With nothing skipped, the fixture package is expected to report two
    blank-row errors and one primary-key error (see the first case below).
    """
    test_cases = [
        {"ignore": [], "expect_errors": ["blank-row", "primary-key", "blank-row"]},
        {"ignore": ["primary-key"], "expect_errors": ["blank-row", "blank-row"]},
        {"ignore": ["blank-row"], "expect_errors": ["primary-key"]},
        {"ignore": ["blank-row", "primary-key"], "expect_errors": []},
    ]

    for tc in test_cases:
        # JSON is UTF-8 by specification; do not rely on the platform's
        # locale-dependent default encoding when reading the descriptor.
        with open("data/invalid/datapackage.json", encoding="utf-8") as file:
            package = Package(json.load(file), basepath="data/invalid")
        checklist = Checklist(skip_errors=tc["ignore"])

        report = package.validate(checklist)

        # Name the failing case so a failure inside the loop is attributable.
        assert report.flatten(["type"]) == [
            [t] for t in tc["expect_errors"]
        ], f"unexpected error types with skip_errors={tc['ignore']!r}"
| 390 | + |
| 391 | + |
| 392 | +# Stats |
| 393 | + |
# Descriptor shared by the stats tests: one file-backed resource that declares
# both a hash and a byte count. Tests deep-copy it before modifying it.
DESCRIPTOR_SH = {
    "resources": [
        {
            "name": "resource1",
            "path": "data/table.csv",
            "hash": (
                "sha256:"
                "a1fd6c5ff3494f697874deeb07f69f8667e903dd94a7bc062dd57550cea26da8"
            ),
            "bytes": 30,
        },
    ],
}
| 404 | + |
| 405 | + |
@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
def test_package_validate_stats():
    """The unmodified stats descriptor (hash and bytes declared) must validate."""
    descriptor = deepcopy(DESCRIPTOR_SH)
    outcome = Package(descriptor).validate()
    assert outcome.valid
| 412 | + |
| 413 | + |
def test_package_validate_stats_invalid():
    """Altering both the declared hash and byte count yields both count errors."""
    descriptor = deepcopy(DESCRIPTOR_SH)
    resource = descriptor["resources"][0]
    resource["hash"] += "a"
    resource["bytes"] += 1
    outcome = Package(descriptor).validate()
    expected = [
        [None, None, "hash-count"],
        [None, None, "byte-count"],
    ]
    assert outcome.flatten(["rowNumber", "fieldNumber", "type"]) == expected
| 424 | + |
| 425 | + |
@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
def test_package_validate_stats_size():
    """With the hash removed, the declared byte count alone must validate."""
    descriptor = deepcopy(DESCRIPTOR_SH)
    del descriptor["resources"][0]["hash"]
    outcome = Package(descriptor).validate()
    assert outcome.valid
| 433 | + |
| 434 | + |
def test_package_validate_stats_size_invalid():
    """With the hash removed, a wrong byte count yields a single byte-count error."""
    descriptor = deepcopy(DESCRIPTOR_SH)
    resource = descriptor["resources"][0]
    resource["bytes"] += 1
    del resource["hash"]
    outcome = Package(descriptor).validate()
    expected = [[None, None, "byte-count"]]
    assert outcome.flatten(["rowNumber", "fieldNumber", "type"]) == expected
| 444 | + |
| 445 | + |
@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
def test_package_validate_stats_hash():
    """With the byte count removed, the declared hash alone must validate."""
    descriptor = deepcopy(DESCRIPTOR_SH)
    del descriptor["resources"][0]["bytes"]
    outcome = Package(descriptor).validate()
    assert outcome.valid
| 453 | + |
| 454 | + |
def test_package_validate_check_file_package_stats_hash_invalid():
    """With the byte count removed, a wrong hash yields a single hash-count error."""
    descriptor = deepcopy(DESCRIPTOR_SH)
    resource = descriptor["resources"][0]
    del resource["bytes"]
    resource["hash"] += "a"
    outcome = Package(descriptor).validate()
    expected = [[None, None, "hash-count"]]
    assert outcome.flatten(["rowNumber", "fieldNumber", "type"]) == expected
| 464 | + |
| 465 | + |
| 466 | +# Schema |
| 467 | + |
# Descriptor shared by the foreign-key tests. Tests deep-copy it before use.
# "cities" declares two foreign keys: "next_id" -> "id" with an empty resource
# name, and "id" -> "label" in the "people" resource.
DESCRIPTOR_FK = {
    "resources": [
        {
            "name": "cities",
            "data": [
                ["id", "name", "next_id"],  # header row
                [1, "london", 2],
                [2, "paris", 3],
                [3, "rome", 4],
                [4, "rio", None],
            ],
            "schema": {
                "fields": [
                    {"name": "id", "type": "integer"},
                    {"name": "name", "type": "string"},
                    {"name": "next_id", "type": "integer"},
                ],
                "foreignKeys": [
                    {
                        "fields": "next_id",
                        "reference": {"resource": "", "fields": "id"},
                    },
                    {
                        "fields": "id",
                        "reference": {"resource": "people", "fields": "label"},
                    },
                ],
            },
        },
        {
            "name": "people",
            "data": [
                ["label", "population"],  # header row
                [1, 8],
                [2, 2],
                [3, 3],
                [4, 6],
            ],
        },
    ],
}
| 503 | + |
# Extra resource appended by the multi-field foreign-key tests: the pair
# ("from", "to") references ("id", "next_id") in the "cities" resource.
MULTI_FK_RESSOURCE = {
    "name": "travel_time",
    "data": [
        ["from", "to", "hours"],  # header row
        [1, 2, 1.5],
        [2, 3, 8],
        [3, 4, 18],
    ],
    "schema": {
        "fields": [
            {"name": "from", "type": "integer"},
            {"name": "to", "type": "integer"},
            {"name": "hours", "type": "number"},
        ],
        "foreignKeys": [
            {
                "fields": ["from", "to"],
                "reference": {"resource": "cities", "fields": ["id", "next_id"]},
            },
        ],
    },
}
| 521 | + |
| 522 | + |
def test_package_validate_schema_foreign_key_error():
    """The unmodified foreign-key descriptor is expected to validate cleanly."""
    fk_descriptor = deepcopy(DESCRIPTOR_FK)
    outcome = Package(fk_descriptor).validate()
    assert outcome.valid
| 528 | + |
| 529 | + |
def test_package_validate_schema_foreign_key_not_defined():
    """Dropping the ``foreignKeys`` declaration still leaves a valid package."""
    fk_descriptor = deepcopy(DESCRIPTOR_FK)
    fk_descriptor["resources"][0]["schema"].pop("foreignKeys")
    outcome = Package(fk_descriptor).validate()
    assert outcome.valid
| 536 | + |
| 537 | + |
def test_package_validate_schema_foreign_key_self_referenced_resource_violation():
    """Removing the last "cities" row leaves "rome" pointing at a missing id."""
    fk_descriptor = deepcopy(DESCRIPTOR_FK)
    cities_rows = fk_descriptor["resources"][0]["data"]
    cities_rows.pop(4)
    outcome = Package(fk_descriptor).validate()
    expected = [[4, None, "foreign-key", ["3", "rome", "4"]]]
    assert outcome.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == expected
| 546 | + |
| 547 | + |
def test_package_validate_schema_foreign_key_internal_resource_violation():
    """Removing the last "people" row makes the "rio" row fail its foreign key."""
    fk_descriptor = deepcopy(DESCRIPTOR_FK)
    people_rows = fk_descriptor["resources"][1]["data"]
    people_rows.pop(4)
    outcome = Package(fk_descriptor).validate()
    expected = [[5, None, "foreign-key", ["4", "rio", ""]]]
    assert outcome.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == expected
| 556 | + |
| 557 | + |
def test_package_validate_schema_foreign_key_internal_resource_violation_non_existent():
    """Replacing "people" with unrelated labels makes every "cities" row fail."""
    fk_descriptor = deepcopy(DESCRIPTOR_FK)
    people = fk_descriptor["resources"][1]
    people["data"] = [["label", "population"], [10, 10]]
    outcome = Package(fk_descriptor).validate()
    expected = [
        [2, None, "foreign-key", ["1", "london", "2"]],
        [3, None, "foreign-key", ["2", "paris", "3"]],
        [4, None, "foreign-key", ["3", "rome", "4"]],
        [5, None, "foreign-key", ["4", "rio", ""]],
    ]
    assert outcome.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == expected
| 569 | + |
| 570 | + |
def test_package_validate_schema_multiple_foreign_key():
    """A multi-field foreign key into "cities" is expected to validate cleanly."""
    descriptor = deepcopy(DESCRIPTOR_FK)
    # Copy the shared module-level fixture as well, so this test never hands
    # the same dict object to several packages (same isolation as DESCRIPTOR_FK).
    descriptor["resources"].append(deepcopy(MULTI_FK_RESSOURCE))
    package = Package(descriptor)
    report = package.validate()
    assert report.valid
| 577 | + |
| 578 | + |
def test_package_validate_schema_multiple_foreign_key_resource_violation_non_existent():
    """Removing "london" from "cities" breaks the (1, 2) multi-field reference."""
    descriptor = deepcopy(DESCRIPTOR_FK)
    # remove London
    del descriptor["resources"][0]["data"][1]
    # Copy the shared module-level fixture as well, so this test never hands
    # the same dict object to several packages (same isolation as DESCRIPTOR_FK).
    descriptor["resources"].append(deepcopy(MULTI_FK_RESSOURCE))
    package = Package(descriptor)
    report = package.validate()
    assert report.flatten(["rowNumber", "fieldNumber", "type", "cells", "note"]) == [
        [
            2,
            None,
            "foreign-key",
            ["1", "2", "1.5"],
            'for "from, to": values "1, 2" not found in the lookup table "cities" as "id, next_id"',
        ],
    ]
| 595 | + |
| 596 | + |
def test_package_validate_schema_multiple_foreign_key_violations():
    """Several broken references are reported with their key and reference details."""
    descriptor = deepcopy(DESCRIPTOR_FK)
    # Add some wrong fks
    descriptor["resources"][0]["data"][3][0] = 5
    descriptor["resources"][0]["data"][4][0] = 6
    # Copy the shared module-level fixture as well, so this test never hands
    # the same dict object to several packages (same isolation as DESCRIPTOR_FK).
    descriptor["resources"].append(deepcopy(MULTI_FK_RESSOURCE))
    package = Package(descriptor)
    report = package.validate()
    assert report.flatten(
        [
            "rowNumber",
            "fieldNames",
            "fieldCells",
            "referenceName",
            "referenceFieldNames",
        ]
    ) == [
        [3, ["next_id"], ["3"], "", ["id"]],
        [4, ["next_id"], ["4"], "", ["id"]],
        [4, ["id"], ["5"], "people", ["label"]],
        [5, ["id"], ["6"], "people", ["label"]],
        [4, ["from", "to"], ["3", "4"], "cities", ["id", "next_id"]],
    ]
| 620 | + |
| 621 | + |
| 622 | +# Bugs |
| 623 | + |
| 624 | + |
def test_package_validate_using_detector_schema_sync_issue_847():
    """Regression test for issue 847.

    The schema declares two fields while the data has one column; with
    ``Detector(schema_sync=True)`` set on each resource the report must be valid.
    """
    two_field_schema = Schema(
        fields=[
            fields.AnyField(name="f1"),
            fields.AnyField(name="f2"),
        ]
    )
    single_column = Resource(
        data=[["f1"], ["v1"], ["v2"], ["v3"]],
        schema=two_field_schema,
    )
    package = Package(resources=[single_column])
    # The detector is assigned after the package is built, as in the issue.
    for item in package.resources:
        item.detector = Detector(schema_sync=True)
    outcome = package.validate()
    assert outcome.valid
| 643 | + |
| 644 | + |
| 645 | +# Parallel |
| 646 | + |
# Note: to test parallel validation, do not use foreign keys; they trigger an
# automatic fallback to single-core execution.
| 649 | + |
| 650 | + |
| 651 | +@pytest.mark.ci |
def test_package_validate_parallel_from_dict():
    """A package built from a loaded dict is expected to validate in parallel mode."""
    # JSON is UTF-8 by specification; do not rely on the platform's
    # locale-dependent default encoding when reading the descriptor.
    with open("data/datapackage.json", encoding="utf-8") as file:
        package = Package(json.load(file), basepath="data")
    report = package.validate(parallel=True)
    assert report.valid
| 657 | + |
| 658 | + |
| 659 | +@pytest.mark.ci |
def test_package_validate_parallel_from_dict_invalid():
    """Parallel validation of an invalid dict-based package reports errors per task."""
    # JSON is UTF-8 by specification; do not rely on the platform's
    # locale-dependent default encoding when reading the descriptor.
    with open("data/invalid/datapackage_no_foreign_key.json", encoding="utf-8") as file:
        package = Package(json.load(file), basepath="data/invalid")
    report = package.validate(parallel=True)
    assert report.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == [
        [1, 3, None, "blank-row"],
        [1, 3, None, "primary-key"],
        [2, 4, None, "blank-row"],
    ]
| 669 | + |
| 670 | + |
| 671 | +@pytest.mark.ci |
def test_package_validate_with_parallel():
    """Parallel validation of an invalid package given by path reports errors per task."""
    outcome = Package("data/invalid/datapackage_no_foreign_key.json").validate(
        parallel=True
    )
    expected = [
        [1, 3, None, "blank-row"],
        [1, 3, None, "primary-key"],
        [2, 4, None, "blank-row"],
    ]
    assert outcome.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == expected
0 commit comments