Validate actual file properties such as checksum

tom-tan · mr-c · mr-c · commit 4c57396b9063 · 2023-05-27T13:36:00.000+02:00
A re-do of #146 Co-Authored-By: Michael R. Crusoe <michael.crusoe@gmail.com>
diff --git a/cwltest/compare.py b/cwltest/compare.py
@@ -1,7 +1,10 @@
 """Compare utilities for CWL objects."""
 
+import hashlib
 import json
-from typing import Any, Dict, Optional, Set
+import os.path
+import urllib.parse
+from typing import Any, Callable, Dict, Optional, Set
 
 
 class CompareFail(Exception):
@@ -22,11 +25,11 @@ def format(
 
 
 def _check_keys(
-    keys: Set[str], expected: Dict[str, Any], actual: Dict[str, Any]
+    keys: Set[str], expected: Dict[str, Any], actual: Dict[str, Any], skip_details: bool
 ) -> None:
     for k in keys:
         try:
-            compare(expected.get(k), actual.get(k))
+            compare(expected.get(k), actual.get(k), skip_details)
         except CompareFail as e:
             raise CompareFail.format(
                 expected, actual, f"field {k!r} failed comparison: {str(e)}"
@@ -48,10 +51,12 @@ def _compare_contents(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
         )
 
 
-def _compare_dict(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
+def _compare_dict(
+    expected: Dict[str, Any], actual: Dict[str, Any], skip_details: bool
+) -> None:
     for c in expected:
         try:
-            compare(expected[c], actual.get(c))
+            compare(expected[c], actual.get(c), skip_details)
         except CompareFail as e:
             raise CompareFail.format(
                 expected, actual, f"failed comparison for key {c!r}: {e}"
@@ -62,7 +67,9 @@ def _compare_dict(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
             raise CompareFail.format(expected, actual, "unexpected key '%s'" % k)
 
 
-def _compare_directory(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
+def _compare_directory(
+    expected: Dict[str, Any], actual: Dict[str, Any], skip_details: bool
+) -> None:
     if actual.get("class") != "Directory":
         raise CompareFail.format(
             expected, actual, "expected object with a class 'Directory'"
@@ -75,7 +82,7 @@ def _compare_directory(expected: Dict[str, Any], actual: Dict[str, Any]) -> None
         found = False
         for j in actual["listing"]:
             try:
-                compare(i, j)
+                compare(i, j, skip_details)
                 found = True
                 break
             except CompareFail:
@@ -86,19 +93,32 @@ def _compare_directory(expected: Dict[str, Any], actual: Dict[str, Any]) -> None
                 actual,
                 "%s not found" % json.dumps(i, indent=4, sort_keys=True),
             )
-    _compare_file(expected, actual)
+    _compare_file(expected, actual, skip_details)
 
 
-def _compare_file(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
-    _compare_location(expected, actual)
+def _compare_file(
+    expected: Dict[str, Any], actual: Dict[str, Any], skip_details: bool
+) -> None:
+    _compare_location(expected, actual, skip_details)
     if "contents" in expected:
         _compare_contents(expected, actual)
-    other_keys = set(expected.keys()) - {"path", "location", "listing", "contents"}
-    _check_keys(other_keys, expected, actual)
-    _check_keys(other_keys, expected, actual)
-
-
-def _compare_location(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
+    if actual.get("class") == "File" and not skip_details:
+        _compare_checksum(expected, actual)
+        _compare_size(expected, actual)
+    other_keys = set(expected.keys()) - {
+        "path",
+        "location",
+        "listing",
+        "contents",
+        "checksum",
+        "size",
+    }
+    _check_keys(other_keys, expected, actual, skip_details)
+
+
+def _compare_location(
+    expected: Dict[str, Any], actual: Dict[str, Any], skip_details: bool
+) -> None:
     if "path" in expected:
         comp = "path"
         if "path" not in actual:
@@ -109,7 +129,19 @@ def _compare_location(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
         return
     if actual.get("class") == "Directory":
         actual[comp] = actual[comp].rstrip("/")
-
+        exist_fun: Callable[[str], bool] = os.path.isdir
+    else:
+        exist_fun = os.path.isfile
+    if "path" in actual:
+        path = urllib.parse.urlparse(actual["path"]).path
+    else:
+        path = urllib.parse.urlparse(actual["location"]).path
+    if not exist_fun(path) and not skip_details:
+        raise CompareFail.format(
+            expected,
+            actual,
+            f"{actual[comp]} does not exist",
+        )
     if expected[comp] != "Any" and (
         not (
             actual[comp].endswith("/" + expected[comp])
@@ -123,7 +155,67 @@ def _compare_location(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
         )
 
 
-def compare(expected: Any, actual: Any) -> None:
+def _compare_checksum(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
+    if "path" in actual:
+        path = urllib.parse.urlparse(actual["path"]).path
+    else:
+        path = urllib.parse.urlparse(actual["location"]).path
+    checksum = hashlib.sha1()  # nosec
+    with open(path, "rb") as f:
+        contents = f.read(1024 * 1024)
+        while contents != b"":
+            checksum.update(contents)
+            contents = f.read(1024 * 1024)
+    actual_checksum_on_disk = f"sha1${checksum.hexdigest()}"
+    if "checksum" in actual:
+        actual_checksum_declared = actual["checksum"]
+        if actual_checksum_on_disk != actual_checksum_declared:
+            raise CompareFail.format(
+                expected,
+                actual,
+                "Output file checksums do not match: actual "
+                f"{actual_checksum_on_disk!r} on disk is not equal to actual "
+                f"{actual_checksum_declared!r} in the output object",
+            )
+    if "checksum" in expected:
+        expected_checksum = expected["checksum"]
+        if expected_checksum != actual_checksum_on_disk:
+            raise CompareFail.format(
+                expected,
+                actual,
+                "Output file checksums do not match: actual "
+                f"{actual_checksum_on_disk!r} is not equal to expected {expected_checksum!r}",
+            )
+
+
+def _compare_size(expected: Dict[str, Any], actual: Dict[str, Any]) -> None:
+    if "path" in actual:
+        path = urllib.parse.urlparse(actual["path"]).path
+    else:
+        path = urllib.parse.urlparse(actual["location"]).path
+    actual_size_on_disk = os.path.getsize(path)
+    if "size" in actual:
+        actual_size_declared = actual["size"]
+        if actual_size_on_disk != actual_size_declared:
+            raise CompareFail.format(
+                expected,
+                actual,
+                "Output file sizes do not match: actual "
+                f"{actual_size_on_disk!r} on disk is not equal to actual "
+                f"{actual_size_declared!r}' in the output object",
+            )
+    if "size" in expected:
+        expected_size = expected["size"]
+        if expected_size != actual_size_on_disk:
+            raise CompareFail.format(
+                expected,
+                actual,
+                "Output file sizes do not match: actual "
+                f"{actual_size_on_disk!r} is not equal to expected {expected_size!r}",
+            )
+
+
+def compare(expected: Any, actual: Any, skip_details: bool = False) -> None:
     """Compare two CWL objects."""
     if expected == "Any":
         return
@@ -136,11 +228,11 @@ def compare(expected: Any, actual: Any) -> None:
                 raise CompareFail.format(expected, actual)
 
             if expected.get("class") == "File":
-                _compare_file(expected, actual)
+                _compare_file(expected, actual, skip_details)
             elif expected.get("class") == "Directory":
-                _compare_directory(expected, actual)
+                _compare_directory(expected, actual, skip_details)
             else:
-                _compare_dict(expected, actual)
+                _compare_dict(expected, actual, skip_details)
 
         elif isinstance(expected, list):
             if not isinstance(actual, list):
@@ -150,7 +242,7 @@ def compare(expected: Any, actual: Any) -> None:
                 raise CompareFail.format(expected, actual, "lengths don't match")
             for c in range(0, len(expected)):
                 try:
-                    compare(expected[c], actual[c])
+                    compare(expected[c], actual[c], skip_details)
                 except CompareFail as e:
                     raise CompareFail.format(expected, actual, e) from e
         else:
diff --git a/tests/test_compare.py b/tests/test_compare.py
@@ -1,7 +1,9 @@
+import os
+from pathlib import Path
 from typing import Any, Dict
 
 import pytest
-from cwltest.compare import CompareFail, compare
+from cwltest.compare import CompareFail, _compare_directory, _compare_file, compare
 
 from .util import get_data
 
@@ -37,6 +39,7 @@ def test_compare_contents_success() -> None:
         "size": 2,
         "class": "File",
         "contents": "2\n",
+        "checksum": "sha1$7448d8798a4380162d4b56f9b452e2f6f9e24e7a",
     }
     actual = {
         "basename": "cores.txt",
@@ -49,6 +52,120 @@ def test_compare_contents_success() -> None:
     compare(expected, actual)
 
 
+def test_compare_contents_not_exist() -> None:
+    expected = {
+        "location": "cores.txt",
+        "class": "File",
+    }
+    actual = {
+        "basename": "cores.txt",
+        "class": "File",
+        "location": "file:///var/folders/8x/2df05_7j20j6r8y81w4qf43r0000gn/T/tmpG0EkrS/cores.txt",
+        "path": "/none/exist/path/to/cores.txt",
+        "size": 2,
+    }
+    with pytest.raises(CompareFail):
+        _compare_file(expected, actual, False)
+
+
+def test_compare_file_different_size(tmp_path: Path) -> None:
+    expected = {
+        "location": "cores.txt",
+        "size": 2,
+        "class": "File",
+    }
+
+    path = tmp_path / "cores.txt"
+    with open(path, "w") as f:
+        f.write("hello")
+
+    actual = {
+        "basename": "cores.txt",
+        "class": "File",
+        "location": str(path),
+    }
+    with pytest.raises(CompareFail):
+        _compare_file(expected, actual, False)
+
+
+def test_compare_file_different_checksum(tmp_path: Path) -> None:
+    expected = {
+        "location": "cores.txt",
+        "class": "File",
+        "checksum": "sha1$7448d8798a4380162d4b56f9b452e2f6f9e24e7a",
+    }
+
+    path = tmp_path / "cores.txt"
+    with open(path, "w") as f:
+        f.write("hello")
+
+    actual = {
+        "basename": "cores.txt",
+        "class": "File",
+        "location": str(path),
+    }
+    with pytest.raises(CompareFail):
+        _compare_file(expected, actual, False)
+
+
+def test_compare_file_inconsistent_size(tmp_path: Path) -> None:
+    expected = {
+        "location": "cores.txt",
+        "class": "File",
+    }
+
+    path = tmp_path / "cores.txt"
+    with open(path, "w") as f:
+        f.write("hello")
+
+    actual = {
+        "basename": "cores.txt",
+        "class": "File",
+        "location": str(path),
+        "size": 65535,
+    }
+    with pytest.raises(CompareFail):
+        _compare_file(expected, actual, False)
+
+
+def test_compare_file_inconsistent_checksum(tmp_path: Path) -> None:
+    expected = {
+        "location": "cores.txt",
+        "class": "File",
+    }
+
+    path = tmp_path / "cores.txt"
+    with open(path, "w") as f:
+        f.write("hello")
+
+    actual = {
+        "basename": "cores.txt",
+        "checksum": "inconsistent-checksum",
+        "class": "File",
+        "location": str(path),
+    }
+    with pytest.raises(CompareFail):
+        _compare_file(expected, actual, False)
+
+
+def test_compare_directory(tmp_path: Path) -> None:
+    expected = {
+        "location": "dir",
+        "class": "Directory",
+        "listing": [],
+    }
+
+    path = tmp_path / "dir"
+    os.makedirs(path)
+
+    actual = {
+        "class": "Directory",
+        "location": str(path),
+        "listing": [],
+    }
+    _compare_directory(expected, actual, False)
+
+
 def test_compare_directory_success() -> None:
     expected = {
         "stuff": {
@@ -100,7 +217,7 @@ def test_compare_directory_success() -> None:
             ],
         }
     }
-    compare(expected, actual)
+    compare(expected, actual, skip_details=True)
 
 
 def test_compare_directory_failure_different_listing() -> None:
@@ -270,7 +387,7 @@ def test_compare_file_success() -> None:
         "path": "/var/folders/8x/2df05_7j20j6r8y81w4qf43r0000gn/T/tmpG0EkrS/cores.txt",
         "size": 2,
     }
-    compare(expected, actual)
+    compare(expected, actual, skip_details=True)
 
 
 def test_compare_list_failure_missing() -> None:
diff --git a/tox.ini b/tox.ini
@@ -1,6 +1,6 @@
 [tox]
 envlist =
-  py{36,37,38,39,310,311}-lint,
+  py{37,38,39,310,311}-lint,
   py{36,37,38,39,310,311}-unit,
   py{36,37,38,39,310,311}-bandit,
   py{37,38,39,310,311}-mypy,
@@ -58,7 +58,7 @@ commands =
   py{36,37,38,39,310,311}-unit: python -m pip install -U pip setuptools wheel
   py{36,37,38,39,310,311}-unit: python -m pytest --cov --cov-config={toxinidir}/.coveragerc --cov-append {posargs}
   py{36,37,38,39,310,311}-unit: coverage xml
-  py{36,37,38,39,310,311}-bandit: bandit --recursive cwltest --exclude tests/*
+  py{36,37,38,39,310,311}-bandit: bandit --recursive cwltest
   py{36,37,38,39,310,311}-lint: make flake8
   py{36,37,38,39,310,311}-lint: make format-check
   py{37,38,39,310,311}-mypy: make mypy