Skip to content

Commit 71c0851

Browse files
committed
Add runtime control of how to check the payload "data" directory against the manifest
1 parent b017f42 commit 71c0851

File tree

3 files changed

+50
-6
lines changed

3 files changed

+50
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* Increase the cnv generation attempts from 3 to 5
88
* Add HTML/leaflet based map output
99
* If no files to test (empty breakout), set overall score to black
10+
* Add runtime control of how to check the payload "data" directory against the manifest
1011

1112
## v2025.08.0 (2025-08-11)
1213
* Initial release.

src/r2r_ctd/__main__.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import click
2020
from rich.logging import RichHandler
2121

22-
from r2r_ctd.breakout import Breakout
22+
from r2r_ctd.breakout import Breakout, PayloadStrictness
2323
from r2r_ctd.docker_ctl import test_docker as _test_docker
2424
from r2r_ctd.maps import make_map
2525
from r2r_ctd.reporting import (
@@ -58,10 +58,17 @@ def test_docker():
5858
type=click.Path(exists=True, file_okay=False, writable=True, path_type=Path),
5959
)
6060
@click.option("--gen-cnvs/--no-gen-cnvs", default=True)
61-
def qa(gen_cnvs: bool, paths: tuple[Path, ...]):
61+
@click.option(
62+
"--payload-strictness",
63+
show_default=True,
64+
default=PayloadStrictness.FLEX,
65+
type=click.Choice(PayloadStrictness, case_sensitive=False),
66+
help=PayloadStrictness.__doc__,
67+
)
68+
def qa(gen_cnvs: bool, paths: tuple[Path, ...], payload_strictness: PayloadStrictness):
6269
"""Run the QA routines on one or more directories."""
6370
for path in paths:
64-
breakout = Breakout(path=path)
71+
breakout = Breakout(path=path, payload=payload_strictness)
6572
ra = ResultAggregator(breakout)
6673

6774
# write geoCSV

src/r2r_ctd/breakout.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from dataclasses import dataclass
88
from datetime import datetime, timedelta
9+
from enum import StrEnum, auto
910
from functools import cached_property
1011
from hashlib import file_digest
1112
from logging import getLogger
@@ -105,6 +106,19 @@ def contains(self, dt: datetime) -> bool:
105106
return self.dtstart <= dt < self.dtend
106107

107108

109+
class PayloadStrictness(StrEnum):
110+
"""Strictness of how to validate the payload against the manifest
111+
112+
* "strict": any files in the payload directory and not in the manifest cause the test to fail.
113+
* "flex": a reasonable set of files in the payload directory and not in the manifest are ignored (.DS_Store files).
114+
* "manifest": only files in the manifest are checked and others are ignored.
115+
"""
116+
117+
STRICT = auto()
118+
FLEX = auto()
119+
MANIFEST = auto()
120+
121+
108122
@dataclass
109123
class Breakout:
110124
"""Convenience wrapper for manipulating the various Paths of the r2r breakout
@@ -123,6 +137,9 @@ class Breakout:
123137
path: Path
124138
"""Path to the breakout itself; this is set when instantiating a Breakout"""
125139

140+
payload: PayloadStrictness = PayloadStrictness.FLEX
141+
"""How strictly should the payload directory be validated"""
142+
126143
@property
127144
def manifest_path(self) -> Path:
128145
"""The Path of the manifest-md5.txt file in this breakout"""
@@ -133,6 +150,11 @@ def manifest(self) -> str:
133150
"""Reads the manifest file and returns its contents as a string"""
134151
return self.manifest_path.read_text()
135152

153+
@property
154+
def payload_path(self) -> Path:
155+
"""The path to the "data" directory assuming this is a BagIt bag"""
156+
return self.path / "data"
157+
136158
@cached_property
137159
def manifest_dict(self) -> dict[Path, str]:
138160
"""Transforms the manifest file into a dict containing file path to file hash mappings"""
@@ -150,14 +172,28 @@ def manifest_dict(self) -> dict[Path, str]:
150172
def manifest_ok(self) -> bool:
151173
"""Iterate over the manifest and check all the file hashes against the files in the breakout
152174
153-
In an actual bag-it bag, it would be an error for extra stuff to be in the data directory.
154-
For example, a .DS_Store file if you looked at the breakout data directory on a mac.
155-
This ignores anything not in the manifest file.
175+
See :py:class:`PayloadStrictness` for how to control behavior.
156176
157177
This returns True only if all the files in the manifest are present and their md5 hashes match.
158178
159179
This is one of the checks that goes into the stoplight report.
160180
"""
181+
flex_files = {
182+
".DS_Store",
183+
}
184+
logger.info(f"Payload validation mode: {self.payload}")
185+
err_message = "Files are in payload directory and not in manifest, breakout is likely invalid or corrupted"
186+
for root, _, files in self.payload_path.walk():
187+
paths = {root / file for file in files}
188+
diff = paths - self.manifest_dict.keys()
189+
if self.payload == "strict" and any(diff):
190+
logger.critical(err_message)
191+
return False
192+
193+
if self.payload == "flex" and not all(d.name in flex_files for d in diff):
194+
logger.critical(err_message)
195+
return False
196+
161197
for file_path, manifest_hash in self.manifest_dict.items():
162198
if not file_path.exists():
163199
return False

0 commit comments

Comments
 (0)