66
77from dataclasses import dataclass
88from datetime import datetime , timedelta
9+ from enum import StrEnum , auto
910from functools import cached_property
1011from hashlib import file_digest
1112from logging import getLogger
@@ -105,6 +106,19 @@ def contains(self, dt: datetime) -> bool:
105106 return self .dtstart <= dt < self .dtend
106107
107108
109+ class PayloadStrictness (StrEnum ):
110+ """Strictness of how to validate the payload against the manifest
111+
112+ * "strict": any files in the payload directory and not in the manifest cause the test to fail.
113+ * "flex": a reasonable set of files in the payload directory and not in the manifest are ignored (.DS_Store files).
114+ * "manifest": only files in the manifest are checked and others are ignored.
115+ """
116+
117+ STRICT = auto ()
118+ FLEX = auto ()
119+ MANIFEST = auto ()
120+
121+
108122@dataclass
109123class Breakout :
110124 """Convenience wrapper for manipulating the various Paths of the r2r breakout
@@ -123,6 +137,9 @@ class Breakout:
123137 path : Path
124138 """Path to the breakout itself; this is set when instantiating a Breakout"""
125139
140+ payload : PayloadStrictness = PayloadStrictness .FLEX
141+ """How strictly should the payload directory be validated"""
142+
126143 @property
127144 def manifest_path (self ) -> Path :
128145 """The Path of the manifest-md5.txt file in this breakout"""
@@ -133,6 +150,11 @@ def manifest(self) -> str:
133150 """Reads the manifest file and returns its contents as a string"""
134151 return self .manifest_path .read_text ()
135152
@property
def payload_path(self) -> Path:
    """The path to the "data" directory assuming this is a BagIt bag

    Built by joining ``self.path`` with ``"data"``; nothing on disk is checked here.
    """
    return self.path / "data"
157+
136158 @cached_property
137159 def manifest_dict (self ) -> dict [Path , str ]:
138160 """Transforms the manifest file into a dict containing file path to file hash mappings"""
@@ -150,14 +172,28 @@ def manifest_dict(self) -> dict[Path, str]:
150172 def manifest_ok (self ) -> bool :
151173 """Iterate over the manifest and check all the file hashes against the files in the breakout
152174
153- In an actual bag-it bag, it would be an error for extra stuff to be in the data directory.
154- For example, a .DS_Store file if you looked at the breakout data directory on a mac.
155- This ignores anything not in the manifest file.
175+ See :py:class:`PayloadStrictness` for how to control behavior.
156176
157177 This returns True if both all the files in the manifest are present and their md5 hashes match.
158178
159179 This is one of the checks that goes into the stoplight report.
160180 """
181+ flex_files = {
182+ ".DS_Store" ,
183+ }
184+ logger .info (f"Payload validation mode: { self .payload } " )
185+ err_message = "Files are in payload directory and not in manifest, breakout is likely invalid or corrupted"
186+ for root , _ , files in self .payload_path .walk ():
187+ paths = {root / file for file in files }
188+ diff = paths - self .manifest_dict .keys ()
189+ if self .payload == "strict" and any (diff ):
190+ logger .critical (err_message )
191+ return False
192+
193+ if self .payload == "flex" and not all (d .name in flex_files for d in diff ):
194+ logger .critical (err_message )
195+ return False
196+
161197 for file_path , manifest_hash in self .manifest_dict .items ():
162198 if not file_path .exists ():
163199 return False
0 commit comments