Add basic provenance to icf metadata

jeromekelleher · jeromekelleher · commit f1edded7fb97 · 2024-03-27T15:58:07.000Z
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -150,7 +150,9 @@ class VcfPartition:
 
 
 ICF_METADATA_FORMAT_VERSION = "0.2"
-ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(cname="lz4", clevel=7).get_config()
+ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
+    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+).get_config()
 
 
 @dataclasses.dataclass
@@ -165,6 +167,7 @@ class IcfMetadata:
     format_version: str = None
     compressor: dict = None
     column_chunk_size: int = None
+    provenance: dict = None
 
     @property
     def info_fields(self):
@@ -334,6 +337,9 @@ def scan_vcfs(
     icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
     icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
     icf_metadata.column_chunk_size = column_chunk_size
+    # Bare minimum for provenance: only a bio2zarr version string is recorded.
+    # TODO: also record the versions of key dependencies.
+    icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
 
@@ -677,8 +683,6 @@ def values(self):
         j = 0
         for partition_id in range(self.num_partitions):
             for chunk in self.chunks(partition_id):
-                # for chunk_path in self.chunk_files(partition_id):
-                # chunk = self.read_chunk(chunk_path)
                 for record in chunk:
                     ret[j] = record
                     j += 1
@@ -995,7 +999,7 @@ def process_partition(self, partition_index):
         # NOTE to do this properly we probably need to take a lock on this file - but
         # this simple approach will catch the vast majority of problems.
         if summary_path.exists():
-            summary_path.unlink()  # NEEDS TEST
+            summary_path.unlink()
 
         partition = self.metadata.partitions[partition_index]
         logger.info(
diff --git a/tests/test_icf.py b/tests/test_icf.py
@@ -7,6 +7,7 @@
 import numcodecs
 
 from bio2zarr import vcf
+from bio2zarr import provenance
 
 
 class TestSmallExample:
@@ -26,6 +27,14 @@ def icf(self, tmp_path_factory):
         out = tmp_path_factory.mktemp("data") / "example.exploded"
         return vcf.explode([self.data_path], out)
 
+    def test_format_version(self, icf):
+        assert icf.metadata.format_version == vcf.ICF_METADATA_FORMAT_VERSION
+
+    def test_provenance(self, icf):
+        assert icf.metadata.provenance == {
+            "source": f"bio2zarr-{provenance.__version__}"
+        }
+
     def test_mkschema(self, tmp_path, icf):
         schema_file = tmp_path / "schema.json"
         with open(schema_file, "w") as f:
@@ -252,7 +261,7 @@ def test_missing_field(self, tmp_path):
     def test_missing_chunk_index(self, tmp_path):
         icf_path = tmp_path / "icf"
         vcf.explode([self.data_path], icf_path)
-        chunk_index_path = icf_path / "POS"/ "p0" / "chunk_index"
+        chunk_index_path = icf_path / "POS" / "p0" / "chunk_index"
         assert chunk_index_path.exists()
         chunk_index_path.unlink()
         icf = vcf.IntermediateColumnarFormat(icf_path)
@@ -262,7 +271,7 @@ def test_missing_chunk_index(self, tmp_path):
     def test_missing_chunk_file(self, tmp_path):
         icf_path = tmp_path / "icf"
         vcf.explode([self.data_path], icf_path)
-        chunk_file = icf_path / "POS"/ "p0" / "2"
+        chunk_file = icf_path / "POS" / "p0" / "2"
         assert chunk_file.exists()
         chunk_file.unlink()
         icf = vcf.IntermediateColumnarFormat(icf_path)
@@ -272,7 +281,7 @@ def test_missing_chunk_file(self, tmp_path):
     def test_empty_chunk_file(self, tmp_path):
         icf_path = tmp_path / "icf"
         vcf.explode([self.data_path], icf_path)
-        chunk_file = icf_path / "POS"/ "p0" / "2"
+        chunk_file = icf_path / "POS" / "p0" / "2"
         assert chunk_file.exists()
         with open(chunk_file, "w") as f:
             pass
@@ -284,7 +293,7 @@ def test_empty_chunk_file(self, tmp_path):
     def test_truncated_chunk_file(self, tmp_path, length):
         icf_path = tmp_path / "icf"
         vcf.explode([self.data_path], icf_path)
-        chunk_file = icf_path / "POS"/ "p0" / "2"
+        chunk_file = icf_path / "POS" / "p0" / "2"
         with open(chunk_file, "rb") as f:
             buff = f.read(length)
         assert len(buff) == length
@@ -298,7 +307,7 @@ def test_truncated_chunk_file(self, tmp_path, length):
     def test_chunk_incorrect_length(self, tmp_path):
         icf_path = tmp_path / "icf"
         vcf.explode([self.data_path], icf_path)
-        chunk_file = icf_path / "POS"/ "p0" / "2"
+        chunk_file = icf_path / "POS" / "p0" / "2"
         compressor = numcodecs.Blosc(cname="lz4")
         with open(chunk_file, "rb") as f:
             pkl = compressor.decode(f.read())
@@ -321,7 +330,9 @@ class TestSlicing:
     @pytest.fixture(scope="class")
     def icf(self, tmp_path_factory):
         out = tmp_path_factory.mktemp("data") / "example.exploded"
-        return vcf.explode([self.data_path], out, column_chunk_size=0.0125, worker_processes=0)
+        return vcf.explode(
+            [self.data_path], out, column_chunk_size=0.0125, worker_processes=0
+        )
 
     def test_repr(self, icf):
         assert repr(icf).startswith(