Skip to content

Commit f1edded

Browse files
Add basic provenance to icf metadata
1 parent 8cfe9ad commit f1edded

File tree

2 files changed

+25
-10
lines changed

2 files changed

+25
-10
lines changed

bio2zarr/vcf.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,9 @@ class VcfPartition:
150150

151151

152152
ICF_METADATA_FORMAT_VERSION = "0.2"
153-
ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(cname="lz4", clevel=7).get_config()
153+
ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
154+
cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
155+
).get_config()
154156

155157

156158
@dataclasses.dataclass
@@ -165,6 +167,7 @@ class IcfMetadata:
165167
format_version: str = None
166168
compressor: dict = None
167169
column_chunk_size: int = None
170+
provenance: dict = None
168171

169172
@property
170173
def info_fields(self):
@@ -334,6 +337,9 @@ def scan_vcfs(
334337
icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
335338
icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
336339
icf_metadata.column_chunk_size = column_chunk_size
340+
# Bare minimum here for provenance - would be nice to include versions of key
341+
# dependencies as well.
342+
icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
337343
logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
338344
return icf_metadata, header
339345

@@ -677,8 +683,6 @@ def values(self):
677683
j = 0
678684
for partition_id in range(self.num_partitions):
679685
for chunk in self.chunks(partition_id):
680-
# for chunk_path in self.chunk_files(partition_id):
681-
# chunk = self.read_chunk(chunk_path)
682686
for record in chunk:
683687
ret[j] = record
684688
j += 1
@@ -995,7 +999,7 @@ def process_partition(self, partition_index):
995999
# NOTE to do this properly we probably need to take a lock on this file - but
9961000
# this simple approach will catch the vast majority of problems.
9971001
if summary_path.exists():
998-
summary_path.unlink() # NEEDS TEST
1002+
summary_path.unlink()
9991003

10001004
partition = self.metadata.partitions[partition_index]
10011005
logger.info(

tests/test_icf.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import numcodecs
88

99
from bio2zarr import vcf
10+
from bio2zarr import provenance
1011

1112

1213
class TestSmallExample:
@@ -26,6 +27,14 @@ def icf(self, tmp_path_factory):
2627
out = tmp_path_factory.mktemp("data") / "example.exploded"
2728
return vcf.explode([self.data_path], out)
2829

30+
def test_format_version(self, icf):
31+
assert icf.metadata.format_version == vcf.ICF_METADATA_FORMAT_VERSION
32+
33+
def test_provenance(self, icf):
34+
assert icf.metadata.provenance == {
35+
"source": f"bio2zarr-{provenance.__version__}"
36+
}
37+
2938
def test_mkschema(self, tmp_path, icf):
3039
schema_file = tmp_path / "schema.json"
3140
with open(schema_file, "w") as f:
@@ -252,7 +261,7 @@ def test_missing_field(self, tmp_path):
252261
def test_missing_chunk_index(self, tmp_path):
253262
icf_path = tmp_path / "icf"
254263
vcf.explode([self.data_path], icf_path)
255-
chunk_index_path = icf_path / "POS"/ "p0" / "chunk_index"
264+
chunk_index_path = icf_path / "POS" / "p0" / "chunk_index"
256265
assert chunk_index_path.exists()
257266
chunk_index_path.unlink()
258267
icf = vcf.IntermediateColumnarFormat(icf_path)
@@ -262,7 +271,7 @@ def test_missing_chunk_index(self, tmp_path):
262271
def test_missing_chunk_file(self, tmp_path):
263272
icf_path = tmp_path / "icf"
264273
vcf.explode([self.data_path], icf_path)
265-
chunk_file = icf_path / "POS"/ "p0" / "2"
274+
chunk_file = icf_path / "POS" / "p0" / "2"
266275
assert chunk_file.exists()
267276
chunk_file.unlink()
268277
icf = vcf.IntermediateColumnarFormat(icf_path)
@@ -272,7 +281,7 @@ def test_missing_chunk_file(self, tmp_path):
272281
def test_empty_chunk_file(self, tmp_path):
273282
icf_path = tmp_path / "icf"
274283
vcf.explode([self.data_path], icf_path)
275-
chunk_file = icf_path / "POS"/ "p0" / "2"
284+
chunk_file = icf_path / "POS" / "p0" / "2"
276285
assert chunk_file.exists()
277286
with open(chunk_file, "w") as f:
278287
pass
@@ -284,7 +293,7 @@ def test_empty_chunk_file(self, tmp_path):
284293
def test_truncated_chunk_file(self, tmp_path, length):
285294
icf_path = tmp_path / "icf"
286295
vcf.explode([self.data_path], icf_path)
287-
chunk_file = icf_path / "POS"/ "p0" / "2"
296+
chunk_file = icf_path / "POS" / "p0" / "2"
288297
with open(chunk_file, "rb") as f:
289298
buff = f.read(length)
290299
assert len(buff) == length
@@ -298,7 +307,7 @@ def test_truncated_chunk_file(self, tmp_path, length):
298307
def test_chunk_incorrect_length(self, tmp_path):
299308
icf_path = tmp_path / "icf"
300309
vcf.explode([self.data_path], icf_path)
301-
chunk_file = icf_path / "POS"/ "p0" / "2"
310+
chunk_file = icf_path / "POS" / "p0" / "2"
302311
compressor = numcodecs.Blosc(cname="lz4")
303312
with open(chunk_file, "rb") as f:
304313
pkl = compressor.decode(f.read())
@@ -321,7 +330,9 @@ class TestSlicing:
321330
@pytest.fixture(scope="class")
322331
def icf(self, tmp_path_factory):
323332
out = tmp_path_factory.mktemp("data") / "example.exploded"
324-
return vcf.explode([self.data_path], out, column_chunk_size=0.0125, worker_processes=0)
333+
return vcf.explode(
334+
[self.data_path], out, column_chunk_size=0.0125, worker_processes=0
335+
)
325336

326337
def test_repr(self, icf):
327338
assert repr(icf).startswith(

0 commit comments

Comments
 (0)