Skip to content

Commit 3fe6a3a

Browse files
Merge pull request #43 from jeromekelleher/provenance-and-metadata
Provenance and metadata
2 parents 1b062b0 + 3094feb commit 3fe6a3a

File tree

8 files changed

+57
-13
lines changed

8 files changed

+57
-13
lines changed

bio2zarr/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
2-
from .vcf import explode as explode_vcf
1+
from . provenance import __version__

bio2zarr/__main__.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,18 @@
22

33
from . import cli
44

5+
@cli.version
56
@click.group()
6-
def top_level():
7+
def bio2zarr():
78
pass
89

910
# Provide a single top-level interface to all of the functionality.
1011
# This probably isn't the recommended way of interacting, as we
1112
# install individual commands as console scripts. However, this
1213
# is handy for development and for those whose PATHs aren't set
1314
# up in the right way.
14-
top_level.add_command(cli.vcf2zarr)
15-
top_level.add_command(cli.plink2zarr)
15+
bio2zarr.add_command(cli.vcf2zarr)
16+
bio2zarr.add_command(cli.plink2zarr)
1617

1718
if __name__ == "__main__":
18-
top_level()
19+
bio2zarr()

bio2zarr/cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from . import vcf
66
from . import plink
7+
from . import provenance
78

89
# Common arguments/options
910
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
@@ -12,6 +13,7 @@
1213
"-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
1314
)
1415

16+
version = click.version_option(version=provenance.__version__)
1517

1618
# Note: logging hasn't been implemented in the code at all, this is just
1719
# a first pass to try out some ways of doing things to see what works.
@@ -111,6 +113,7 @@ def validate(vcfs, out_path):
111113
vcf.validate(vcfs[0], out_path, show_progress=True)
112114

113115

116+
@version
114117
@click.group()
115118
def vcf2zarr():
116119
pass
@@ -145,6 +148,7 @@ def convert_plink(in_path, out_path, worker_processes, chunk_width, chunk_length
145148
)
146149

147150

151+
@version
148152
@click.group()
149153
def plink2zarr():
150154
pass

bio2zarr/provenance.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
__version__ = "undefined"
2+
try:
3+
from . import _version
4+
5+
__version__ = _version.version
6+
except ImportError: # pragma: nocover
7+
pass

bio2zarr/vcf.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import zarr
2424

2525
from . import core
26+
from . import provenance
2627

2728
logger = logging.getLogger(__name__)
2829

@@ -178,6 +179,7 @@ def make_field_def(name, vcf_type, vcf_number):
178179
def scan_vcfs(paths, show_progress):
179180
partitions = []
180181
vcf_metadata = None
182+
header = None
181183
logger.info(f"Scanning {len(paths)} VCFs")
182184
for path in tqdm.tqdm(paths, desc="Scan ", disable=not show_progress):
183185
vcf = cyvcf2.VCF(path)
@@ -215,6 +217,9 @@ def scan_vcfs(paths, show_progress):
215217

216218
if vcf_metadata is None:
217219
vcf_metadata = metadata
220+
# We just take the first header, assuming the others
221+
# are compatible.
222+
header = vcf.raw_header
218223
else:
219224
if metadata != vcf_metadata:
220225
raise ValueError("Incompatible VCF chunks")
@@ -230,7 +235,7 @@ def scan_vcfs(paths, show_progress):
230235
)
231236
partitions.sort(key=lambda x: x.first_position)
232237
vcf_metadata.partitions = partitions
233-
return vcf_metadata
238+
return vcf_metadata, header
234239

235240

236241
def sanitise_value_bool(buff, j, value):
@@ -668,9 +673,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
668673

669674

670675
class PickleChunkedVcf(collections.abc.Mapping):
671-
def __init__(self, path, metadata):
676+
def __init__(self, path, metadata, vcf_header):
672677
self.path = path
673678
self.metadata = metadata
679+
self.vcf_header = vcf_header
674680

675681
self.columns = {}
676682
for field in self.metadata.fields:
@@ -753,7 +759,9 @@ def load(path):
753759
path = pathlib.Path(path)
754760
with open(path / "metadata.json") as f:
755761
metadata = VcfMetadata.fromdict(json.load(f))
756-
return PickleChunkedVcf(path, metadata)
762+
with open(path / "header.txt") as f:
763+
header = f.read()
764+
return PickleChunkedVcf(path, metadata, header)
757765

758766
@staticmethod
759767
def convert_partition(
@@ -820,8 +828,8 @@ def convert(
820828
):
821829
out_path = pathlib.Path(out_path)
822830
# TODO make scan work in parallel using general progress code too
823-
vcf_metadata = scan_vcfs(vcfs, show_progress=show_progress)
824-
pcvcf = PickleChunkedVcf(out_path, vcf_metadata)
831+
vcf_metadata, header = scan_vcfs(vcfs, show_progress=show_progress)
832+
pcvcf = PickleChunkedVcf(out_path, vcf_metadata, header)
825833
pcvcf.mkdirs()
826834

827835
total_variants = sum(
@@ -855,6 +863,8 @@ def convert(
855863

856864
with open(out_path / "metadata.json", "w") as f:
857865
json.dump(vcf_metadata.asdict(), f, indent=4)
866+
with open(out_path / "header.txt", "w") as f:
867+
f.write(header)
858868
return pcvcf
859869

860870

@@ -1214,7 +1224,6 @@ def encode_contig(self, pcvcf, contig_names, contig_lengths):
12141224
logger.debug("Contig done")
12151225

12161226
def encode_filters(self, pcvcf, filter_names):
1217-
self.root.attrs["filters"] = filter_names
12181227
array = self.root.array(
12191228
"filter_id",
12201229
filter_names,
@@ -1277,6 +1286,10 @@ def convert(
12771286
for column in conversion_spec.columns.values():
12781287
sgvcf.create_array(column)
12791288

1289+
sgvcf.root.attrs["vcf_zarr_version"] = "0.2"
1290+
sgvcf.root.attrs["vcf_header"] = pcvcf.vcf_header
1291+
sgvcf.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
1292+
12801293
progress_config = core.ProgressConfig(
12811294
total=pcvcf.total_uncompressed_bytes,
12821295
title="Encode",

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55
# The package name along with all the other metadata is specified in setup.cfg
66
# However, GitHub's dependency graph can't see the package unless we put this here.
77
name="bio2zarr",
8-
use_scm_version=True,
8+
use_scm_version={"write_to": "bio2zarr/_version.py"},
99
)

tests/test_cli.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from unittest import mock
22

3+
import pytest
34
import click.testing as ct
45

56
from bio2zarr import cli
7+
from bio2zarr import __main__ as main
8+
from bio2zarr import provenance
69

710

811
class TestWithMocks:
@@ -117,3 +120,11 @@ def test_convert_plink(self):
117120
chunk_length=None,
118121
show_progress=True,
119122
)
123+
124+
125+
@pytest.mark.parametrize("cmd", [main.bio2zarr, cli.vcf2zarr, cli.plink2zarr])
126+
def test_version(cmd):
127+
runner = ct.CliRunner(mix_stderr=False)
128+
result = runner.invoke(cmd, ["--version"], catch_exceptions=False)
129+
s = f"version {provenance.__version__}\n"
130+
assert result.stdout.endswith(s)

tests/test_vcf_examples.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
import xarray.testing as xt
44
import pytest
55
import sgkit as sg
6+
import cyvcf2
67

78
from bio2zarr import vcf
9+
from bio2zarr import provenance
810

911

1012
class TestSmallExample:
@@ -33,6 +35,13 @@ def test_filters(self, ds):
3335
],
3436
)
3537

38+
def test_header(self, ds):
39+
vcf = cyvcf2.VCF(self.data_path)
40+
assert ds.attrs["vcf_header"] == vcf.raw_header
41+
42+
def test_source(self, ds):
43+
assert ds.attrs["source"] == f"bio2zarr-{provenance.__version__}"
44+
3645
def test_contigs(self, ds):
3746
nt.assert_array_equal(ds["contig_id"], ["19", "20", "X"])
3847
assert "contig_length" not in ds

0 commit comments

Comments
 (0)