Skip to content

Commit facce11

Browse files
Merge pull request #75 from jeromekelleher/zarr-info
Zarr info
2 parents c404974 + 48f15b5 commit facce11

File tree

4 files changed

+69
-13
lines changed

4 files changed

+69
-13
lines changed

bio2zarr/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
help="Chunk size in the samples dimension",
3232
)
3333

34-
version = click.version_option(version=f"bio2zarr {provenance.__version__}")
34+
# Click version option; reports only the version string taken from
# provenance.__version__, with no program-name prefix.
version = click.version_option(version=f"{provenance.__version__}")
3535

3636

3737
# Note: logging hasn't been implemented in the code at all, this is just

bio2zarr/vcf.py

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@
4040
)
4141

4242

43+
def display_number(x):
    """Format ``x`` for display in a summary table.

    Finite values are rendered with two significant digits; the space flag
    in the format spec reserves a leading column for the sign, so positive
    numbers start with a blank. Non-finite values (inf, -inf, nan) are
    rendered as ``"n/a"``.
    """
    if not math.isfinite(x):
        return "n/a"
    return f"{x: 0.2g}"
48+
49+
50+
def display_size(n):
    """Format the byte count ``n`` as a human-readable size string."""
    return humanfriendly.format_size(n)
52+
53+
4354
@dataclasses.dataclass
4455
class VcfFieldSummary:
4556
num_chunks: int = 0
@@ -812,15 +823,6 @@ def __len__(self):
812823
return len(self.columns)
813824

814825
def summary_table(self):
815-
def display_number(x):
816-
ret = "n/a"
817-
if math.isfinite(x):
818-
ret = f"{x: 0.2g}"
819-
return ret
820-
821-
def display_size(n):
822-
return humanfriendly.format_size(n)
823-
824826
data = []
825827
for name, col in self.columns.items():
826828
summary = col.vcf_field.summary
@@ -1028,10 +1030,16 @@ def explode(
10281030
return PickleChunkedVcf.load(out_path)
10291031

10301032

1031-
def inspect(path):
    """Return the summary table for the dataset stored in directory ``path``.

    The on-disk format is detected from a marker file inside ``path``:
    ``metadata.json`` selects ``PickleChunkedVcf`` and ``.zmetadata``
    selects ``VcfZarr``. ``metadata.json`` is checked first.

    Args:
        path: Directory to inspect (``str`` or ``pathlib.Path``).

    Returns:
        Whatever the detected object's ``summary_table()`` returns.

    Raises:
        ValueError: If neither marker file exists under ``path``.
    """
    path = pathlib.Path(path)
    if (path / "metadata.json").exists():
        obj = PickleChunkedVcf.load(path)
    elif (path / ".zmetadata").exists():
        obj = VcfZarr(path)
    else:
        raise ValueError("Format not recognised")
    return obj.summary_table()
10351043

10361044

10371045
@dataclasses.dataclass
@@ -1244,6 +1252,40 @@ def fixed_field_spec(
12441252
)
12451253

12461254

1255+
class VcfZarr:
    """Read-only handle on a Zarr store, used to summarise its arrays."""

    def __init__(self, path):
        """Open the Zarr hierarchy at ``path`` in read-only mode.

        Args:
            path: Directory containing the store (``str`` or ``pathlib.Path``).

        Raises:
            ValueError: If ``path`` does not contain a ``.zmetadata`` file.
        """
        # Normalise first so that a plain string path works with "/" below.
        path = pathlib.Path(path)
        if not (path / ".zmetadata").exists():
            raise ValueError("Not in VcfZarr format")
        self.root = zarr.open(path, mode="r")

    def __repr__(self):
        return repr(self.root)

    def summary_table(self):
        """Return one dict of display strings per array in the root group.

        Rows are ordered by stored size, largest first. Ratios and per-chunk
        averages that cannot be computed (nothing stored, or no chunks) are
        reported as "n/a" or a zero size instead of raising.
        """
        by_stored = sorted(
            ((arr.nbytes_stored, arr) for _, arr in self.root.arrays()),
            key=lambda pair: pair[0],
        )
        data = []
        for stored, array in reversed(by_stored):
            nchunks = array.nchunks
            # A non-positive stored size has no meaningful compression ratio;
            # NaN makes display_number() fall back to "n/a".
            ratio = array.nbytes / stored if stored > 0 else float("nan")
            # An array with no chunks would otherwise divide by zero.
            chunk_size = array.nbytes / nchunks if nchunks > 0 else 0
            avg_chunk_stored = int(stored / nchunks) if nchunks > 0 else 0
            data.append(
                {
                    "name": array.name,
                    "dtype": str(array.dtype),
                    "stored": display_size(stored),
                    "size": display_size(array.nbytes),
                    "ratio": display_number(ratio),
                    "nchunks": str(nchunks),
                    "chunk_size": display_size(chunk_size),
                    "avg_chunk_stored": display_size(avg_chunk_stored),
                    "shape": str(array.shape),
                    "chunk_shape": str(array.chunks),
                    "compressor": str(array.compressor),
                    "filters": str(array.filters),
                }
            )
        return data
1285+
1286+
1287+
# TODO refactor this into a VcfZarrWriter class, and get rid of the
1288+
# static methods.
12471289
class SgvcfZarr:
12481290
def __init__(self, path):
12491291
self.path = pathlib.Path(path)

tests/test_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def test_convert_plink(self):
127127
)
128128

129129

130+
130131
class TestVcfPartition:
131132
def test_num_parts(self):
132133
path = "tests/data/vcf/NA12878.prod.chr20snippet.g.vcf.gz"

tests/test_vcf_examples.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,19 @@ def test_worker_processes(self, ds, tmp_path, worker_processes):
334334
ds2 = sg.load_dataset(out)
335335
xt.assert_equal(ds, ds2)
336336

337+
def test_inspect(self, tmp_path):
    # TODO still a smoke test: it checks the shape of the rows, not their
    # values. More thorough checks should live somewhere else.
    out = tmp_path / "example.vcf.zarr"
    vcf.convert(
        [self.data_path],
        out,
        chunk_length=3,
    )
    data = vcf.inspect(out)
    assert len(data) > 0
    # Every row should carry the full set of columns that
    # VcfZarr.summary_table() emits.
    expected_keys = {
        "name",
        "dtype",
        "stored",
        "size",
        "ratio",
        "nchunks",
        "chunk_size",
        "avg_chunk_stored",
        "shape",
        "chunk_shape",
        "compressor",
        "filters",
    }
    for row in data:
        assert expected_keys <= set(row)
349+
337350

338351
class Test1000G2020Example:
339352
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"

0 commit comments

Comments
 (0)