Skip to content

Commit c20eab3

Browse files
Add Zarr summary table
Closes #39
1 parent c404974 commit c20eab3

File tree

1 file changed

+54
-12
lines changed

1 file changed

+54
-12
lines changed

bio2zarr/vcf.py

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@
4040
)
4141

4242

43+
def display_number(x):
    """Format a number for display in a summary table.

    Finite values are rendered with two significant digits and a leading
    space in place of a plus sign; non-finite values (NaN, +/-inf) are
    rendered as ``"n/a"``.
    """
    if not math.isfinite(x):
        return "n/a"
    return f"{x: 0.2g}"
48+
49+
50+
def display_size(n):
    """Return ``n`` (a size in bytes) as a human-readable string.

    Formatting is delegated to ``humanfriendly.format_size``.
    """
    formatted = humanfriendly.format_size(n)
    return formatted
52+
53+
4354
@dataclasses.dataclass
4455
class VcfFieldSummary:
4556
num_chunks: int = 0
@@ -812,15 +823,6 @@ def __len__(self):
812823
return len(self.columns)
813824

814825
def summary_table(self):
815-
def display_number(x):
816-
ret = "n/a"
817-
if math.isfinite(x):
818-
ret = f"{x: 0.2g}"
819-
return ret
820-
821-
def display_size(n):
822-
return humanfriendly.format_size(n)
823-
824826
data = []
825827
for name, col in self.columns.items():
826828
summary = col.vcf_field.summary
@@ -1028,10 +1030,16 @@ def explode(
10281030
return PickleChunkedVcf.load(out_path)
10291031

10301032

1031-
def inspect(if_path):
1033+
def inspect(path):
    """Return the summary table for the dataset stored at ``path``.

    The on-disk format is detected from a marker file in the directory:
    ``metadata.json`` is loaded with ``PickleChunkedVcf.load`` and
    ``.zmetadata`` is opened as a ``VcfZarr``.

    Raises:
        ValueError: if neither marker file exists under ``path``.
    """
    root = pathlib.Path(path)
    if (root / "metadata.json").exists():
        return PickleChunkedVcf.load(root).summary_table()
    if (root / ".zmetadata").exists():
        return VcfZarr(root).summary_table()
    raise ValueError("Format not recognised")
10351043

10361044

10371045
@dataclasses.dataclass
@@ -1244,6 +1252,40 @@ def fixed_field_spec(
12441252
)
12451253

12461254

1255+
class VcfZarr:
    """Read-only handle on a Zarr directory that contains ``.zmetadata``.

    The store is opened with ``zarr.open(..., mode="r")`` and exposed as
    ``self.root``.
    """

    def __init__(self, path):
        """Open the Zarr store at ``path``.

        Raises:
            ValueError: if ``path`` does not contain a ``.zmetadata`` file.
        """
        # Coerce so that plain string paths work too; the "/" join below
        # raises TypeError on a str.
        path = pathlib.Path(path)
        if not (path / ".zmetadata").exists():
            raise ValueError("Not in VcfZarr format")
        self.root = zarr.open(path, mode="r")

    def __repr__(self):
        return repr(self.root)

    def summary_table(self):
        """Return one dict of display strings per array in the root group.

        Rows are ordered from largest to smallest stored size. Ratios and
        per-chunk averages that cannot be computed (zero or unknown stored
        size, or an array with no chunks) are reported as ``"n/a"`` or a
        zero size instead of raising ``ZeroDivisionError``.
        """
        data = []
        arrays = [(a.nbytes_stored, a) for _, a in self.root.arrays()]
        # Ascending sort followed by reversed() keeps the original ordering
        # of arrays that have equal stored size.
        arrays.sort(key=lambda x: x[0])
        for stored, array in reversed(arrays):
            nchunks = array.nchunks
            # A non-positive stored size means empty or unknown, so there is
            # no meaningful compression ratio; NaN is displayed as "n/a".
            ratio = array.nbytes / stored if stored > 0 else math.nan
            # Zero-length arrays have no chunks; report zero-sized averages.
            chunk_size = array.nbytes / nchunks if nchunks > 0 else 0
            avg_chunk_stored = int(stored / nchunks) if nchunks > 0 else 0
            d = {
                "name": array.name,
                "dtype": str(array.dtype),
                "stored": display_size(stored),
                "size": display_size(array.nbytes),
                "ratio": display_number(ratio),
                "nchunks": str(nchunks),
                "chunk_size": display_size(chunk_size),
                "avg_chunk_stored": display_size(avg_chunk_stored),
                "shape": str(array.shape),
                "chunk_shape": str(array.chunks),
                "compressor": str(array.compressor),
                "filters": str(array.filters),
            }
            data.append(d)
        return data
1285+
1286+
1287+
# TODO refactor this into a VcfZarrWriter class, and get rid of the
1288+
# static methods.
12471289
class SgvcfZarr:
12481290
def __init__(self, path):
12491291
self.path = pathlib.Path(path)

0 commit comments

Comments
 (0)