|
40 | 40 | )
|
41 | 41 |
|
42 | 42 |
|
| 43 | +def display_number(x): |
| 44 | + ret = "n/a" |
| 45 | + if math.isfinite(x): |
| 46 | + ret = f"{x: 0.2g}" |
| 47 | + return ret |
| 48 | + |
| 49 | + |
| 50 | +def display_size(n): |
| 51 | + return humanfriendly.format_size(n) |
| 52 | + |
| 53 | + |
43 | 54 | @dataclasses.dataclass
|
44 | 55 | class VcfFieldSummary:
|
45 | 56 | num_chunks: int = 0
|
@@ -812,15 +823,6 @@ def __len__(self):
|
812 | 823 | return len(self.columns)
|
813 | 824 |
|
814 | 825 | def summary_table(self):
|
815 |
| - def display_number(x): |
816 |
| - ret = "n/a" |
817 |
| - if math.isfinite(x): |
818 |
| - ret = f"{x: 0.2g}" |
819 |
| - return ret |
820 |
| - |
821 |
| - def display_size(n): |
822 |
| - return humanfriendly.format_size(n) |
823 |
| - |
824 | 826 | data = []
|
825 | 827 | for name, col in self.columns.items():
|
826 | 828 | summary = col.vcf_field.summary
|
@@ -1028,10 +1030,16 @@ def explode(
|
1028 | 1030 | return PickleChunkedVcf.load(out_path)
|
1029 | 1031 |
|
1030 | 1032 |
|
1031 |
| -def inspect(if_path): |
| 1033 | +def inspect(path): |
| 1034 | + path = pathlib.Path(path) |
1032 | 1035 | # TODO add support for the Zarr format also
|
1033 |
| - pcvcf = PickleChunkedVcf.load(if_path) |
1034 |
| - return pcvcf.summary_table() |
| 1036 | + if (path / "metadata.json").exists(): |
| 1037 | + obj = PickleChunkedVcf.load(path) |
| 1038 | + elif (path / ".zmetadata").exists(): |
| 1039 | + obj = VcfZarr(path) |
| 1040 | + else: |
| 1041 | + raise ValueError("Format not recognised") |
| 1042 | + return obj.summary_table() |
1035 | 1043 |
|
1036 | 1044 |
|
1037 | 1045 | @dataclasses.dataclass
|
@@ -1244,6 +1252,40 @@ def fixed_field_spec(
|
1244 | 1252 | )
|
1245 | 1253 |
|
1246 | 1254 |
|
| 1255 | +class VcfZarr: |
| 1256 | + def __init__(self, path): |
| 1257 | + if not (path / ".zmetadata").exists(): |
| 1258 | + raise ValueError("Not in VcfZarr format") |
| 1259 | + self.root = zarr.open(path, mode="r") |
| 1260 | + |
| 1261 | + def __repr__(self): |
| 1262 | + return repr(self.root) |
| 1263 | + |
| 1264 | + def summary_table(self): |
| 1265 | + data = [] |
| 1266 | + arrays = [(a.nbytes_stored, a) for _, a in self.root.arrays()] |
| 1267 | + arrays.sort(key=lambda x: x[0]) |
| 1268 | + for stored, array in reversed(arrays): |
| 1269 | + d = { |
| 1270 | + "name": array.name, |
| 1271 | + "dtype": str(array.dtype), |
| 1272 | + "stored": display_size(stored), |
| 1273 | + "size": display_size(array.nbytes), |
| 1274 | + "ratio": display_number(array.nbytes / stored), |
| 1275 | + "nchunks": str(array.nchunks), |
| 1276 | + "chunk_size": display_size(array.nbytes / array.nchunks), |
| 1277 | + "avg_chunk_stored": display_size(int(stored / array.nchunks)), |
| 1278 | + "shape": str(array.shape), |
| 1279 | + "chunk_shape": str(array.chunks), |
| 1280 | + "compressor": str(array.compressor), |
| 1281 | + "filters": str(array.filters), |
| 1282 | + } |
| 1283 | + data.append(d) |
| 1284 | + return data |
| 1285 | + |
| 1286 | + |
| 1287 | +# TODO refactor this into a VcfZarrWriter class, and get rid of the |
| 1288 | +# static methods. |
1247 | 1289 | class SgvcfZarr:
|
1248 | 1290 | def __init__(self, path):
|
1249 | 1291 | self.path = pathlib.Path(path)
|
|
0 commit comments