Skip to content

Commit 8874dea

Browse files
Merge pull request #86 from jeromekelleher/improved-schema-defaults
Improved schema defaults
2 parents 1c7faff + f2796b1 commit 8874dea

File tree

5 files changed

+173
-39
lines changed

5 files changed

+173
-39
lines changed

bio2zarr/core.py

Lines changed: 0 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -16,12 +16,6 @@
1616

1717
numcodecs.blosc.use_threads = False
1818

19-
# TODO this should probably go in another module where we abstract
20-
# out the zarr defaults
21-
default_compressor = numcodecs.Blosc(
22-
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
23-
)
24-
2519

2620
def chunk_aligned_slices(z, n, max_chunks=None):
2721
"""

bio2zarr/plink.py

Lines changed: 12 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import zarr
66
import bed_reader
7+
import numcodecs
78

89
from . import core
910

@@ -82,11 +83,16 @@ def convert(
8283
chunks = [variants_chunk_size, samples_chunk_size]
8384
dimensions = ["variants", "samples"]
8485

86+
# TODO we should be reusing some logic from vcfzarr here on laying
87+
# out the basic dataset, and using the schema generator. Currently
88+
# we're not using the best Blosc settings for genotypes here.
89+
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
90+
8591
a = root.array(
8692
"sample_id",
8793
bed.iid,
8894
dtype="str",
89-
compressor=core.default_compressor,
95+
compressor=default_compressor,
9096
chunks=(samples_chunk_size,),
9197
)
9298
a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -98,7 +104,7 @@ def convert(
98104
"variant_position",
99105
bed.bp_position,
100106
dtype=np.int32,
101-
compressor=core.default_compressor,
107+
compressor=default_compressor,
102108
chunks=(variants_chunk_size,),
103109
)
104110
a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
@@ -109,7 +115,7 @@ def convert(
109115
"variant_allele",
110116
alleles,
111117
dtype="str",
112-
compressor=core.default_compressor,
118+
compressor=default_compressor,
113119
chunks=(variants_chunk_size,),
114120
)
115121
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
@@ -121,7 +127,7 @@ def convert(
121127
dtype="bool",
122128
shape=list(shape),
123129
chunks=list(chunks),
124-
compressor=core.default_compressor,
130+
compressor=default_compressor,
125131
)
126132
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
127133

@@ -132,7 +138,7 @@ def convert(
132138
dtype="i1",
133139
shape=list(shape),
134140
chunks=list(chunks),
135-
compressor=core.default_compressor,
141+
compressor=default_compressor,
136142
)
137143
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
138144

@@ -141,7 +147,7 @@ def convert(
141147
dtype="bool",
142148
shape=list(shape),
143149
chunks=list(chunks),
144-
compressor=core.default_compressor,
150+
compressor=default_compressor,
145151
)
146152
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
147153

bio2zarr/vcf.py

Lines changed: 41 additions & 17 deletions
Original file line number · Diff line number · Diff line change
@@ -1049,6 +1049,9 @@ def inspect(path):
10491049
return obj.summary_table()
10501050

10511051

1052+
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
1053+
1054+
10521055
@dataclasses.dataclass
10531056
class ZarrColumnSpec:
10541057
name: str
@@ -1058,20 +1061,46 @@ class ZarrColumnSpec:
10581061
dimensions: list
10591062
description: str
10601063
vcf_field: str
1061-
compressor: dict
1064+
compressor: dict = None
1065+
filters: list = None
10621066
# TODO add filters
10631067

10641068
def __post_init__(self):
10651069
self.shape = tuple(self.shape)
10661070
self.chunks = tuple(self.chunks)
10671071
self.dimensions = tuple(self.dimensions)
1072+
self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
1073+
self.filters = []
1074+
self._choose_compressor_settings()
1075+
1076+
def _choose_compressor_settings(self):
1077+
"""
1078+
Choose compressor and filter settings based on the size and
1079+
type of the array, plus some hueristics from observed properties
1080+
of VCFs.
1081+
1082+
See https://github.com/pystatgen/bio2zarr/discussions/74
1083+
"""
1084+
dt = np.dtype(self.dtype)
1085+
# Default is to not shuffle, because autoshuffle isn't recognised
1086+
# by many Zarr implementations, and shuffling can lead to worse
1087+
# performance in some cases anyway. Turning on shuffle should be a
1088+
# deliberate choice.
1089+
shuffle = numcodecs.Blosc.NOSHUFFLE
1090+
if dt.itemsize == 1:
1091+
# Any 1 byte field gets BITSHUFFLE by default
1092+
shuffle = numcodecs.Blosc.BITSHUFFLE
1093+
self.compressor["shuffle"] = shuffle
1094+
1095+
if dt.name == "bool":
1096+
self.filters.append(numcodecs.PackBits().get_config())
10681097

10691098

10701099
ZARR_SCHEMA_FORMAT_VERSION = "0.2"
10711100

10721101

10731102
@dataclasses.dataclass
1074-
class ZarrConversionSpec:
1103+
class VcfZarrSchema:
10751104
format_version: str
10761105
samples_chunk_size: int
10771106
variants_chunk_size: int
@@ -1095,15 +1124,15 @@ def fromdict(d):
10951124
"Zarr schema format version mismatch: "
10961125
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
10971126
)
1098-
ret = ZarrConversionSpec(**d)
1127+
ret = VcfZarrSchema(**d)
10991128
ret.columns = {
11001129
key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
11011130
}
11021131
return ret
11031132

11041133
@staticmethod
11051134
def fromjson(s):
1106-
return ZarrConversionSpec.fromdict(json.loads(s))
1135+
return VcfZarrSchema.fromdict(json.loads(s))
11071136

11081137
@staticmethod
11091138
def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
@@ -1117,7 +1146,6 @@ def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
11171146
logger.info(
11181147
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
11191148
)
1120-
compressor = core.default_compressor.get_config()
11211149

11221150
def fixed_field_spec(
11231151
name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1130,7 +1158,6 @@ def fixed_field_spec(
11301158
description="",
11311159
dimensions=dimensions,
11321160
chunks=[variants_chunk_size],
1133-
compressor=compressor,
11341161
)
11351162

11361163
alt_col = pcvcf.columns["ALT"]
@@ -1206,7 +1233,6 @@ def fixed_field_spec(
12061233
chunks=chunks,
12071234
dimensions=dimensions,
12081235
description=field.description,
1209-
compressor=compressor,
12101236
)
12111237
colspecs.append(colspec)
12121238

@@ -1225,7 +1251,6 @@ def fixed_field_spec(
12251251
chunks=list(chunks),
12261252
dimensions=list(dimensions),
12271253
description="",
1228-
compressor=compressor,
12291254
)
12301255
)
12311256
shape += [ploidy]
@@ -1239,7 +1264,6 @@ def fixed_field_spec(
12391264
chunks=list(chunks),
12401265
dimensions=list(dimensions),
12411266
description="",
1242-
compressor=compressor,
12431267
)
12441268
)
12451269
colspecs.append(
@@ -1251,11 +1275,10 @@ def fixed_field_spec(
12511275
chunks=list(chunks),
12521276
dimensions=list(dimensions),
12531277
description="",
1254-
compressor=compressor,
12551278
)
12561279
)
12571280

1258-
return ZarrConversionSpec(
1281+
return VcfZarrSchema(
12591282
format_version=ZARR_SCHEMA_FORMAT_VERSION,
12601283
samples_chunk_size=samples_chunk_size,
12611284
variants_chunk_size=variants_chunk_size,
@@ -1328,6 +1351,7 @@ def init_array(self, variable):
13281351
chunks=variable.chunks,
13291352
dtype=variable.dtype,
13301353
compressor=numcodecs.get_codec(variable.compressor),
1354+
filters=[numcodecs.get_codec(filt) for filt in variable.filters],
13311355
object_codec=object_codec,
13321356
)
13331357
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
@@ -1446,7 +1470,7 @@ def encode_samples(self):
14461470
"sample_id",
14471471
self.schema.sample_id,
14481472
dtype="str",
1449-
compressor=core.default_compressor,
1473+
compressor=DEFAULT_ZARR_COMPRESSOR,
14501474
chunks=(self.schema.samples_chunk_size,),
14511475
)
14521476
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -1457,7 +1481,7 @@ def encode_contig_id(self):
14571481
"contig_id",
14581482
self.schema.contig_id,
14591483
dtype="str",
1460-
compressor=core.default_compressor,
1484+
compressor=DEFAULT_ZARR_COMPRESSOR,
14611485
)
14621486
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
14631487
if self.schema.contig_length is not None:
@@ -1474,7 +1498,7 @@ def encode_filter_id(self):
14741498
"filter_id",
14751499
self.schema.filter_id,
14761500
dtype="str",
1477-
compressor=core.default_compressor,
1501+
compressor=DEFAULT_ZARR_COMPRESSOR,
14781502
)
14791503
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
14801504
return {v: j for j, v in enumerate(self.schema.filter_id)}
@@ -1647,7 +1671,7 @@ def service_completed_futures():
16471671

16481672
def mkschema(if_path, out):
16491673
pcvcf = PickleChunkedVcf.load(if_path)
1650-
spec = ZarrConversionSpec.generate(pcvcf)
1674+
spec = VcfZarrSchema.generate(pcvcf)
16511675
out.write(spec.asjson())
16521676

16531677

@@ -1664,7 +1688,7 @@ def encode(
16641688
):
16651689
pcvcf = PickleChunkedVcf.load(if_path)
16661690
if schema_path is None:
1667-
schema = ZarrConversionSpec.generate(
1691+
schema = VcfZarrSchema.generate(
16681692
pcvcf,
16691693
variants_chunk_size=variants_chunk_size,
16701694
samples_chunk_size=samples_chunk_size,
@@ -1674,7 +1698,7 @@ def encode(
16741698
if variants_chunk_size is not None or samples_chunk_size is not None:
16751699
raise ValueError("Cannot specify schema along with chunk sizes")
16761700
with open(schema_path, "r") as f:
1677-
schema = ZarrConversionSpec.fromjson(f.read())
1701+
schema = VcfZarrSchema.fromjson(f.read())
16781702
zarr_path = pathlib.Path(zarr_path)
16791703
if zarr_path.exists():
16801704
logger.warning(f"Deleting existing {zarr_path}")

tests/test_pcvcf.py

Lines changed: 5 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -27,8 +27,8 @@ def test_mkschema(self, tmp_path, pcvcf):
2727
with open(schema_file, "w") as f:
2828
vcf.mkschema(pcvcf.path, f)
2929
with open(schema_file, "r") as f:
30-
schema1 = vcf.ZarrConversionSpec.fromjson(f.read())
31-
schema2 = vcf.ZarrConversionSpec.generate(pcvcf)
30+
schema1 = vcf.VcfZarrSchema.fromjson(f.read())
31+
schema2 = vcf.VcfZarrSchema.generate(pcvcf)
3232
assert schema1 == schema2
3333

3434
def test_summary_table(self, pcvcf):
@@ -95,7 +95,7 @@ def pcvcf(self, tmp_path_factory):
9595

9696
@pytest.fixture(scope="class")
9797
def schema(self, pcvcf):
98-
return vcf.ZarrConversionSpec.generate(pcvcf)
98+
return vcf.VcfZarrSchema.generate(pcvcf)
9999

100100
@pytest.mark.parametrize(
101101
("name", "dtype", "shape"),
@@ -165,8 +165,8 @@ def test_repr(self, pcvcf):
165165

166166
def test_pos_repr(self, pcvcf):
167167
assert repr(pcvcf["POS"]).startswith(
168-
"PickleChunkedVcfField(name=POS, partition_chunks=[8, 8, 8, 8, 8], path=")
169-
168+
"PickleChunkedVcfField(name=POS, partition_chunks=[8, 8, 8, 8, 8], path="
169+
)
170170

171171
def test_partition_record_index(self, pcvcf):
172172
nt.assert_array_equal(

0 commit comments

Comments (0)