Skip to content

Commit 17db733

Browse files
benjeffery authored and jeromekelleher committed
Add defaults to schema
1 parent ebfbbf0 commit 17db733

File tree

4 files changed

+156
-82
lines changed

4 files changed

+156
-82
lines changed

bio2zarr/icf.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,7 @@ def convert_local_allele_field_types(fields):
843843
chunks = gt.chunks[:-1]
844844
dimensions = gt.dimensions[:-1]
845845

846-
la = vcz.ZarrArraySpec.new(
846+
la = vcz.ZarrArraySpec(
847847
name="call_LA",
848848
dtype="i1",
849849
shape=gt.shape,
@@ -1064,14 +1064,20 @@ def fixed_field_spec(
10641064
dimensions=("variants",),
10651065
chunks=None,
10661066
):
1067-
return vcz.ZarrArraySpec.new(
1067+
compressor = (
1068+
vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
1069+
if dtype == "bool"
1070+
else None
1071+
)
1072+
return vcz.ZarrArraySpec(
10681073
source=source,
10691074
name=name,
10701075
dtype=dtype,
10711076
shape=shape,
10721077
description="",
10731078
dimensions=dimensions,
10741079
chunks=chunks or [schema_instance.variants_chunk_size],
1080+
compressor=compressor,
10751081
)
10761082

10771083
alt_field = self.fields["ALT"]
@@ -1135,7 +1141,7 @@ def fixed_field_spec(
11351141
]
11361142
dimensions = ["variants", "samples"]
11371143
array_specs.append(
1138-
vcz.ZarrArraySpec.new(
1144+
vcz.ZarrArraySpec(
11391145
name="call_genotype_phased",
11401146
dtype="bool",
11411147
shape=list(shape),
@@ -1148,23 +1154,25 @@ def fixed_field_spec(
11481154
chunks += [ploidy]
11491155
dimensions += ["ploidy"]
11501156
array_specs.append(
1151-
vcz.ZarrArraySpec.new(
1157+
vcz.ZarrArraySpec(
11521158
name="call_genotype",
11531159
dtype=gt_field.smallest_dtype(),
11541160
shape=list(shape),
11551161
chunks=list(chunks),
11561162
dimensions=list(dimensions),
11571163
description="",
1164+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
11581165
)
11591166
)
11601167
array_specs.append(
1161-
vcz.ZarrArraySpec.new(
1168+
vcz.ZarrArraySpec(
11621169
name="call_genotype_mask",
11631170
dtype="bool",
11641171
shape=list(shape),
11651172
chunks=list(chunks),
11661173
dimensions=list(dimensions),
11671174
description="",
1175+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
11681176
)
11691177
)
11701178

bio2zarr/plink.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def generate_schema(
8282
)
8383

8484
array_specs = [
85-
vcz.ZarrArraySpec.new(
85+
vcz.ZarrArraySpec(
8686
source="position",
8787
name="variant_position",
8888
dtype="i4",
@@ -91,15 +91,15 @@ def generate_schema(
9191
chunks=[schema_instance.variants_chunk_size],
9292
description=None,
9393
),
94-
vcz.ZarrArraySpec.new(
94+
vcz.ZarrArraySpec(
9595
name="variant_allele",
9696
dtype="O",
9797
shape=[m, 2],
9898
dimensions=["variants", "alleles"],
9999
chunks=[schema_instance.variants_chunk_size, 2],
100100
description=None,
101101
),
102-
vcz.ZarrArraySpec.new(
102+
vcz.ZarrArraySpec(
103103
name="call_genotype_phased",
104104
dtype="bool",
105105
shape=[m, n],
@@ -109,8 +109,9 @@ def generate_schema(
109109
schema_instance.samples_chunk_size,
110110
],
111111
description=None,
112+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
112113
),
113-
vcz.ZarrArraySpec.new(
114+
vcz.ZarrArraySpec(
114115
name="call_genotype",
115116
dtype="i1",
116117
shape=[m, n, 2],
@@ -121,8 +122,9 @@ def generate_schema(
121122
2,
122123
],
123124
description=None,
125+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
124126
),
125-
vcz.ZarrArraySpec.new(
127+
vcz.ZarrArraySpec(
126128
name="call_genotype_mask",
127129
dtype="bool",
128130
shape=[m, n, 2],
@@ -133,6 +135,7 @@ def generate_schema(
133135
2,
134136
],
135137
description=None,
138+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
136139
),
137140
]
138141
schema_instance.fields = array_specs

bio2zarr/vcz.py

Lines changed: 37 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@
1616

1717
ZARR_SCHEMA_FORMAT_VERSION = "0.5"
1818
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
19+
DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs.Blosc(
20+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
21+
)
22+
DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs.Blosc(
23+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
24+
)
1925

2026
_fixed_field_descriptions = {
2127
"variant_contig": "An identifier from the reference genome or an angle-bracketed ID"
@@ -93,8 +99,8 @@ class ZarrArraySpec:
9399
chunks: tuple
94100
dimensions: tuple
95101
description: str
96-
compressor: dict
97-
filters: list
102+
compressor: dict = None
103+
filters: list = None
98104
source: str = None
99105

100106
def __post_init__(self):
@@ -105,15 +111,7 @@ def __post_init__(self):
105111
self.shape = tuple(self.shape)
106112
self.chunks = tuple(self.chunks)
107113
self.dimensions = tuple(self.dimensions)
108-
self.filters = tuple(self.filters)
109-
110-
@staticmethod
111-
def new(**kwargs):
112-
spec = ZarrArraySpec(
113-
**kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
114-
)
115-
spec._choose_compressor_settings()
116-
return spec
114+
self.filters = tuple(self.filters) if self.filters is not None else None
117115

118116
@staticmethod
119117
def from_field(
@@ -124,6 +122,8 @@ def from_field(
124122
variants_chunk_size,
125123
samples_chunk_size,
126124
array_name=None,
125+
compressor=None,
126+
filters=None,
127127
):
128128
shape = [num_variants]
129129
prefix = "variant_"
@@ -150,39 +150,18 @@ def from_field(
150150
dimensions.append("genotypes")
151151
else:
152152
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
153-
return ZarrArraySpec.new(
153+
return ZarrArraySpec(
154154
source=vcf_field.full_name,
155155
name=array_name,
156156
dtype=vcf_field.smallest_dtype(),
157157
shape=shape,
158158
chunks=chunks,
159159
dimensions=dimensions,
160160
description=vcf_field.description,
161+
compressor=compressor,
162+
filters=filters,
161163
)
162164

163-
def _choose_compressor_settings(self):
164-
"""
165-
Choose compressor and filter settings based on the size and
166-
type of the array, plus some heuristics from observed properties
167-
of VCFs.
168-
169-
See https://github.com/pystatgen/bio2zarr/discussions/74
170-
"""
171-
# Default is to not shuffle, because autoshuffle isn't recognised
172-
# by many Zarr implementations, and shuffling can lead to worse
173-
# performance in some cases anyway. Turning on shuffle should be a
174-
# deliberate choice.
175-
shuffle = numcodecs.Blosc.NOSHUFFLE
176-
if self.name == "call_genotype" and self.dtype == "i1":
177-
# call_genotype gets BITSHUFFLE by default as it gets
178-
# significantly better compression (at a cost of slower
179-
# decoding)
180-
shuffle = numcodecs.Blosc.BITSHUFFLE
181-
elif self.dtype == "bool":
182-
shuffle = numcodecs.Blosc.BITSHUFFLE
183-
184-
self.compressor["shuffle"] = shuffle
185-
186165
@property
187166
def chunk_nbytes(self):
188167
"""
@@ -240,16 +219,24 @@ class VcfZarrSchema(core.JsonDataclass):
240219
samples_chunk_size: int
241220
variants_chunk_size: int
242221
fields: list
222+
defaults: dict
243223

244224
def __init__(
245225
self,
246226
format_version: str,
247227
fields: list,
248228
variants_chunk_size: int = None,
249229
samples_chunk_size: int = None,
230+
defaults: dict = None,
250231
):
251232
self.format_version = format_version
252233
self.fields = fields
234+
defaults = defaults.copy() if defaults is not None else {}
235+
if defaults.get("compressor", None) is None:
236+
defaults["compressor"] = DEFAULT_ZARR_COMPRESSOR.get_config()
237+
if defaults.get("filters", None) is None:
238+
defaults["filters"] = []
239+
self.defaults = defaults
253240
if variants_chunk_size is None:
254241
variants_chunk_size = 1000
255242
self.variants_chunk_size = variants_chunk_size
@@ -533,7 +520,7 @@ def init(
533520

534521
total_chunks = 0
535522
for field in self.schema.fields:
536-
a = self.init_array(root, field, partitions[-1].stop)
523+
a = self.init_array(root, self.metadata.schema, field, partitions[-1].stop)
537524
total_chunks += a.nchunks
538525

539526
logger.info("Writing WIP metadata")
@@ -600,9 +587,20 @@ def encode_filters(self, root):
600587
)
601588
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
602589

603-
def init_array(self, root, array_spec, variants_dim_size):
590+
def init_array(self, root, schema, array_spec, variants_dim_size):
604591
kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
605-
filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
592+
filters = (
593+
array_spec.filters
594+
if array_spec.filters is not None
595+
else schema.defaults["filters"]
596+
)
597+
filters = [numcodecs.get_codec(filt) for filt in filters]
598+
compressor = (
599+
array_spec.compressor
600+
if array_spec.compressor is not None
601+
else schema.defaults["compressor"]
602+
)
603+
compressor = numcodecs.get_codec(compressor)
606604
if array_spec.dtype == "O":
607605
if zarr_utils.zarr_v3():
608606
filters = [*list(filters), numcodecs.VLenUTF8()]
@@ -620,7 +618,7 @@ def init_array(self, root, array_spec, variants_dim_size):
620618
shape=shape,
621619
chunks=array_spec.chunks,
622620
dtype=array_spec.dtype,
623-
compressor=numcodecs.get_codec(array_spec.compressor),
621+
compressor=compressor,
624622
filters=filters,
625623
**kwargs,
626624
)

0 commit comments

Comments
 (0)