16
16
17
17
ZARR_SCHEMA_FORMAT_VERSION = "0.5"
18
18
DEFAULT_ZARR_COMPRESSOR = numcodecs .Blosc (cname = "zstd" , clevel = 7 )
19
+ DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs .Blosc (
20
+ cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .BITSHUFFLE
21
+ )
22
+ DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs .Blosc (
23
+ cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .BITSHUFFLE
24
+ )
19
25
20
26
_fixed_field_descriptions = {
21
27
"variant_contig" : "An identifier from the reference genome or an angle-bracketed ID"
@@ -93,8 +99,8 @@ class ZarrArraySpec:
93
99
chunks : tuple
94
100
dimensions : tuple
95
101
description : str
96
- compressor : dict
97
- filters : list
102
+ compressor : dict = None
103
+ filters : list = None
98
104
source : str = None
99
105
100
106
def __post_init__ (self ):
@@ -105,15 +111,7 @@ def __post_init__(self):
105
111
self .shape = tuple (self .shape )
106
112
self .chunks = tuple (self .chunks )
107
113
self .dimensions = tuple (self .dimensions )
108
- self .filters = tuple (self .filters )
109
-
110
- @staticmethod
111
- def new (** kwargs ):
112
- spec = ZarrArraySpec (
113
- ** kwargs , compressor = DEFAULT_ZARR_COMPRESSOR .get_config (), filters = []
114
- )
115
- spec ._choose_compressor_settings ()
116
- return spec
114
+ self .filters = tuple (self .filters ) if self .filters is not None else None
117
115
118
116
@staticmethod
119
117
def from_field (
@@ -124,6 +122,8 @@ def from_field(
124
122
variants_chunk_size ,
125
123
samples_chunk_size ,
126
124
array_name = None ,
125
+ compressor = None ,
126
+ filters = None ,
127
127
):
128
128
shape = [num_variants ]
129
129
prefix = "variant_"
@@ -150,39 +150,18 @@ def from_field(
150
150
dimensions .append ("genotypes" )
151
151
else :
152
152
dimensions .append (f"{ vcf_field .category } _{ vcf_field .name } _dim" )
153
- return ZarrArraySpec . new (
153
+ return ZarrArraySpec (
154
154
source = vcf_field .full_name ,
155
155
name = array_name ,
156
156
dtype = vcf_field .smallest_dtype (),
157
157
shape = shape ,
158
158
chunks = chunks ,
159
159
dimensions = dimensions ,
160
160
description = vcf_field .description ,
161
+ compressor = compressor ,
162
+ filters = filters ,
161
163
)
162
164
163
- def _choose_compressor_settings (self ):
164
- """
165
- Choose compressor and filter settings based on the size and
166
- type of the array, plus some heuristics from observed properties
167
- of VCFs.
168
-
169
- See https://github.com/pystatgen/bio2zarr/discussions/74
170
- """
171
- # Default is to not shuffle, because autoshuffle isn't recognised
172
- # by many Zarr implementations, and shuffling can lead to worse
173
- # performance in some cases anyway. Turning on shuffle should be a
174
- # deliberate choice.
175
- shuffle = numcodecs .Blosc .NOSHUFFLE
176
- if self .name == "call_genotype" and self .dtype == "i1" :
177
- # call_genotype gets BITSHUFFLE by default as it gets
178
- # significantly better compression (at a cost of slower
179
- # decoding)
180
- shuffle = numcodecs .Blosc .BITSHUFFLE
181
- elif self .dtype == "bool" :
182
- shuffle = numcodecs .Blosc .BITSHUFFLE
183
-
184
- self .compressor ["shuffle" ] = shuffle
185
-
186
165
@property
187
166
def chunk_nbytes (self ):
188
167
"""
@@ -240,16 +219,24 @@ class VcfZarrSchema(core.JsonDataclass):
240
219
samples_chunk_size : int
241
220
variants_chunk_size : int
242
221
fields : list
222
+ defaults : dict
243
223
244
224
def __init__ (
245
225
self ,
246
226
format_version : str ,
247
227
fields : list ,
248
228
variants_chunk_size : int = None ,
249
229
samples_chunk_size : int = None ,
230
+ defaults : dict = None ,
250
231
):
251
232
self .format_version = format_version
252
233
self .fields = fields
234
+ defaults = defaults .copy () if defaults is not None else {}
235
+ if defaults .get ("compressor" , None ) is None :
236
+ defaults ["compressor" ] = DEFAULT_ZARR_COMPRESSOR .get_config ()
237
+ if defaults .get ("filters" , None ) is None :
238
+ defaults ["filters" ] = []
239
+ self .defaults = defaults
253
240
if variants_chunk_size is None :
254
241
variants_chunk_size = 1000
255
242
self .variants_chunk_size = variants_chunk_size
@@ -533,7 +520,7 @@ def init(
533
520
534
521
total_chunks = 0
535
522
for field in self .schema .fields :
536
- a = self .init_array (root , field , partitions [- 1 ].stop )
523
+ a = self .init_array (root , self . metadata . schema , field , partitions [- 1 ].stop )
537
524
total_chunks += a .nchunks
538
525
539
526
logger .info ("Writing WIP metadata" )
@@ -600,9 +587,20 @@ def encode_filters(self, root):
600
587
)
601
588
array .attrs ["_ARRAY_DIMENSIONS" ] = ["filters" ]
602
589
603
- def init_array (self , root , array_spec , variants_dim_size ):
590
+ def init_array (self , root , schema , array_spec , variants_dim_size ):
604
591
kwargs = dict (zarr_utils .ZARR_FORMAT_KWARGS )
605
- filters = [numcodecs .get_codec (filt ) for filt in array_spec .filters ]
592
+ filters = (
593
+ array_spec .filters
594
+ if array_spec .filters is not None
595
+ else schema .defaults ["filters" ]
596
+ )
597
+ filters = [numcodecs .get_codec (filt ) for filt in filters ]
598
+ compressor = (
599
+ array_spec .compressor
600
+ if array_spec .compressor is not None
601
+ else schema .defaults ["compressor" ]
602
+ )
603
+ compressor = numcodecs .get_codec (compressor )
606
604
if array_spec .dtype == "O" :
607
605
if zarr_utils .zarr_v3 ():
608
606
filters = [* list (filters ), numcodecs .VLenUTF8 ()]
@@ -620,7 +618,7 @@ def init_array(self, root, array_spec, variants_dim_size):
620
618
shape = shape ,
621
619
chunks = array_spec .chunks ,
622
620
dtype = array_spec .dtype ,
623
- compressor = numcodecs . get_codec ( array_spec . compressor ) ,
621
+ compressor = compressor ,
624
622
filters = filters ,
625
623
** kwargs ,
626
624
)
0 commit comments