@@ -1049,6 +1049,9 @@ def inspect(path):
     return obj.summary_table()
 
 
+DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+
+
 @dataclasses.dataclass
 class ZarrColumnSpec:
     name: str
@@ -1058,17 +1061,45 @@ class ZarrColumnSpec:
     dimensions: list
     description: str
     vcf_field: str
-    compressor: dict
+    compressor: dict = None
+    filters: list = None
     # TODO add filters
 
     def __post_init__(self):
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
+        self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
+        self.filters = []
+        self._choose_compressor_settings()
+
+    def _choose_compressor_settings(self):
+        """
+        Choose compressor and filter settings based on the size and
+        type of the array, plus some heuristics from observed properties
+        of VCFs.
+
+        See https://github.com/pystatgen/bio2zarr/discussions/74
+        """
+        dt = np.dtype(self.dtype)
+        # Default is to not shuffle, because autoshuffle isn't recognised
+        # by many Zarr implementations, and shuffling can lead to worse
+        # performance in some cases anyway. Turning on shuffle should be a
+        # deliberate choice.
+        shuffle = numcodecs.Blosc.NOSHUFFLE
+        if dt.itemsize == 1:
+            # Any 1 byte field gets BITSHUFFLE by default
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+        self.compressor["shuffle"] = shuffle
+
+        if dt.name == "bool":
+            self.filters.append(numcodecs.PackBits().get_config())
 
 
 ZARR_SCHEMA_FORMAT_VERSION = "0.2"
 
+# RENAME to ZarrSchema
+
 
 @dataclasses.dataclass
 class ZarrConversionSpec:
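
The effect of the new `_choose_compressor_settings` heuristic is easiest to see on a 1-byte boolean field. The sketch below is illustrative only and not part of the patch: it assumes `ZarrColumnSpec` is in scope (e.g. imported from the patched module), and the array name and VCF field are made up.

```python
# Illustrative sketch only, not part of the patch. Assumes ZarrColumnSpec is
# importable from the patched module; "variant_flag" and "INFO/FLAG" are
# made-up names.
import numcodecs

spec = ZarrColumnSpec(
    name="variant_flag",       # hypothetical array name
    dtype="bool",              # 1-byte dtype: gets BITSHUFFLE and PackBits
    shape=(100,),
    chunks=(100,),
    dimensions=("variants",),
    description="",
    vcf_field="INFO/FLAG",     # hypothetical VCF field
)
# __post_init__ installs the default zstd Blosc config, then adjusts it.
assert spec.compressor["cname"] == "zstd"
assert spec.compressor["shuffle"] == numcodecs.Blosc.BITSHUFFLE
assert spec.filters == [numcodecs.PackBits().get_config()]
```
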
@@ -1117,7 +1148,6 @@ def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
     logger.info(
         f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
     )
-    compressor = core.default_compressor.get_config()
 
     def fixed_field_spec(
         name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1130,7 +1160,6 @@ def fixed_field_spec(
             description="",
             dimensions=dimensions,
             chunks=[variants_chunk_size],
-            compressor=compressor,
         )
 
     alt_col = pcvcf.columns["ALT"]
@@ -1206,7 +1235,6 @@ def fixed_field_spec(
             chunks=chunks,
             dimensions=dimensions,
             description=field.description,
-            compressor=compressor,
         )
         colspecs.append(colspec)
 
@@ -1225,7 +1253,6 @@ def fixed_field_spec(
                 chunks=list(chunks),
                 dimensions=list(dimensions),
                 description="",
-                compressor=compressor,
             )
         )
         shape += [ploidy]
@@ -1239,7 +1266,6 @@ def fixed_field_spec(
                 chunks=list(chunks),
                 dimensions=list(dimensions),
                 description="",
-                compressor=compressor,
             )
         )
         colspecs.append(
@@ -1251,7 +1277,6 @@ def fixed_field_spec(
                 chunks=list(chunks),
                 dimensions=list(dimensions),
                 description="",
-                compressor=compressor,
             )
         )
 
@@ -1328,6 +1353,7 @@ def init_array(self, variable):
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
+            filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
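
Because the schema stores compressors and filters as numcodecs config dicts, `init_array` can rebuild the codec objects with `numcodecs.get_codec`. A small round-trip check of that idea, using only the public numcodecs API and nothing from this patch:

```python
# Round-trip of the config dicts that ZarrColumnSpec stores.
import numcodecs

filt_config = numcodecs.PackBits().get_config()      # {'id': 'packbits'}
assert isinstance(numcodecs.get_codec(filt_config), numcodecs.PackBits)

comp_config = numcodecs.Blosc(
    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
).get_config()
assert isinstance(numcodecs.get_codec(comp_config), numcodecs.Blosc)
```
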
@@ -1446,7 +1472,7 @@ def encode_samples(self):
             "sample_id",
             self.schema.sample_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -1457,7 +1483,7 @@ def encode_contig_id(self):
             "contig_id",
             self.schema.contig_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         if self.schema.contig_length is not None:
@@ -1474,7 +1500,7 @@ def encode_filter_id(self):
             "filter_id",
             self.schema.filter_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
         return {v: j for j, v in enumerate(self.schema.filter_id)}
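
The last three hunks converge on the same pattern: the string ID arrays are written with the module-level `DEFAULT_ZARR_COMPRESSOR` instead of the old `core.default_compressor`. A hedged, standalone sketch of that pattern, assuming the zarr-python 2.x API; the in-memory store and the sample IDs below are made up for illustration:

```python
import numcodecs
import zarr  # assumes zarr-python 2.x

DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)

root = zarr.group()  # in-memory group, purely for illustration
array = root.array(
    "sample_id",
    ["NA12878", "NA12891"],  # made-up sample IDs
    dtype="str",             # zarr 2.x selects a VLenUTF8 object codec for str
    compressor=DEFAULT_ZARR_COMPRESSOR,
)
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
```
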