
Commit ac4aaad

Implement basic defaults for schema
Closes #6
1 parent 1c7faff commit ac4aaad

File tree

3 files changed (+150, -20 lines)


bio2zarr/core.py

Lines changed: 0 additions & 6 deletions
@@ -16,12 +16,6 @@
 
 numcodecs.blosc.use_threads = False
 
-# TODO this should probably go in another module where we abstract
-# out the zarr defaults
-default_compressor = numcodecs.Blosc(
-    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
-)
-
 
 def chunk_aligned_slices(z, n, max_chunks=None):
     """

bio2zarr/vcf.py

Lines changed: 36 additions & 10 deletions
@@ -1049,6 +1049,9 @@ def inspect(path):
     return obj.summary_table()
 
 
+DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+
+
 @dataclasses.dataclass
 class ZarrColumnSpec:
     name: str
@@ -1058,17 +1061,45 @@ class ZarrColumnSpec:
     dimensions: list
     description: str
     vcf_field: str
-    compressor: dict
+    compressor: dict = None
+    filters: list = None
     # TODO add filters
 
     def __post_init__(self):
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
+        self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
+        self.filters = []
+        self._choose_compressor_settings()
+
+    def _choose_compressor_settings(self):
+        """
+        Choose compressor and filter settings based on the size and
+        type of the array, plus some heuristics from observed properties
+        of VCFs.
+
+        See https://github.com/pystatgen/bio2zarr/discussions/74
+        """
+        dt = np.dtype(self.dtype)
+        # Default is to not shuffle, because autoshuffle isn't recognised
+        # by many Zarr implementations, and shuffling can lead to worse
+        # performance in some cases anyway. Turning on shuffle should be a
+        # deliberate choice.
+        shuffle = numcodecs.Blosc.NOSHUFFLE
+        if dt.itemsize == 1:
+            # Any 1 byte field gets BITSHUFFLE by default
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+        self.compressor["shuffle"] = shuffle
+
+        if dt.name == "bool":
+            self.filters.append(numcodecs.PackBits().get_config())
 
 
 ZARR_SCHEMA_FORMAT_VERSION = "0.2"
 
+# RENAME to ZarrSchema
+
 
 @dataclasses.dataclass
 class ZarrConversionSpec:
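The new per-column defaults can be exercised standalone. The sketch below uses a hypothetical `choose_settings` helper (not part of the commit) to mirror the `_choose_compressor_settings` heuristic above for a few dtypes, using only numcodecs and numpy calls that already appear in the diff:

```python
import numcodecs
import numpy as np

# Assumed constant mirroring the one added in this commit.
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)


def choose_settings(dtype):
    # Start from the shared Blosc default, then adjust shuffle per dtype.
    compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
    filters = []
    dt = np.dtype(dtype)
    # 1-byte values (int8, bool, ...) get bit-shuffling; wider types are
    # left unshuffled, matching the comment in the diff above.
    compressor["shuffle"] = (
        numcodecs.Blosc.BITSHUFFLE if dt.itemsize == 1 else numcodecs.Blosc.NOSHUFFLE
    )
    if dt.name == "bool":
        # Booleans additionally get a packbits filter (1 bit per value).
        filters.append(numcodecs.PackBits().get_config())
    return compressor, filters


print(choose_settings("i2"))    # shuffle=0 (NOSHUFFLE), no filters
print(choose_settings("i1"))    # shuffle=2 (BITSHUFFLE), no filters
print(choose_settings("bool"))  # shuffle=2 plus [{'id': 'packbits'}]
```

Any 1-byte dtype (including bool) ends up with shuffle=2 (BITSHUFFLE) in the stored config, and bool columns additionally carry a packbits filter, which is what the new tests below assert.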
@@ -1117,7 +1148,6 @@ def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
         logger.info(
             f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
         )
-        compressor = core.default_compressor.get_config()
 
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1130,7 +1160,6 @@ def fixed_field_spec(
                 description="",
                 dimensions=dimensions,
                 chunks=[variants_chunk_size],
-                compressor=compressor,
             )
 
         alt_col = pcvcf.columns["ALT"]
@@ -1206,7 +1235,6 @@ def fixed_field_spec(
                 chunks=chunks,
                 dimensions=dimensions,
                 description=field.description,
-                compressor=compressor,
             )
             colspecs.append(colspec)
 
@@ -1225,7 +1253,6 @@ def fixed_field_spec(
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             shape += [ploidy]
@@ -1239,7 +1266,6 @@ def fixed_field_spec(
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             colspecs.append(
@@ -1251,7 +1277,6 @@ def fixed_field_spec(
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
 
@@ -1328,6 +1353,7 @@ def init_array(self, variable):
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
+            filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
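Because the schema stores codecs as plain config dicts, `init_array` rebuilds them with `numcodecs.get_codec` before creating each array. A self-contained sketch of that round trip, assuming zarr-python 2.x and an in-memory group (the array name, shape, and chunks here are illustrative only):

```python
# Sketch: turn stored codec configs back into codec objects for Zarr.
import numcodecs
import numpy as np
import zarr

compressor_config = {
    "id": "blosc", "cname": "zstd", "clevel": 7, "shuffle": 2, "blocksize": 0
}
filter_configs = [{"id": "packbits"}]  # as chosen for bool columns above

root = zarr.group()  # in-memory store, for illustration only
a = root.empty(
    "call_genotype_mask",
    shape=(9, 3, 2),
    chunks=(9, 3, 2),
    dtype="bool",
    compressor=numcodecs.get_codec(compressor_config),
    filters=[numcodecs.get_codec(config) for config in filter_configs],
)
a[...] = np.zeros(a.shape, dtype=bool)
print(a.compressor, a.filters)
```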
@@ -1446,7 +1472,7 @@ def encode_samples(self):
             "sample_id",
             self.schema.sample_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -1457,7 +1483,7 @@ def encode_contig_id(self):
             "contig_id",
             self.schema.contig_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         if self.schema.contig_length is not None:
@@ -1474,7 +1500,7 @@ def encode_filter_id(self):
             "filter_id",
             self.schema.filter_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
         return {v: j for j, v in enumerate(self.schema.filter_id)}

tests/test_vcf.py

Lines changed: 114 additions & 4 deletions
@@ -26,6 +26,12 @@ def schema_path(exploded_path, tmp_path_factory):
     return out
 
 
+@pytest.fixture(scope="module")
+def schema(schema_path):
+    with open(schema_path) as f:
+        return json.load(f)
+
+
 @pytest.fixture(scope="module")
 def zarr_path(exploded_path, tmp_path_factory):
     out = tmp_path_factory.mktemp("data") / "example.zarr"
@@ -35,10 +41,8 @@ def zarr_path(exploded_path, tmp_path_factory):
 
 class TestJsonVersions:
     @pytest.mark.parametrize("version", ["0.1", "1.0", "xxxxx", 0.2])
-    def test_zarr_schema_mismatch(self, schema_path, version):
-        with open(schema_path) as f:
-            d = json.load(f)
-
+    def test_zarr_schema_mismatch(self, schema, version):
+        d = dict(schema)
         d["format_version"] = version
         with pytest.raises(ValueError, match="Zarr schema format version mismatch"):
             vcf.ZarrConversionSpec.fromdict(d)
@@ -53,3 +57,109 @@ def test_exploded_metadata_mismatch(self, tmpdir, exploded_path, version):
             ValueError, match="Exploded metadata format version mismatch"
         ):
             vcf.VcfMetadata.fromdict(d)
+
+
+class TestDefaultSchema:
+    def test_format_version(self, schema):
+        assert schema["format_version"] == vcf.ZARR_SCHEMA_FORMAT_VERSION
+
+    def test_chunk_size(self, schema):
+        assert schema["samples_chunk_size"] == 1000
+        assert schema["variants_chunk_size"] == 10000
+
+    def test_dimensions(self, schema):
+        assert schema["dimensions"] == [
+            "variants",
+            "samples",
+            "ploidy",
+            "alleles",
+            "filters",
+        ]
+
+    def test_sample_id(self, schema):
+        assert schema["sample_id"] == ["NA00001", "NA00002", "NA00003"]
+
+    def test_contig_id(self, schema):
+        assert schema["contig_id"] == ["19", "20", "X"]
+
+    def test_contig_length(self, schema):
+        assert schema["contig_length"] is None
+
+    def test_filter_id(self, schema):
+        assert schema["filter_id"] == ["PASS", "s50", "q10"]
+
+    def test_variant_contig(self, schema):
+        assert schema["columns"]["variant_contig"] == {
+            "name": "variant_contig",
+            "dtype": "i2",
+            "shape": [9],
+            "chunks": [10000],
+            "dimensions": ["variants"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 0,
+                "blocksize": 0,
+            },
+            "filters": [],
+        }
+
+    def test_call_genotype(self, schema):
+        assert schema["columns"]["call_genotype"] == {
+            "name": "call_genotype",
+            "dtype": "i1",
+            "shape": [9, 3, 2],
+            "chunks": [10000, 1000],
+            "dimensions": ["variants", "samples", "ploidy"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 2,
+                "blocksize": 0,
+            },
+            "filters": [],
+        }
+
+    def test_call_genotype_mask(self, schema):
+        assert schema["columns"]["call_genotype_mask"] == {
+            "name": "call_genotype_mask",
+            "dtype": "bool",
+            "shape": [9, 3, 2],
+            "chunks": [10000, 1000],
+            "dimensions": ["variants", "samples", "ploidy"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 2,
+                "blocksize": 0,
+            },
+            "filters": [{"id": "packbits"}],
+        }
+
+    def test_call_genotype_phased(self, schema):
+        assert schema["columns"]["call_genotype_mask"] == {
+            "name": "call_genotype_mask",
+            "dtype": "bool",
+            "shape": [9, 3, 2],
+            "chunks": [10000, 1000],
+            "dimensions": ["variants", "samples", "ploidy"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 2,
+                "blocksize": 0,
+            },
+            "filters": [{"id": "packbits"}],
+        }
