Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions bio2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ def convert_local_allele_field_types(fields):
chunks = gt.chunks[:-1]
dimensions = gt.dimensions[:-1]

la = vcz.ZarrArraySpec.new(
la = vcz.ZarrArraySpec(
name="call_LA",
dtype="i1",
shape=gt.shape,
Expand Down Expand Up @@ -1064,14 +1064,20 @@ def fixed_field_spec(
dimensions=("variants",),
chunks=None,
):
return vcz.ZarrArraySpec.new(
compressor = (
vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
if dtype == "bool"
else None
)
return vcz.ZarrArraySpec(
source=source,
name=name,
dtype=dtype,
shape=shape,
description="",
dimensions=dimensions,
chunks=chunks or [schema_instance.variants_chunk_size],
compressor=compressor,
)

alt_field = self.fields["ALT"]
Expand Down Expand Up @@ -1135,7 +1141,7 @@ def fixed_field_spec(
]
dimensions = ["variants", "samples"]
array_specs.append(
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
name="call_genotype_phased",
dtype="bool",
shape=list(shape),
Expand All @@ -1148,23 +1154,25 @@ def fixed_field_spec(
chunks += [ploidy]
dimensions += ["ploidy"]
array_specs.append(
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
name="call_genotype",
dtype=gt_field.smallest_dtype(),
shape=list(shape),
chunks=list(chunks),
dimensions=list(dimensions),
description="",
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
)
)
array_specs.append(
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
name="call_genotype_mask",
dtype="bool",
shape=list(shape),
chunks=list(chunks),
dimensions=list(dimensions),
description="",
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
)
)

Expand Down
13 changes: 8 additions & 5 deletions bio2zarr/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def generate_schema(
)

array_specs = [
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
source="position",
name="variant_position",
dtype="i4",
Expand All @@ -91,15 +91,15 @@ def generate_schema(
chunks=[schema_instance.variants_chunk_size],
description=None,
),
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
name="variant_allele",
dtype="O",
shape=[m, 2],
dimensions=["variants", "alleles"],
chunks=[schema_instance.variants_chunk_size, 2],
description=None,
),
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
name="call_genotype_phased",
dtype="bool",
shape=[m, n],
Expand All @@ -109,8 +109,9 @@ def generate_schema(
schema_instance.samples_chunk_size,
],
description=None,
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
),
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
name="call_genotype",
dtype="i1",
shape=[m, n, 2],
Expand All @@ -121,8 +122,9 @@ def generate_schema(
2,
],
description=None,
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
),
vcz.ZarrArraySpec.new(
vcz.ZarrArraySpec(
name="call_genotype_mask",
dtype="bool",
shape=[m, n, 2],
Expand All @@ -133,6 +135,7 @@ def generate_schema(
2,
],
description=None,
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
),
]
schema_instance.fields = array_specs
Expand Down
76 changes: 37 additions & 39 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@

ZARR_SCHEMA_FORMAT_VERSION = "0.5"
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs.Blosc(
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
)
DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs.Blosc(
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
)

_fixed_field_descriptions = {
"variant_contig": "An identifier from the reference genome or an angle-bracketed ID"
Expand Down Expand Up @@ -93,8 +99,8 @@ class ZarrArraySpec:
chunks: tuple
dimensions: tuple
description: str
compressor: dict
filters: list
compressor: dict = None
filters: list = None
source: str = None

def __post_init__(self):
Expand All @@ -105,15 +111,7 @@ def __post_init__(self):
self.shape = tuple(self.shape)
self.chunks = tuple(self.chunks)
self.dimensions = tuple(self.dimensions)
self.filters = tuple(self.filters)

@staticmethod
def new(**kwargs):
spec = ZarrArraySpec(
**kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
)
spec._choose_compressor_settings()
return spec
self.filters = tuple(self.filters) if self.filters is not None else None

@staticmethod
def from_field(
Expand All @@ -124,6 +122,8 @@ def from_field(
variants_chunk_size,
samples_chunk_size,
array_name=None,
compressor=None,
filters=None,
):
shape = [num_variants]
prefix = "variant_"
Expand All @@ -150,39 +150,18 @@ def from_field(
dimensions.append("genotypes")
else:
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
return ZarrArraySpec.new(
return ZarrArraySpec(
source=vcf_field.full_name,
name=array_name,
dtype=vcf_field.smallest_dtype(),
shape=shape,
chunks=chunks,
dimensions=dimensions,
description=vcf_field.description,
compressor=compressor,
filters=filters,
)

def _choose_compressor_settings(self):
"""
Choose compressor and filter settings based on the size and
type of the array, plus some hueristics from observed properties
of VCFs.

See https://github.com/pystatgen/bio2zarr/discussions/74
"""
# Default is to not shuffle, because autoshuffle isn't recognised
# by many Zarr implementations, and shuffling can lead to worse
# performance in some cases anyway. Turning on shuffle should be a
# deliberate choice.
shuffle = numcodecs.Blosc.NOSHUFFLE
if self.name == "call_genotype" and self.dtype == "i1":
# call_genotype gets BITSHUFFLE by default as it gets
# significantly better compression (at a cost of slower
# decoding)
shuffle = numcodecs.Blosc.BITSHUFFLE
elif self.dtype == "bool":
shuffle = numcodecs.Blosc.BITSHUFFLE

self.compressor["shuffle"] = shuffle

@property
def chunk_nbytes(self):
"""
Expand Down Expand Up @@ -240,16 +219,24 @@ class VcfZarrSchema(core.JsonDataclass):
samples_chunk_size: int
variants_chunk_size: int
fields: list
defaults: dict

def __init__(
self,
format_version: str,
fields: list,
variants_chunk_size: int = None,
samples_chunk_size: int = None,
defaults: dict = None,
):
self.format_version = format_version
self.fields = fields
defaults = defaults.copy() if defaults is not None else {}
if defaults.get("compressor", None) is None:
defaults["compressor"] = DEFAULT_ZARR_COMPRESSOR.get_config()
if defaults.get("filters", None) is None:
defaults["filters"] = []
self.defaults = defaults
if variants_chunk_size is None:
variants_chunk_size = 1000
self.variants_chunk_size = variants_chunk_size
Expand Down Expand Up @@ -533,7 +520,7 @@ def init(

total_chunks = 0
for field in self.schema.fields:
a = self.init_array(root, field, partitions[-1].stop)
a = self.init_array(root, self.metadata.schema, field, partitions[-1].stop)
total_chunks += a.nchunks

logger.info("Writing WIP metadata")
Expand Down Expand Up @@ -600,9 +587,20 @@ def encode_filters(self, root):
)
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]

def init_array(self, root, array_spec, variants_dim_size):
def init_array(self, root, schema, array_spec, variants_dim_size):
kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
filters = (
array_spec.filters
if array_spec.filters is not None
else schema.defaults["filters"]
)
filters = [numcodecs.get_codec(filt) for filt in filters]
compressor = (
array_spec.compressor
if array_spec.compressor is not None
else schema.defaults["compressor"]
)
compressor = numcodecs.get_codec(compressor)
if array_spec.dtype == "O":
if zarr_utils.zarr_v3():
filters = [*list(filters), numcodecs.VLenUTF8()]
Expand All @@ -620,7 +618,7 @@ def init_array(self, root, array_spec, variants_dim_size):
shape=shape,
chunks=array_spec.chunks,
dtype=array_spec.dtype,
compressor=numcodecs.get_codec(array_spec.compressor),
compressor=compressor,
filters=filters,
**kwargs,
)
Expand Down
Loading