Skip to content

Commit f2796b1

Browse files
Rename ConversionSpec to VcfZarrSchema
1 parent ac4aaad commit f2796b1

File tree

4 files changed: 25 additions and 21 deletions.

bio2zarr/plink.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import zarr
66
import bed_reader
7+
import numcodecs
78

89
from . import core
910

@@ -82,11 +83,16 @@ def convert(
8283
chunks = [variants_chunk_size, samples_chunk_size]
8384
dimensions = ["variants", "samples"]
8485

86+
# TODO we should be reusing some logic from vcfzarr here on laying
87+
# out the basic dataset, and using the schema generator. Currently
88+
# we're not using the best Blosc settings for genotypes here.
89+
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
90+
8591
a = root.array(
8692
"sample_id",
8793
bed.iid,
8894
dtype="str",
89-
compressor=core.default_compressor,
95+
compressor=default_compressor,
9096
chunks=(samples_chunk_size,),
9197
)
9298
a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -98,7 +104,7 @@ def convert(
98104
"variant_position",
99105
bed.bp_position,
100106
dtype=np.int32,
101-
compressor=core.default_compressor,
107+
compressor=default_compressor,
102108
chunks=(variants_chunk_size,),
103109
)
104110
a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
@@ -109,7 +115,7 @@ def convert(
109115
"variant_allele",
110116
alleles,
111117
dtype="str",
112-
compressor=core.default_compressor,
118+
compressor=default_compressor,
113119
chunks=(variants_chunk_size,),
114120
)
115121
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
@@ -121,7 +127,7 @@ def convert(
121127
dtype="bool",
122128
shape=list(shape),
123129
chunks=list(chunks),
124-
compressor=core.default_compressor,
130+
compressor=default_compressor,
125131
)
126132
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
127133

@@ -132,7 +138,7 @@ def convert(
132138
dtype="i1",
133139
shape=list(shape),
134140
chunks=list(chunks),
135-
compressor=core.default_compressor,
141+
compressor=default_compressor,
136142
)
137143
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
138144

@@ -141,7 +147,7 @@ def convert(
141147
dtype="bool",
142148
shape=list(shape),
143149
chunks=list(chunks),
144-
compressor=core.default_compressor,
150+
compressor=default_compressor,
145151
)
146152
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
147153

bio2zarr/vcf.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,11 +1098,9 @@ def _choose_compressor_settings(self):
10981098

10991099
ZARR_SCHEMA_FORMAT_VERSION = "0.2"
11001100

1101-
# RENAME to ZarrSchema
1102-
11031101

11041102
@dataclasses.dataclass
1105-
class ZarrConversionSpec:
1103+
class VcfZarrSchema:
11061104
format_version: str
11071105
samples_chunk_size: int
11081106
variants_chunk_size: int
@@ -1126,15 +1124,15 @@ def fromdict(d):
11261124
"Zarr schema format version mismatch: "
11271125
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
11281126
)
1129-
ret = ZarrConversionSpec(**d)
1127+
ret = VcfZarrSchema(**d)
11301128
ret.columns = {
11311129
key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
11321130
}
11331131
return ret
11341132

11351133
@staticmethod
11361134
def fromjson(s):
1137-
return ZarrConversionSpec.fromdict(json.loads(s))
1135+
return VcfZarrSchema.fromdict(json.loads(s))
11381136

11391137
@staticmethod
11401138
def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
@@ -1280,7 +1278,7 @@ def fixed_field_spec(
12801278
)
12811279
)
12821280

1283-
return ZarrConversionSpec(
1281+
return VcfZarrSchema(
12841282
format_version=ZARR_SCHEMA_FORMAT_VERSION,
12851283
samples_chunk_size=samples_chunk_size,
12861284
variants_chunk_size=variants_chunk_size,
@@ -1673,7 +1671,7 @@ def service_completed_futures():
16731671

16741672
def mkschema(if_path, out):
16751673
pcvcf = PickleChunkedVcf.load(if_path)
1676-
spec = ZarrConversionSpec.generate(pcvcf)
1674+
spec = VcfZarrSchema.generate(pcvcf)
16771675
out.write(spec.asjson())
16781676

16791677

@@ -1690,7 +1688,7 @@ def encode(
16901688
):
16911689
pcvcf = PickleChunkedVcf.load(if_path)
16921690
if schema_path is None:
1693-
schema = ZarrConversionSpec.generate(
1691+
schema = VcfZarrSchema.generate(
16941692
pcvcf,
16951693
variants_chunk_size=variants_chunk_size,
16961694
samples_chunk_size=samples_chunk_size,
@@ -1700,7 +1698,7 @@ def encode(
17001698
if variants_chunk_size is not None or samples_chunk_size is not None:
17011699
raise ValueError("Cannot specify schema along with chunk sizes")
17021700
with open(schema_path, "r") as f:
1703-
schema = ZarrConversionSpec.fromjson(f.read())
1701+
schema = VcfZarrSchema.fromjson(f.read())
17041702
zarr_path = pathlib.Path(zarr_path)
17051703
if zarr_path.exists():
17061704
logger.warning(f"Deleting existing {zarr_path}")

tests/test_pcvcf.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ def test_mkschema(self, tmp_path, pcvcf):
2727
with open(schema_file, "w") as f:
2828
vcf.mkschema(pcvcf.path, f)
2929
with open(schema_file, "r") as f:
30-
schema1 = vcf.ZarrConversionSpec.fromjson(f.read())
31-
schema2 = vcf.ZarrConversionSpec.generate(pcvcf)
30+
schema1 = vcf.VcfZarrSchema.fromjson(f.read())
31+
schema2 = vcf.VcfZarrSchema.generate(pcvcf)
3232
assert schema1 == schema2
3333

3434
def test_summary_table(self, pcvcf):
@@ -95,7 +95,7 @@ def pcvcf(self, tmp_path_factory):
9595

9696
@pytest.fixture(scope="class")
9797
def schema(self, pcvcf):
98-
return vcf.ZarrConversionSpec.generate(pcvcf)
98+
return vcf.VcfZarrSchema.generate(pcvcf)
9999

100100
@pytest.mark.parametrize(
101101
("name", "dtype", "shape"),
@@ -165,8 +165,8 @@ def test_repr(self, pcvcf):
165165

166166
def test_pos_repr(self, pcvcf):
167167
assert repr(pcvcf["POS"]).startswith(
168-
"PickleChunkedVcfField(name=POS, partition_chunks=[8, 8, 8, 8, 8], path=")
169-
168+
"PickleChunkedVcfField(name=POS, partition_chunks=[8, 8, 8, 8, 8], path="
169+
)
170170

171171
def test_partition_record_index(self, pcvcf):
172172
nt.assert_array_equal(

tests/test_vcf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def test_zarr_schema_mismatch(self, schema, version):
4545
d = dict(schema)
4646
d["format_version"] = version
4747
with pytest.raises(ValueError, match="Zarr schema format version mismatch"):
48-
vcf.ZarrConversionSpec.fromdict(d)
48+
vcf.VcfZarrSchema.fromdict(d)
4949

5050
@pytest.mark.parametrize("version", ["0.0", "1.0", "xxxxx", 0.1])
5151
def test_exploded_metadata_mismatch(self, tmpdir, exploded_path, version):

0 commit comments

Comments
 (0)