Skip to content

Commit dfb3ed4

Browse files
Merge pull request #129 from jeromekelleher/noshuffle-default
Some refactoring of schema generation
2 parents 98478fe + 2e5189f commit dfb3ed4

File tree

6 files changed

+179
-69
lines changed

6 files changed

+179
-69
lines changed

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ jobs:
1414
- uses: actions/setup-python@v4
1515
with:
1616
python-version: '3.11'
17-
- uses: pre-commit/action@v3
17+
- uses: pre-commit/action@v3.0.1

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# 0.0.6 2024-04-xx
2+
3+
- Only use NOSHUFFLE by default on ``call_genotype`` and bool arrays.
4+
15
# 0.0.5 2024-04-17
26

37
- Fix bug in schema handling (compressor settings ignored)

bio2zarr/core.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,16 @@
1616
numcodecs.blosc.use_threads = False
1717

1818

19+
def min_int_dtype(min_value, max_value):
20+
if min_value > max_value:
21+
raise ValueError("min_value must be <= max_value")
22+
for a_dtype in ["i1", "i2", "i4", "i8"]:
23+
info = np.iinfo(a_dtype)
24+
if info.min <= min_value and max_value <= info.max:
25+
return a_dtype
26+
raise OverflowError("Integer cannot be represented")
27+
28+
1929
def chunk_aligned_slices(z, n, max_chunks=None):
2030
"""
2131
Returns at n slices in the specified zarr array, aligned

bio2zarr/vcf.py

Lines changed: 95 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,6 @@ def full_name(self):
111111
return self.name
112112
return f"{self.category}/{self.name}"
113113

114-
# TODO add method here to choose a good set compressor and
115-
# filters default here for this field.
116-
117114
def smallest_dtype(self):
118115
"""
119116
Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ def smallest_dtype(self):
123120
if self.vcf_type == "Float":
124121
ret = "f4"
125122
elif self.vcf_type == "Integer":
126-
dtype = "i4"
127-
for a_dtype in ["i1", "i2"]:
128-
info = np.iinfo(a_dtype)
129-
if info.min <= s.min_value and s.max_value <= info.max:
130-
dtype = a_dtype
131-
break
132-
ret = dtype
123+
if not math.isfinite(s.max_value):
124+
# All missing values; use i1. Note we should have some API to
125+
# check more explicitly for missingness:
126+
# https://github.com/sgkit-dev/bio2zarr/issues/131
127+
ret = "i1"
128+
else:
129+
ret = core.min_int_dtype(s.min_value, s.max_value)
133130
elif self.vcf_type == "Flag":
134131
ret = "bool"
135132
elif self.vcf_type == "Character":
@@ -152,6 +149,10 @@ class VcfPartition:
152149
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
153150
)
154151

152+
# TODO refactor this to have embedded Contig dataclass, Filters
153+
# and Samples dataclasses to allow for more information to be
154+
# retained and forward compatibility.
155+
155156

156157
@dataclasses.dataclass
157158
class IcfMetadata:
@@ -183,6 +184,14 @@ def format_fields(self):
183184
fields.append(field)
184185
return fields
185186

187+
@property
188+
def num_contigs(self):
189+
return len(self.contig_names)
190+
191+
@property
192+
def num_filters(self):
193+
return len(self.filters)
194+
186195
@property
187196
def num_records(self):
188197
return sum(self.contig_record_counts.values())
@@ -1242,6 +1251,50 @@ def new(**kwargs):
12421251
spec._choose_compressor_settings()
12431252
return spec
12441253

1254+
@staticmethod
1255+
def from_field(
1256+
vcf_field,
1257+
*,
1258+
num_variants,
1259+
num_samples,
1260+
variants_chunk_size,
1261+
samples_chunk_size,
1262+
variable_name=None,
1263+
):
1264+
shape = [num_variants]
1265+
prefix = "variant_"
1266+
dimensions = ["variants"]
1267+
chunks = [variants_chunk_size]
1268+
if vcf_field.category == "FORMAT":
1269+
prefix = "call_"
1270+
shape.append(num_samples)
1271+
chunks.append(samples_chunk_size)
1272+
dimensions.append("samples")
1273+
if variable_name is None:
1274+
variable_name = prefix + vcf_field.name
1275+
# TODO make an option to add in the empty extra dimension
1276+
if vcf_field.summary.max_number > 1:
1277+
shape.append(vcf_field.summary.max_number)
1278+
# TODO we should really be checking this to see if the named dimensions
1279+
# are actually correct.
1280+
if vcf_field.vcf_number == "R":
1281+
dimensions.append("alleles")
1282+
elif vcf_field.vcf_number == "A":
1283+
dimensions.append("alt_alleles")
1284+
elif vcf_field.vcf_number == "G":
1285+
dimensions.append("genotypes")
1286+
else:
1287+
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
1288+
return ZarrColumnSpec.new(
1289+
vcf_field=vcf_field.full_name,
1290+
name=variable_name,
1291+
dtype=vcf_field.smallest_dtype(),
1292+
shape=shape,
1293+
chunks=chunks,
1294+
dimensions=dimensions,
1295+
description=vcf_field.description,
1296+
)
1297+
12451298
def _choose_compressor_settings(self):
12461299
"""
12471300
Choose compressor and filter settings based on the size and
@@ -1250,15 +1303,19 @@ def _choose_compressor_settings(self):
12501303
12511304
See https://github.com/pystatgen/bio2zarr/discussions/74
12521305
"""
1253-
dt = np.dtype(self.dtype)
12541306
# Default is to not shuffle, because autoshuffle isn't recognised
12551307
# by many Zarr implementations, and shuffling can lead to worse
12561308
# performance in some cases anyway. Turning on shuffle should be a
12571309
# deliberate choice.
12581310
shuffle = numcodecs.Blosc.NOSHUFFLE
1259-
if dt.itemsize == 1:
1260-
# Any 1 byte field gets BITSHUFFLE by default
1311+
if self.name == "call_genotype" and self.dtype == "i1":
1312+
# call_genotype gets BITSHUFFLE by default as it gets
1313+
# significantly better compression (at a cost of slower
1314+
# decoding)
1315+
shuffle = numcodecs.Blosc.BITSHUFFLE
1316+
elif self.dtype == "bool":
12611317
shuffle = numcodecs.Blosc.BITSHUFFLE
1318+
12621319
self.compressor["shuffle"] = shuffle
12631320

12641321

@@ -1313,6 +1370,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
13131370
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
13141371
)
13151372

1373+
def spec_from_field(field, variable_name=None):
1374+
return ZarrColumnSpec.from_field(
1375+
field,
1376+
num_samples=n,
1377+
num_variants=m,
1378+
samples_chunk_size=samples_chunk_size,
1379+
variants_chunk_size=variants_chunk_size,
1380+
variable_name=variable_name,
1381+
)
1382+
13161383
def fixed_field_spec(
13171384
name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
13181385
):
@@ -1328,95 +1395,56 @@ def fixed_field_spec(
13281395

13291396
alt_col = icf.columns["ALT"]
13301397
max_alleles = alt_col.vcf_field.summary.max_number + 1
1331-
num_filters = len(icf.metadata.filters)
13321398

1333-
# # FIXME get dtype from lookup table
13341399
colspecs = [
13351400
fixed_field_spec(
13361401
name="variant_contig",
1337-
dtype="i2", # FIXME
1402+
dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
13381403
),
13391404
fixed_field_spec(
13401405
name="variant_filter",
13411406
dtype="bool",
1342-
shape=(m, num_filters),
1407+
shape=(m, icf.metadata.num_filters),
13431408
dimensions=["variants", "filters"],
13441409
),
13451410
fixed_field_spec(
13461411
name="variant_allele",
13471412
dtype="str",
1348-
shape=[m, max_alleles],
1413+
shape=(m, max_alleles),
13491414
dimensions=["variants", "alleles"],
13501415
),
13511416
fixed_field_spec(
1352-
vcf_field="POS",
1353-
name="variant_position",
1354-
dtype="i4",
1355-
),
1356-
fixed_field_spec(
1357-
vcf_field=None,
13581417
name="variant_id",
13591418
dtype="str",
13601419
),
13611420
fixed_field_spec(
1362-
vcf_field=None,
13631421
name="variant_id_mask",
13641422
dtype="bool",
13651423
),
1366-
fixed_field_spec(
1367-
vcf_field="QUAL",
1368-
name="variant_quality",
1369-
dtype="f4",
1370-
),
13711424
]
1425+
name_map = {field.full_name: field for field in icf.metadata.fields}
1426+
1427+
# Only two of the fixed fields have a direct one-to-one mapping.
1428+
colspecs.extend(
1429+
[
1430+
spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
1431+
spec_from_field(name_map["POS"], variable_name="variant_position"),
1432+
]
1433+
)
1434+
colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])
13721435

13731436
gt_field = None
1374-
for field in icf.metadata.fields:
1375-
if field.category == "fixed":
1376-
continue
1437+
for field in icf.metadata.format_fields:
13771438
if field.name == "GT":
13781439
gt_field = field
13791440
continue
1380-
shape = [m]
1381-
prefix = "variant_"
1382-
dimensions = ["variants"]
1383-
chunks = [variants_chunk_size]
1384-
if field.category == "FORMAT":
1385-
prefix = "call_"
1386-
shape.append(n)
1387-
chunks.append(samples_chunk_size)
1388-
dimensions.append("samples")
1389-
# TODO make an option to add in the empty extra dimension
1390-
if field.summary.max_number > 1:
1391-
shape.append(field.summary.max_number)
1392-
# TODO we should really be checking this to see if the named dimensions
1393-
# are actually correct.
1394-
if field.vcf_number == "R":
1395-
dimensions.append("alleles")
1396-
elif field.vcf_number == "A":
1397-
dimensions.append("alt_alleles")
1398-
elif field.vcf_number == "G":
1399-
dimensions.append("genotypes")
1400-
else:
1401-
dimensions.append(f"{field.category}_{field.name}_dim")
1402-
variable_name = prefix + field.name
1403-
colspec = ZarrColumnSpec.new(
1404-
vcf_field=field.full_name,
1405-
name=variable_name,
1406-
dtype=field.smallest_dtype(),
1407-
shape=shape,
1408-
chunks=chunks,
1409-
dimensions=dimensions,
1410-
description=field.description,
1411-
)
1412-
colspecs.append(colspec)
1441+
colspecs.append(spec_from_field(field))
14131442

14141443
if gt_field is not None:
14151444
ploidy = gt_field.summary.max_number - 1
14161445
shape = [m, n]
14171446
chunks = [variants_chunk_size, samples_chunk_size]
14181447
dimensions = ["variants", "samples"]
1419-
14201448
colspecs.append(
14211449
ZarrColumnSpec.new(
14221450
vcf_field=None,

tests/test_core.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,55 @@
55
from bio2zarr import core
66

77

8+
class TestMinIntDtype:
9+
@pytest.mark.parametrize(
10+
("min_value", "max_value", "dtype"),
11+
[
12+
(0, 1, "i1"),
13+
(0, 0, "i1"),
14+
(0, 127, "i1"),
15+
(127, 128, "i2"),
16+
(-127, 0, "i1"),
17+
(-127, -126, "i1"),
18+
(0, 2**15 - 1, "i2"),
19+
(-(2**15), 2**15 - 1, "i2"),
20+
(0, 2**15, "i4"),
21+
(-(2**15), 2**15, "i4"),
22+
(0, 2**31 - 1, "i4"),
23+
(-(2**31), 2**31 - 1, "i4"),
24+
(2**31 - 1, 2**31 - 1, "i4"),
25+
(0, 2**31, "i8"),
26+
(0, 2**32, "i8"),
27+
],
28+
)
29+
def test_values(self, min_value, max_value, dtype):
30+
assert core.min_int_dtype(min_value, max_value) == dtype
31+
32+
@pytest.mark.parametrize(
33+
("min_value", "max_value"),
34+
[
35+
(0, 2**63),
36+
(-(2**63) - 1, 0),
37+
(0, 2**65),
38+
],
39+
)
40+
def test_overflow(self, min_value, max_value):
41+
with pytest.raises(OverflowError, match="Integer cannot"):
42+
core.min_int_dtype(min_value, max_value)
43+
44+
@pytest.mark.parametrize(
45+
("min_value", "max_value"),
46+
[
47+
(1, 0),
48+
(-1, -2),
49+
(2**31, 2**31 - 1),
50+
],
51+
)
52+
def test_bad_min_max(self, min_value, max_value):
53+
with pytest.raises(ValueError, match="must be <="):
54+
core.min_int_dtype(min_value, max_value)
55+
56+
857
class TestParallelWorkManager:
958
@pytest.mark.parametrize("total", [1, 10, 2**63])
1059
@pytest.mark.parametrize("workers", [0, 1])

tests/test_vcf.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def test_filter_id(self, schema):
225225
def test_variant_contig(self, schema):
226226
assert schema["columns"]["variant_contig"] == {
227227
"name": "variant_contig",
228-
"dtype": "i2",
228+
"dtype": "i1",
229229
"shape": [9],
230230
"chunks": [10000],
231231
"dimensions": ["variants"],
@@ -298,6 +298,25 @@ def test_call_genotype_phased(self, schema):
298298
"filters": [],
299299
}
300300

301+
def test_call_GQ(self, schema):
302+
assert schema["columns"]["call_GQ"] == {
303+
"name": "call_GQ",
304+
"dtype": "i1",
305+
"shape": [9, 3],
306+
"chunks": [10000, 1000],
307+
"dimensions": ["variants", "samples"],
308+
"description": "Genotype Quality",
309+
"vcf_field": "FORMAT/GQ",
310+
"compressor": {
311+
"id": "blosc",
312+
"cname": "zstd",
313+
"clevel": 7,
314+
"shuffle": 0,
315+
"blocksize": 0,
316+
},
317+
"filters": [],
318+
}
319+
301320

302321
@pytest.mark.parametrize(
303322
"regions",

0 commit comments

Comments
 (0)