Skip to content

Commit 0aef4e0

Browse files
Move verification code to own file.
1 parent bdebd92 commit 0aef4e0

File tree

5 files changed

+278
-275
lines changed

5 files changed

+278
-275
lines changed

bio2zarr/constants.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import numpy as np
2+
3+
INT_MISSING = -1
4+
INT_FILL = -2
5+
STR_MISSING = "."
6+
STR_FILL = ""
7+
8+
FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
9+
np.float32
10+
)
11+
FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
12+
[0x7F800001, 0x7F800002], dtype=np.int32
13+
)
14+
15+
16+
MIN_INT_VALUE = np.iinfo(np.int32).min + 2
17+
VCF_INT_MISSING = np.iinfo(np.int32).min
18+
VCF_INT_FILL = np.iinfo(np.int32).min + 1

bio2zarr/vcf.py

Lines changed: 20 additions & 266 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,15 @@
1313
import tempfile
1414
from typing import Any
1515

16-
import cyvcf2
1716
import humanfriendly
1817
import numcodecs
1918
import numpy as np
20-
import numpy.testing as nt
21-
import tqdm
2219
import zarr
2320

24-
from . import core, provenance, vcf_utils
21+
from . import constants, core, provenance, vcf_utils
2522

2623
logger = logging.getLogger(__name__)
2724

28-
INT_MISSING = -1
29-
INT_FILL = -2
30-
STR_MISSING = "."
31-
STR_FILL = ""
32-
33-
FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
34-
np.float32
35-
)
36-
FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
37-
[0x7F800001, 0x7F800002], dtype=np.int32
38-
)
39-
4025

4126
def display_number(x):
4227
ret = "n/a"
@@ -377,15 +362,15 @@ def sanitise_value_bool(buff, j, value):
377362
def sanitise_value_float_scalar(buff, j, value):
378363
x = value
379364
if value is None:
380-
x = [FLOAT32_MISSING]
365+
x = [constants.FLOAT32_MISSING]
381366
buff[j] = x[0]
382367

383368

384369
def sanitise_value_int_scalar(buff, j, value):
385370
x = value
386371
if value is None:
387372
# print("MISSING", INT_MISSING, INT_FILL)
388-
x = [INT_MISSING]
373+
x = [constants.INT_MISSING]
389374
else:
390375
x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
391376
buff[j] = x[0]
@@ -436,33 +421,35 @@ def drop_empty_second_dim(value):
436421

437422
def sanitise_value_float_1d(buff, j, value):
438423
if value is None:
439-
buff[j] = FLOAT32_MISSING
424+
buff[j] = constants.FLOAT32_MISSING
440425
else:
441426
value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
442427
# numpy will map None values to Nan, but we need a
443428
# specific NaN
444-
value[np.isnan(value)] = FLOAT32_MISSING
429+
value[np.isnan(value)] = constants.FLOAT32_MISSING
445430
value = drop_empty_second_dim(value)
446-
buff[j] = FLOAT32_FILL
431+
buff[j] = constants.FLOAT32_FILL
447432
buff[j, : value.shape[0]] = value
448433

449434

450435
def sanitise_value_float_2d(buff, j, value):
451436
if value is None:
452-
buff[j] = FLOAT32_MISSING
437+
buff[j] = constants.FLOAT32_MISSING
453438
else:
454439
# print("value = ", value)
455440
value = np.array(value, ndmin=2, dtype=buff.dtype, copy=False)
456-
buff[j] = FLOAT32_FILL
441+
buff[j] = constants.FLOAT32_FILL
457442
buff[j, :, : value.shape[1]] = value
458443

459444

460445
def sanitise_int_array(value, ndmin, dtype):
461446
if isinstance(value, tuple):
462-
value = [VCF_INT_MISSING if x is None else x for x in value] # NEEDS TEST
447+
value = [
448+
constants.VCF_INT_MISSING if x is None else x for x in value
449+
] # NEEDS TEST
463450
value = np.array(value, ndmin=ndmin, copy=False)
464-
value[value == VCF_INT_MISSING] = -1
465-
value[value == VCF_INT_FILL] = -2
451+
value[value == constants.VCF_INT_MISSING] = -1
452+
value[value == constants.VCF_INT_FILL] = -2
466453
# TODO watch out for clipping here!
467454
return value.astype(dtype)
468455

@@ -486,15 +473,11 @@ def sanitise_value_int_2d(buff, j, value):
486473
buff[j, :, : value.shape[1]] = value
487474

488475

489-
MIN_INT_VALUE = np.iinfo(np.int32).min + 2
490-
VCF_INT_MISSING = np.iinfo(np.int32).min
491-
VCF_INT_FILL = np.iinfo(np.int32).min + 1
492-
493476
missing_value_map = {
494-
"Integer": -1,
495-
"Float": FLOAT32_MISSING,
496-
"String": ".",
497-
"Character": ".",
477+
"Integer": constants.INT_MISSING,
478+
"Float": constants.FLOAT32_MISSING,
479+
"String": constants.STR_MISSING,
480+
"Character": constants.STR_MISSING,
498481
"Flag": False,
499482
}
500483

@@ -538,17 +521,12 @@ def transform_and_update_bounds(self, vcf_value):
538521
return value
539522

540523

541-
MIN_INT_VALUE = np.iinfo(np.int32).min + 2
542-
VCF_INT_MISSING = np.iinfo(np.int32).min
543-
VCF_INT_FILL = np.iinfo(np.int32).min + 1
544-
545-
546524
class IntegerValueTransformer(VcfValueTransformer):
547525
def update_bounds(self, value):
548526
summary = self.field.summary
549527
# Mask out missing and fill values
550528
# print(value)
551-
a = value[value >= MIN_INT_VALUE]
529+
a = value[value >= constants.MIN_INT_VALUE]
552530
if a.size > 0:
553531
summary.max_value = int(max(summary.max_value, np.max(a)))
554532
summary.min_value = int(min(summary.min_value, np.min(a)))
@@ -1935,7 +1913,7 @@ def encode_alleles_partition(self, partition_index):
19351913
alt_col.iter_values(partition.start, partition.stop),
19361914
):
19371915
j = alleles.next_buffer_row()
1938-
alleles.buff[j, :] = STR_FILL
1916+
alleles.buff[j, :] = constants.STR_FILL
19391917
alleles.buff[j, 0] = ref[0]
19401918
alleles.buff[j, 1 : 1 + len(alt)] = alt
19411919
alleles.flush()
@@ -1958,7 +1936,7 @@ def encode_id_partition(self, partition_index):
19581936
vid.buff[j] = value[0]
19591937
vid_mask.buff[j] = False
19601938
else:
1961-
vid.buff[j] = STR_MISSING
1939+
vid.buff[j] = constants.STR_MISSING
19621940
vid_mask.buff[j] = True
19631941
vid.flush()
19641942
vid_mask.flush()
@@ -2253,227 +2231,3 @@ def convert(
22532231
worker_processes=worker_processes,
22542232
show_progress=show_progress,
22552233
)
2256-
2257-
2258-
def assert_all_missing_float(a):
2259-
v = np.array(a, dtype=np.float32).view(np.int32)
2260-
nt.assert_equal(v, FLOAT32_MISSING_AS_INT32)
2261-
2262-
2263-
def assert_all_fill_float(a):
2264-
v = np.array(a, dtype=np.float32).view(np.int32)
2265-
nt.assert_equal(v, FLOAT32_FILL_AS_INT32)
2266-
2267-
2268-
def assert_all_missing_int(a):
2269-
v = np.array(a, dtype=int)
2270-
nt.assert_equal(v, -1)
2271-
2272-
2273-
def assert_all_fill_int(a):
2274-
v = np.array(a, dtype=int)
2275-
nt.assert_equal(v, -2)
2276-
2277-
2278-
def assert_all_missing_string(a):
2279-
nt.assert_equal(a, ".")
2280-
2281-
2282-
def assert_all_fill_string(a):
2283-
nt.assert_equal(a, "")
2284-
2285-
2286-
def assert_all_fill(zarr_val, vcf_type):
2287-
if vcf_type == "Integer":
2288-
assert_all_fill_int(zarr_val)
2289-
elif vcf_type in ("String", "Character"):
2290-
assert_all_fill_string(zarr_val)
2291-
elif vcf_type == "Float":
2292-
assert_all_fill_float(zarr_val)
2293-
else: # pragma: no cover
2294-
assert False # noqa PT015
2295-
2296-
2297-
def assert_all_missing(zarr_val, vcf_type):
2298-
if vcf_type == "Integer":
2299-
assert_all_missing_int(zarr_val)
2300-
elif vcf_type in ("String", "Character"):
2301-
assert_all_missing_string(zarr_val)
2302-
elif vcf_type == "Flag":
2303-
assert zarr_val == False # noqa 712
2304-
elif vcf_type == "Float":
2305-
assert_all_missing_float(zarr_val)
2306-
else: # pragma: no cover
2307-
assert False # noqa PT015
2308-
2309-
2310-
def assert_info_val_missing(zarr_val, vcf_type):
2311-
assert_all_missing(zarr_val, vcf_type)
2312-
2313-
2314-
def assert_format_val_missing(zarr_val, vcf_type):
2315-
assert_info_val_missing(zarr_val, vcf_type)
2316-
2317-
2318-
# Note: checking exact equality may prove problematic here
2319-
# but we should be deterministically storing what cyvcf2
2320-
# provides, which should compare equal.
2321-
2322-
2323-
def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
2324-
assert vcf_val is not None
2325-
if vcf_type in ("String", "Character"):
2326-
split = list(vcf_val.split(","))
2327-
k = len(split)
2328-
if isinstance(zarr_val, str):
2329-
assert k == 1
2330-
# Scalar
2331-
assert vcf_val == zarr_val
2332-
else:
2333-
nt.assert_equal(split, zarr_val[:k])
2334-
assert_all_fill(zarr_val[k:], vcf_type)
2335-
2336-
elif isinstance(vcf_val, tuple):
2337-
vcf_missing_value_map = {
2338-
"Integer": -1,
2339-
"Float": FLOAT32_MISSING,
2340-
}
2341-
v = [vcf_missing_value_map[vcf_type] if x is None else x for x in vcf_val]
2342-
missing = np.array([j for j, x in enumerate(vcf_val) if x is None], dtype=int)
2343-
a = np.array(v)
2344-
k = len(a)
2345-
# We are checking for int missing twice here, but it's necessary to have
2346-
# a separate check for floats because different NaNs compare equal
2347-
nt.assert_equal(a, zarr_val[:k])
2348-
assert_all_missing(zarr_val[missing], vcf_type)
2349-
if k < len(zarr_val):
2350-
assert_all_fill(zarr_val[k:], vcf_type)
2351-
else:
2352-
# Scalar
2353-
zarr_val = np.array(zarr_val, ndmin=1)
2354-
assert len(zarr_val.shape) == 1
2355-
assert vcf_val == zarr_val[0]
2356-
if len(zarr_val) > 1:
2357-
assert_all_fill(zarr_val[1:], vcf_type)
2358-
2359-
2360-
def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
2361-
assert vcf_val is not None
2362-
assert isinstance(vcf_val, np.ndarray)
2363-
if vcf_type in ("String", "Character"):
2364-
assert len(vcf_val) == len(zarr_val)
2365-
for v, z in zip(vcf_val, zarr_val):
2366-
split = list(v.split(","))
2367-
# Note: deliberately duplicating logic here between this and the
2368-
# INFO col above to make sure all combinations are covered by tests
2369-
k = len(split)
2370-
if k == 1:
2371-
assert v == z
2372-
else:
2373-
nt.assert_equal(split, z[:k])
2374-
assert_all_fill(z[k:], vcf_type)
2375-
else:
2376-
assert vcf_val.shape[0] == zarr_val.shape[0]
2377-
if len(vcf_val.shape) == len(zarr_val.shape) + 1:
2378-
assert vcf_val.shape[-1] == 1
2379-
vcf_val = vcf_val[..., 0]
2380-
assert len(vcf_val.shape) <= 2
2381-
assert len(vcf_val.shape) == len(zarr_val.shape)
2382-
if len(vcf_val.shape) == 2:
2383-
k = vcf_val.shape[1]
2384-
if zarr_val.shape[1] != k:
2385-
assert_all_fill(zarr_val[:, k:], vcf_type)
2386-
zarr_val = zarr_val[:, :k]
2387-
assert vcf_val.shape == zarr_val.shape
2388-
if vcf_type == "Integer":
2389-
vcf_val[vcf_val == VCF_INT_MISSING] = INT_MISSING
2390-
vcf_val[vcf_val == VCF_INT_FILL] = INT_FILL
2391-
elif vcf_type == "Float":
2392-
nt.assert_equal(vcf_val.view(np.int32), zarr_val.view(np.int32))
2393-
2394-
nt.assert_equal(vcf_val, zarr_val)
2395-
2396-
2397-
# TODO rename to "verify"
2398-
def validate(vcf_path, zarr_path, show_progress=False):
2399-
store = zarr.DirectoryStore(zarr_path)
2400-
2401-
root = zarr.group(store=store)
2402-
pos = root["variant_position"][:]
2403-
allele = root["variant_allele"][:]
2404-
chrom = root["contig_id"][:][root["variant_contig"][:]]
2405-
vid = root["variant_id"][:]
2406-
call_genotype = None
2407-
if "call_genotype" in root:
2408-
call_genotype = iter(root["call_genotype"])
2409-
2410-
vcf = cyvcf2.VCF(vcf_path)
2411-
format_headers = {}
2412-
info_headers = {}
2413-
for h in vcf.header_iter():
2414-
if h["HeaderType"] == "FORMAT":
2415-
format_headers[h["ID"]] = h
2416-
if h["HeaderType"] == "INFO":
2417-
info_headers[h["ID"]] = h
2418-
2419-
format_fields = {}
2420-
info_fields = {}
2421-
for colname in root.keys():
2422-
if colname.startswith("call") and not colname.startswith("call_genotype"):
2423-
vcf_name = colname.split("_", 1)[1]
2424-
vcf_type = format_headers[vcf_name]["Type"]
2425-
format_fields[vcf_name] = vcf_type, iter(root[colname])
2426-
if colname.startswith("variant"):
2427-
name = colname.split("_", 1)[1]
2428-
if name.isupper():
2429-
vcf_type = info_headers[name]["Type"]
2430-
info_fields[name] = vcf_type, iter(root[colname])
2431-
2432-
first_pos = next(vcf).POS
2433-
start_index = np.searchsorted(pos, first_pos)
2434-
assert pos[start_index] == first_pos
2435-
vcf = cyvcf2.VCF(vcf_path)
2436-
if show_progress:
2437-
iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2438-
else:
2439-
iterator = vcf
2440-
for j, row in enumerate(iterator, start_index):
2441-
assert chrom[j] == row.CHROM
2442-
assert pos[j] == row.POS
2443-
assert vid[j] == ("." if row.ID is None else row.ID)
2444-
assert allele[j, 0] == row.REF
2445-
k = len(row.ALT)
2446-
nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
2447-
assert np.all(allele[j, k + 1 :] == "")
2448-
# TODO FILTERS
2449-
2450-
if call_genotype is None:
2451-
val = None
2452-
try:
2453-
val = row.format("GT")
2454-
except KeyError:
2455-
pass
2456-
assert val is None
2457-
else:
2458-
gt = row.genotype.array()
2459-
gt_zarr = next(call_genotype)
2460-
gt_vcf = gt[:, :-1]
2461-
# NOTE cyvcf2 remaps genotypes automatically
2462-
# into the same missing/pad encoding that sgkit uses.
2463-
nt.assert_array_equal(gt_zarr, gt_vcf)
2464-
2465-
for name, (vcf_type, zarr_iter) in info_fields.items():
2466-
vcf_val = row.INFO.get(name, None)
2467-
zarr_val = next(zarr_iter)
2468-
if vcf_val is None:
2469-
assert_info_val_missing(zarr_val, vcf_type)
2470-
else:
2471-
assert_info_val_equal(vcf_val, zarr_val, vcf_type)
2472-
2473-
for name, (vcf_type, zarr_iter) in format_fields.items():
2474-
vcf_val = row.format(name)
2475-
zarr_val = next(zarr_iter)
2476-
if vcf_val is None:
2477-
assert_format_val_missing(zarr_val, vcf_type)
2478-
else:
2479-
assert_format_val_equal(vcf_val, zarr_val, vcf_type)

0 commit comments

Comments
 (0)