13
13
import tempfile
14
14
from typing import Any
15
15
16
- import cyvcf2
17
16
import humanfriendly
18
17
import numcodecs
19
18
import numpy as np
20
- import numpy .testing as nt
21
- import tqdm
22
19
import zarr
23
20
24
- from . import core , provenance , vcf_utils
21
+ from . import constants , core , provenance , vcf_utils
25
22
26
23
logger = logging .getLogger (__name__ )
27
24
28
- INT_MISSING = - 1
29
- INT_FILL = - 2
30
- STR_MISSING = "."
31
- STR_FILL = ""
32
-
33
- FLOAT32_MISSING , FLOAT32_FILL = np .array ([0x7F800001 , 0x7F800002 ], dtype = np .int32 ).view (
34
- np .float32
35
- )
36
- FLOAT32_MISSING_AS_INT32 , FLOAT32_FILL_AS_INT32 = np .array (
37
- [0x7F800001 , 0x7F800002 ], dtype = np .int32
38
- )
39
-
40
25
41
26
def display_number (x ):
42
27
ret = "n/a"
@@ -377,15 +362,15 @@ def sanitise_value_bool(buff, j, value):
377
362
def sanitise_value_float_scalar (buff , j , value ):
378
363
x = value
379
364
if value is None :
380
- x = [FLOAT32_MISSING ]
365
+ x = [constants . FLOAT32_MISSING ]
381
366
buff [j ] = x [0 ]
382
367
383
368
384
369
def sanitise_value_int_scalar (buff , j , value ):
385
370
x = value
386
371
if value is None :
387
372
# print("MISSING", INT_MISSING, INT_FILL)
388
- x = [INT_MISSING ]
373
+ x = [constants . INT_MISSING ]
389
374
else :
390
375
x = sanitise_int_array (value , ndmin = 1 , dtype = np .int32 )
391
376
buff [j ] = x [0 ]
@@ -436,33 +421,35 @@ def drop_empty_second_dim(value):
436
421
437
422
def sanitise_value_float_1d (buff , j , value ):
438
423
if value is None :
439
- buff [j ] = FLOAT32_MISSING
424
+ buff [j ] = constants . FLOAT32_MISSING
440
425
else :
441
426
value = np .array (value , ndmin = 1 , dtype = buff .dtype , copy = False )
442
427
# numpy will map None values to Nan, but we need a
443
428
# specific NaN
444
- value [np .isnan (value )] = FLOAT32_MISSING
429
+ value [np .isnan (value )] = constants . FLOAT32_MISSING
445
430
value = drop_empty_second_dim (value )
446
- buff [j ] = FLOAT32_FILL
431
+ buff [j ] = constants . FLOAT32_FILL
447
432
buff [j , : value .shape [0 ]] = value
448
433
449
434
450
435
def sanitise_value_float_2d (buff , j , value ):
451
436
if value is None :
452
- buff [j ] = FLOAT32_MISSING
437
+ buff [j ] = constants . FLOAT32_MISSING
453
438
else :
454
439
# print("value = ", value)
455
440
value = np .array (value , ndmin = 2 , dtype = buff .dtype , copy = False )
456
- buff [j ] = FLOAT32_FILL
441
+ buff [j ] = constants . FLOAT32_FILL
457
442
buff [j , :, : value .shape [1 ]] = value
458
443
459
444
460
445
def sanitise_int_array (value , ndmin , dtype ):
461
446
if isinstance (value , tuple ):
462
- value = [VCF_INT_MISSING if x is None else x for x in value ] # NEEDS TEST
447
+ value = [
448
+ constants .VCF_INT_MISSING if x is None else x for x in value
449
+ ] # NEEDS TEST
463
450
value = np .array (value , ndmin = ndmin , copy = False )
464
- value [value == VCF_INT_MISSING ] = - 1
465
- value [value == VCF_INT_FILL ] = - 2
451
+ value [value == constants . VCF_INT_MISSING ] = - 1
452
+ value [value == constants . VCF_INT_FILL ] = - 2
466
453
# TODO watch out for clipping here!
467
454
return value .astype (dtype )
468
455
@@ -486,15 +473,11 @@ def sanitise_value_int_2d(buff, j, value):
486
473
buff [j , :, : value .shape [1 ]] = value
487
474
488
475
489
- MIN_INT_VALUE = np .iinfo (np .int32 ).min + 2
490
- VCF_INT_MISSING = np .iinfo (np .int32 ).min
491
- VCF_INT_FILL = np .iinfo (np .int32 ).min + 1
492
-
493
476
missing_value_map = {
494
- "Integer" : - 1 ,
495
- "Float" : FLOAT32_MISSING ,
496
- "String" : "." ,
497
- "Character" : "." ,
477
+ "Integer" : constants . INT_MISSING ,
478
+ "Float" : constants . FLOAT32_MISSING ,
479
+ "String" : constants . STR_MISSING ,
480
+ "Character" : constants . STR_MISSING ,
498
481
"Flag" : False ,
499
482
}
500
483
@@ -538,17 +521,12 @@ def transform_and_update_bounds(self, vcf_value):
538
521
return value
539
522
540
523
541
- MIN_INT_VALUE = np .iinfo (np .int32 ).min + 2
542
- VCF_INT_MISSING = np .iinfo (np .int32 ).min
543
- VCF_INT_FILL = np .iinfo (np .int32 ).min + 1
544
-
545
-
546
524
class IntegerValueTransformer (VcfValueTransformer ):
547
525
def update_bounds (self , value ):
548
526
summary = self .field .summary
549
527
# Mask out missing and fill values
550
528
# print(value)
551
- a = value [value >= MIN_INT_VALUE ]
529
+ a = value [value >= constants . MIN_INT_VALUE ]
552
530
if a .size > 0 :
553
531
summary .max_value = int (max (summary .max_value , np .max (a )))
554
532
summary .min_value = int (min (summary .min_value , np .min (a )))
@@ -1935,7 +1913,7 @@ def encode_alleles_partition(self, partition_index):
1935
1913
alt_col .iter_values (partition .start , partition .stop ),
1936
1914
):
1937
1915
j = alleles .next_buffer_row ()
1938
- alleles .buff [j , :] = STR_FILL
1916
+ alleles .buff [j , :] = constants . STR_FILL
1939
1917
alleles .buff [j , 0 ] = ref [0 ]
1940
1918
alleles .buff [j , 1 : 1 + len (alt )] = alt
1941
1919
alleles .flush ()
@@ -1958,7 +1936,7 @@ def encode_id_partition(self, partition_index):
1958
1936
vid .buff [j ] = value [0 ]
1959
1937
vid_mask .buff [j ] = False
1960
1938
else :
1961
- vid .buff [j ] = STR_MISSING
1939
+ vid .buff [j ] = constants . STR_MISSING
1962
1940
vid_mask .buff [j ] = True
1963
1941
vid .flush ()
1964
1942
vid_mask .flush ()
@@ -2253,227 +2231,3 @@ def convert(
2253
2231
worker_processes = worker_processes ,
2254
2232
show_progress = show_progress ,
2255
2233
)
2256
-
2257
-
2258
- def assert_all_missing_float (a ):
2259
- v = np .array (a , dtype = np .float32 ).view (np .int32 )
2260
- nt .assert_equal (v , FLOAT32_MISSING_AS_INT32 )
2261
-
2262
-
2263
- def assert_all_fill_float (a ):
2264
- v = np .array (a , dtype = np .float32 ).view (np .int32 )
2265
- nt .assert_equal (v , FLOAT32_FILL_AS_INT32 )
2266
-
2267
-
2268
- def assert_all_missing_int (a ):
2269
- v = np .array (a , dtype = int )
2270
- nt .assert_equal (v , - 1 )
2271
-
2272
-
2273
- def assert_all_fill_int (a ):
2274
- v = np .array (a , dtype = int )
2275
- nt .assert_equal (v , - 2 )
2276
-
2277
-
2278
- def assert_all_missing_string (a ):
2279
- nt .assert_equal (a , "." )
2280
-
2281
-
2282
- def assert_all_fill_string (a ):
2283
- nt .assert_equal (a , "" )
2284
-
2285
-
2286
- def assert_all_fill (zarr_val , vcf_type ):
2287
- if vcf_type == "Integer" :
2288
- assert_all_fill_int (zarr_val )
2289
- elif vcf_type in ("String" , "Character" ):
2290
- assert_all_fill_string (zarr_val )
2291
- elif vcf_type == "Float" :
2292
- assert_all_fill_float (zarr_val )
2293
- else : # pragma: no cover
2294
- assert False # noqa PT015
2295
-
2296
-
2297
- def assert_all_missing (zarr_val , vcf_type ):
2298
- if vcf_type == "Integer" :
2299
- assert_all_missing_int (zarr_val )
2300
- elif vcf_type in ("String" , "Character" ):
2301
- assert_all_missing_string (zarr_val )
2302
- elif vcf_type == "Flag" :
2303
- assert zarr_val == False # noqa 712
2304
- elif vcf_type == "Float" :
2305
- assert_all_missing_float (zarr_val )
2306
- else : # pragma: no cover
2307
- assert False # noqa PT015
2308
-
2309
-
2310
- def assert_info_val_missing (zarr_val , vcf_type ):
2311
- assert_all_missing (zarr_val , vcf_type )
2312
-
2313
-
2314
- def assert_format_val_missing (zarr_val , vcf_type ):
2315
- assert_info_val_missing (zarr_val , vcf_type )
2316
-
2317
-
2318
- # Note: checking exact equality may prove problematic here
2319
- # but we should be deterministically storing what cyvcf2
2320
- # provides, which should compare equal.
2321
-
2322
-
2323
- def assert_info_val_equal (vcf_val , zarr_val , vcf_type ):
2324
- assert vcf_val is not None
2325
- if vcf_type in ("String" , "Character" ):
2326
- split = list (vcf_val .split ("," ))
2327
- k = len (split )
2328
- if isinstance (zarr_val , str ):
2329
- assert k == 1
2330
- # Scalar
2331
- assert vcf_val == zarr_val
2332
- else :
2333
- nt .assert_equal (split , zarr_val [:k ])
2334
- assert_all_fill (zarr_val [k :], vcf_type )
2335
-
2336
- elif isinstance (vcf_val , tuple ):
2337
- vcf_missing_value_map = {
2338
- "Integer" : - 1 ,
2339
- "Float" : FLOAT32_MISSING ,
2340
- }
2341
- v = [vcf_missing_value_map [vcf_type ] if x is None else x for x in vcf_val ]
2342
- missing = np .array ([j for j , x in enumerate (vcf_val ) if x is None ], dtype = int )
2343
- a = np .array (v )
2344
- k = len (a )
2345
- # We are checking for int missing twice here, but it's necessary to have
2346
- # a separate check for floats because different NaNs compare equal
2347
- nt .assert_equal (a , zarr_val [:k ])
2348
- assert_all_missing (zarr_val [missing ], vcf_type )
2349
- if k < len (zarr_val ):
2350
- assert_all_fill (zarr_val [k :], vcf_type )
2351
- else :
2352
- # Scalar
2353
- zarr_val = np .array (zarr_val , ndmin = 1 )
2354
- assert len (zarr_val .shape ) == 1
2355
- assert vcf_val == zarr_val [0 ]
2356
- if len (zarr_val ) > 1 :
2357
- assert_all_fill (zarr_val [1 :], vcf_type )
2358
-
2359
-
2360
- def assert_format_val_equal (vcf_val , zarr_val , vcf_type ):
2361
- assert vcf_val is not None
2362
- assert isinstance (vcf_val , np .ndarray )
2363
- if vcf_type in ("String" , "Character" ):
2364
- assert len (vcf_val ) == len (zarr_val )
2365
- for v , z in zip (vcf_val , zarr_val ):
2366
- split = list (v .split ("," ))
2367
- # Note: deliberately duplicating logic here between this and the
2368
- # INFO col above to make sure all combinations are covered by tests
2369
- k = len (split )
2370
- if k == 1 :
2371
- assert v == z
2372
- else :
2373
- nt .assert_equal (split , z [:k ])
2374
- assert_all_fill (z [k :], vcf_type )
2375
- else :
2376
- assert vcf_val .shape [0 ] == zarr_val .shape [0 ]
2377
- if len (vcf_val .shape ) == len (zarr_val .shape ) + 1 :
2378
- assert vcf_val .shape [- 1 ] == 1
2379
- vcf_val = vcf_val [..., 0 ]
2380
- assert len (vcf_val .shape ) <= 2
2381
- assert len (vcf_val .shape ) == len (zarr_val .shape )
2382
- if len (vcf_val .shape ) == 2 :
2383
- k = vcf_val .shape [1 ]
2384
- if zarr_val .shape [1 ] != k :
2385
- assert_all_fill (zarr_val [:, k :], vcf_type )
2386
- zarr_val = zarr_val [:, :k ]
2387
- assert vcf_val .shape == zarr_val .shape
2388
- if vcf_type == "Integer" :
2389
- vcf_val [vcf_val == VCF_INT_MISSING ] = INT_MISSING
2390
- vcf_val [vcf_val == VCF_INT_FILL ] = INT_FILL
2391
- elif vcf_type == "Float" :
2392
- nt .assert_equal (vcf_val .view (np .int32 ), zarr_val .view (np .int32 ))
2393
-
2394
- nt .assert_equal (vcf_val , zarr_val )
2395
-
2396
-
2397
- # TODO rename to "verify"
2398
- def validate (vcf_path , zarr_path , show_progress = False ):
2399
- store = zarr .DirectoryStore (zarr_path )
2400
-
2401
- root = zarr .group (store = store )
2402
- pos = root ["variant_position" ][:]
2403
- allele = root ["variant_allele" ][:]
2404
- chrom = root ["contig_id" ][:][root ["variant_contig" ][:]]
2405
- vid = root ["variant_id" ][:]
2406
- call_genotype = None
2407
- if "call_genotype" in root :
2408
- call_genotype = iter (root ["call_genotype" ])
2409
-
2410
- vcf = cyvcf2 .VCF (vcf_path )
2411
- format_headers = {}
2412
- info_headers = {}
2413
- for h in vcf .header_iter ():
2414
- if h ["HeaderType" ] == "FORMAT" :
2415
- format_headers [h ["ID" ]] = h
2416
- if h ["HeaderType" ] == "INFO" :
2417
- info_headers [h ["ID" ]] = h
2418
-
2419
- format_fields = {}
2420
- info_fields = {}
2421
- for colname in root .keys ():
2422
- if colname .startswith ("call" ) and not colname .startswith ("call_genotype" ):
2423
- vcf_name = colname .split ("_" , 1 )[1 ]
2424
- vcf_type = format_headers [vcf_name ]["Type" ]
2425
- format_fields [vcf_name ] = vcf_type , iter (root [colname ])
2426
- if colname .startswith ("variant" ):
2427
- name = colname .split ("_" , 1 )[1 ]
2428
- if name .isupper ():
2429
- vcf_type = info_headers [name ]["Type" ]
2430
- info_fields [name ] = vcf_type , iter (root [colname ])
2431
-
2432
- first_pos = next (vcf ).POS
2433
- start_index = np .searchsorted (pos , first_pos )
2434
- assert pos [start_index ] == first_pos
2435
- vcf = cyvcf2 .VCF (vcf_path )
2436
- if show_progress :
2437
- iterator = tqdm .tqdm (vcf , desc = " Verify" , total = vcf .num_records ) # NEEDS TEST
2438
- else :
2439
- iterator = vcf
2440
- for j , row in enumerate (iterator , start_index ):
2441
- assert chrom [j ] == row .CHROM
2442
- assert pos [j ] == row .POS
2443
- assert vid [j ] == ("." if row .ID is None else row .ID )
2444
- assert allele [j , 0 ] == row .REF
2445
- k = len (row .ALT )
2446
- nt .assert_array_equal (allele [j , 1 : k + 1 ], row .ALT )
2447
- assert np .all (allele [j , k + 1 :] == "" )
2448
- # TODO FILTERS
2449
-
2450
- if call_genotype is None :
2451
- val = None
2452
- try :
2453
- val = row .format ("GT" )
2454
- except KeyError :
2455
- pass
2456
- assert val is None
2457
- else :
2458
- gt = row .genotype .array ()
2459
- gt_zarr = next (call_genotype )
2460
- gt_vcf = gt [:, :- 1 ]
2461
- # NOTE cyvcf2 remaps genotypes automatically
2462
- # into the same missing/pad encoding that sgkit uses.
2463
- nt .assert_array_equal (gt_zarr , gt_vcf )
2464
-
2465
- for name , (vcf_type , zarr_iter ) in info_fields .items ():
2466
- vcf_val = row .INFO .get (name , None )
2467
- zarr_val = next (zarr_iter )
2468
- if vcf_val is None :
2469
- assert_info_val_missing (zarr_val , vcf_type )
2470
- else :
2471
- assert_info_val_equal (vcf_val , zarr_val , vcf_type )
2472
-
2473
- for name , (vcf_type , zarr_iter ) in format_fields .items ():
2474
- vcf_val = row .format (name )
2475
- zarr_val = next (zarr_iter )
2476
- if vcf_val is None :
2477
- assert_format_val_missing (zarr_val , vcf_type )
2478
- else :
2479
- assert_format_val_equal (vcf_val , zarr_val , vcf_type )
0 commit comments