@@ -116,7 +116,7 @@ def smallest_dtype(self):
116
116
elif self .vcf_type == "Flag" :
117
117
ret = "bool"
118
118
elif self .vcf_type == "Character" :
119
- ret = "S1 "
119
+ ret = "U1 "
120
120
else :
121
121
assert self .vcf_type == "String"
122
122
ret = "O"
@@ -1393,7 +1393,7 @@ def assert_all_fill(zarr_val, vcf_type):
1393
1393
assert_all_fill_string (zarr_val )
1394
1394
elif vcf_type == "Float" :
1395
1395
assert_all_fill_float (zarr_val )
1396
- else :
1396
+ else : # pragma: no cover
1397
1397
assert False
1398
1398
1399
1399
@@ -1406,7 +1406,7 @@ def assert_all_missing(zarr_val, vcf_type):
1406
1406
assert zarr_val == False # noqa 712
1407
1407
elif vcf_type == "Float" :
1408
1408
assert_all_missing_float (zarr_val )
1409
- else :
1409
+ else : # pragma: no cover
1410
1410
assert False
1411
1411
1412
1412
@@ -1425,19 +1425,20 @@ def assert_format_val_missing(zarr_val, vcf_type):
1425
1425
1426
1426
def assert_info_val_equal (vcf_val , zarr_val , vcf_type ):
1427
1427
assert vcf_val is not None
1428
- if not isinstance (vcf_val , tuple ):
1429
- # Scalar
1430
- zarr_val = np .array (zarr_val , ndmin = 1 )
1431
- assert len (zarr_val .shape ) == 1
1432
- assert vcf_val == zarr_val [0 ]
1433
- if len (zarr_val ) > 1 :
1434
- assert_all_fill (zarr_val [1 :], vcf_type )
1435
- else :
1428
+ if vcf_type in ("String" , "Character" ):
1429
+ split = list (vcf_val .split ("," ))
1430
+ k = len (split )
1431
+ if k == 1 :
1432
+ # Scalar
1433
+ assert vcf_val == zarr_val
1434
+ else :
1435
+ nt .assert_equal (split , zarr_val [:k ])
1436
+ assert_all_fill (zarr_val [k :], vcf_type )
1437
+
1438
+ elif isinstance (vcf_val , tuple ):
1436
1439
vcf_missing_value_map = {
1437
1440
"Integer" : - 1 ,
1438
1441
"Float" : FLOAT32_MISSING ,
1439
- "String" : "." ,
1440
- "Character" : "." ,
1441
1442
}
1442
1443
v = [vcf_missing_value_map [vcf_type ] if x is None else x for x in vcf_val ]
1443
1444
missing = np .array ([j for j , x in enumerate (vcf_val ) if x is None ], dtype = int )
@@ -1449,31 +1450,50 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
1449
1450
assert_all_missing (zarr_val [missing ], vcf_type )
1450
1451
if k < len (zarr_val ):
1451
1452
assert_all_fill (zarr_val [k :], vcf_type )
1453
+ else :
1454
+ # Scalar
1455
+ zarr_val = np .array (zarr_val , ndmin = 1 )
1456
+ assert len (zarr_val .shape ) == 1
1457
+ assert vcf_val == zarr_val [0 ]
1458
+ if len (zarr_val ) > 1 :
1459
+ assert_all_fill (zarr_val [1 :], vcf_type )
1452
1460
1453
1461
1454
1462
def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
    """Assert that a FORMAT field value from the VCF matches its Zarr encoding.

    Args:
        vcf_val: Per-sample values as a numpy array; must not be None.
        zarr_val: The corresponding values read back from the Zarr store.
        vcf_type: The VCF type name. "String" and "Character" are compared
            element by element; every other type is compared as an array.

    Raises:
        AssertionError: If the two values do not agree.

    Note:
        For ``vcf_type == "Integer"`` the VCF missing/fill sentinels in
        ``vcf_val`` are rewritten in place before the comparison.
    """
    assert vcf_val is not None
    assert isinstance(vcf_val, np.ndarray)
    if vcf_type in ("String", "Character"):
        assert len(vcf_val) == len(zarr_val)
        for v, z in zip(vcf_val, zarr_val):
            # Note: deliberately duplicating logic here between this and the
            # INFO col above to make sure all combinations are covered by tests
            split = v.split(",")
            k = len(split)
            if k == 1:
                assert v == z
            else:
                # A comma-separated VCF value is compared against the first
                # k entries of z; anything after that must be fill.
                nt.assert_equal(split, z[:k])
                assert_all_fill(z[k:], vcf_type)
    else:
        assert vcf_val.shape[0] == zarr_val.shape[0]
        if len(vcf_val.shape) == len(zarr_val.shape) + 1:
            # The VCF side carries an extra trailing axis of length 1; drop it.
            assert vcf_val.shape[-1] == 1
            vcf_val = vcf_val[..., 0]
        assert len(vcf_val.shape) <= 2
        assert len(vcf_val.shape) == len(zarr_val.shape)
        if len(vcf_val.shape) == 2:
            k = vcf_val.shape[1]
            if zarr_val.shape[1] != k:
                # Columns past the VCF width must hold only fill values.
                assert_all_fill(zarr_val[:, k:], vcf_type)
                zarr_val = zarr_val[:, :k]
        assert vcf_val.shape == zarr_val.shape
        if vcf_type == "Integer":
            # Translate the VCF sentinels to the Zarr-side sentinels.
            # This modifies vcf_val in place.
            vcf_val[vcf_val == VCF_INT_MISSING] = INT_MISSING
            vcf_val[vcf_val == VCF_INT_FILL] = INT_FILL
        elif vcf_type == "Float":
            # Compare the values reinterpreted as int32 in addition to the
            # value comparison below (presumably both are float32 -- TODO
            # confirm against the caller).
            nt.assert_equal(vcf_val.view(np.int32), zarr_val.view(np.int32))

        nt.assert_equal(vcf_val, zarr_val)
1477
1497
1478
1498
1479
1499
def validate (vcf_path , zarr_path , show_progress = False ):
@@ -1541,12 +1561,8 @@ def validate(vcf_path, zarr_path, show_progress=False):
1541
1561
gt = row .genotype .array ()
1542
1562
gt_zarr = next (call_genotype )
1543
1563
gt_vcf = gt [:, :- 1 ]
1544
- # NOTE weirdly cyvcf2 seems to remap genotypes automatically
1564
+ # NOTE cyvcf2 remaps genotypes automatically
1545
1565
# into the same missing/pad encoding that sgkit uses.
1546
- # if np.any(gt_zarr < 0):
1547
- # print("MISSING")
1548
- # print(gt_zarr)
1549
- # print(gt_vcf)
1550
1566
nt .assert_array_equal (gt_zarr , gt_vcf )
1551
1567
1552
1568
for name , (vcf_type , zarr_iter ) in info_fields .items ():
0 commit comments