1
- import os
2
- import tempfile
3
1
from unittest import mock
4
2
5
3
import numpy as np
13
11
from bio2zarr import vcf
14
12
15
13
14
def test_missing_dependency():
    """Check the error raised by ``tsk.convert`` when tskit cannot be imported.

    ``importlib.import_module`` is patched so that every call raises
    ``ImportError``; presumably this is the hook the ``tsk`` module uses to
    load tskit lazily (confirm against the ``tsk`` implementation). The
    resulting exception must carry the installation hint for the optional
    extra.
    """
    expected_hint = (
        "This process requires the optional tskit module. Install "
        "it with: pip install bio2zarr[tskit]"
    )
    broken_import = mock.patch(
        "importlib.import_module",
        side_effect=ImportError("No module named 'tskit'"),
    )
    with broken_import:
        # The paths are never touched: the import failure is expected to
        # surface before any file access.
        with pytest.raises(ImportError) as exc_info:
            tsk.convert("UNUSED_PATH", "UNUSED_PATH")
        # Must sit outside the ``raises`` block, otherwise it would be
        # skipped once ``convert`` raises.
        assert expected_hint in str(exc_info.value)
28
+
29
+
16
30
def simple_ts (add_individuals = False ):
17
31
tables = tskit .TableCollection (sequence_length = 100 )
18
32
for _ in range (4 ):
@@ -37,98 +51,96 @@ def simple_ts(add_individuals=False):
37
51
return tables .tree_sequence ()
38
52
39
53
40
- class TestTskit :
41
- def test_simple_tree_sequence (self , tmp_path ):
42
- tree_sequence = simple_ts ()
43
- tree_sequence .dump (tmp_path / "test.trees" )
44
-
45
- # Manually specify the individuals_nodes, other tests use
46
- # tsk individuals.
47
- ind_nodes = np .array ([[0 , 1 ], [2 , 3 ]])
48
-
49
- with tempfile .TemporaryDirectory () as tempdir :
50
- zarr_path = os .path .join (tempdir , "test_output.zarr" )
51
- tsk .convert (
52
- tmp_path / "test.trees" ,
53
- zarr_path ,
54
- individuals_nodes = ind_nodes ,
55
- show_progress = False ,
56
- )
57
- zroot = zarr .open (zarr_path , mode = "r" )
58
- pos = zroot ["variant_position" ][:]
59
- assert pos .shape == (3 ,)
60
- assert pos .dtype == np .int8
61
- assert np .array_equal (pos , [10 , 20 , 30 ])
62
-
63
- alleles = zroot ["variant_allele" ][:]
64
- assert alleles .shape == (3 , 2 )
65
- assert alleles .dtype == "O"
66
- assert np .array_equal (alleles , [["A" , "TTTT" ], ["CCC" , "G" ], ["G" , "AA" ]])
67
-
68
- lengths = zroot ["variant_length" ][:]
69
- assert lengths .shape == (3 ,)
70
- assert lengths .dtype == np .int8
71
- assert np .array_equal (lengths , [1 , 3 , 1 ])
72
-
73
- genotypes = zroot ["call_genotype" ][:]
74
- assert genotypes .shape == (3 , 2 , 2 )
75
- assert genotypes .dtype == np .int8
76
- assert np .array_equal (
77
- genotypes , [[[1 , 1 ], [0 , 0 ]], [[0 , 0 ], [1 , 1 ]], [[1 , 0 ], [0 , 0 ]]]
78
- )
54
class TestSimpleTs:
    """Field-by-field checks on the store produced from ``simple_ts()``.

    The tree sequence is built with the default ``add_individuals=False``.
    Each test inspects a single array of the converted output so that a
    failure points directly at the affected field.
    """

    @pytest.fixture(scope="class")
    def conversion(self, tmp_path_factory):
        """Convert ``simple_ts()`` once and share the result across the class.

        Every test in this class only reads from the store (it is opened
        with ``mode="r"``), so a single conversion per class is enough.
        ``tmp_path_factory`` is used because the function-scoped ``tmp_path``
        fixture cannot be requested from a class-scoped fixture.

        Returns:
            A ``(ts, zroot)`` tuple: the source tree sequence and the
            read-only zarr handle of the converted output.
        """
        ts = simple_ts()
        zarr_path = tmp_path_factory.mktemp("simple_ts") / "test_output.vcz"
        tsk.convert(ts, zarr_path)
        zroot = zarr.open(zarr_path, mode="r")
        return ts, zroot

    def test_position(self, conversion):
        """``variant_position`` holds the three site positions as int8."""
        _, zroot = conversion
        pos = zroot["variant_position"][:]
        assert pos.shape == (3,)
        assert pos.dtype == np.int8
        assert np.array_equal(pos, [10, 20, 30])

    def test_alleles(self, conversion):
        """``variant_allele`` holds two allele strings per variant."""
        _, zroot = conversion
        alleles = zroot["variant_allele"][:]
        assert alleles.shape == (3, 2)
        assert alleles.dtype == "O"
        assert np.array_equal(alleles, [["A", "TTTT"], ["CCC", "G"], ["G", "AA"]])

    def test_variant_length(self, conversion):
        """``variant_length`` holds one int8 length per variant."""
        _, zroot = conversion
        lengths = zroot["variant_length"][:]
        assert lengths.shape == (3,)
        assert lengths.dtype == np.int8
        assert np.array_equal(lengths, [1, 3, 1])

    def test_genotypes(self, conversion):
        """``call_genotype`` is (variants, samples, 1) with int8 allele indexes."""
        _, zroot = conversion
        genotypes = zroot["call_genotype"][:]
        assert genotypes.shape == (3, 4, 1)
        assert genotypes.dtype == np.int8
        assert np.array_equal(
            genotypes,
            [[[1], [1], [0], [0]], [[0], [0], [1], [1]], [[1], [0], [0], [0]]],
        )

    def test_phased(self, conversion):
        """``call_genotype_phased`` is False for every variant/sample pair."""
        _, zroot = conversion
        phased = zroot["call_genotype_phased"][:]
        assert phased.shape == (3, 4)
        assert phased.dtype == "bool"
        assert np.all(~phased)

    def test_contig_id(self, conversion):
        """``contig_id`` contains the single contig name ``"1"``."""
        _, zroot = conversion
        contigs = zroot["contig_id"][:]
        assert contigs.shape == (1,)
        assert contigs.dtype == "O"
        assert np.array_equal(contigs, ["1"])

    def test_variant_contig(self, conversion):
        """``variant_contig`` maps every variant to contig index 0."""
        _, zroot = conversion
        contig = zroot["variant_contig"][:]
        assert contig.shape == (3,)
        assert contig.dtype == np.int8
        assert np.array_equal(contig, [0, 0, 0])

    def test_sample_id(self, conversion):
        """``sample_id`` lists four generated ``tsk_<n>`` identifiers."""
        _, zroot = conversion
        samples = zroot["sample_id"][:]
        assert samples.shape == (4,)
        assert samples.dtype == "O"
        assert np.array_equal(samples, ["tsk_0", "tsk_1", "tsk_2", "tsk_3"])

    def test_region_index(self, conversion):
        """``region_index`` holds a single six-column int8 row."""
        _, zroot = conversion
        region_index = zroot["region_index"][:]
        assert region_index.shape == (1, 6)
        assert region_index.dtype == np.int8
        assert np.array_equal(region_index, [[0, 0, 10, 30, 30, 3]])

    def test_fields(self, conversion):
        """The store exposes exactly the expected set of arrays."""
        _, zroot = conversion
        assert set(zroot.array_keys()) == {
            "variant_position",
            "variant_allele",
            "variant_length",
            "call_genotype",
            "call_genotype_phased",
            "call_genotype_mask",
            "contig_id",
            "variant_contig",
            "sample_id",
            "region_index",
        }
132
144
133
145
134
146
class TestTskitFormat :
@@ -463,7 +475,7 @@ def insert_branch_sites(tsk, m=1):
463
475
expected_gt_missing = np .array ([[1 ], [0 ], [- 1 ]])
464
476
assert np .array_equal (variant_data_missing .genotypes , expected_gt_missing )
465
477
466
- def test_genotype_dtype_selection (self , tmp_path ):
478
+ def test_genotype_dtype_i1 (self , tmp_path ):
467
479
tables = tskit .TableCollection (sequence_length = 100 )
468
480
for _ in range (4 ):
469
481
tables .nodes .add_row (flags = tskit .NODE_IS_SAMPLE , time = 0 )
@@ -477,12 +489,12 @@ def test_genotype_dtype_selection(self, tmp_path):
477
489
ts_path = tmp_path / "small_alleles.trees"
478
490
tree_sequence .dump (ts_path )
479
491
480
- ind_nodes = np .array ([[0 , 1 ], [2 , 3 ]])
481
- format_obj = tsk .TskitFormat (ts_path , individuals_nodes = ind_nodes )
492
+ format_obj = tsk .TskitFormat (ts_path )
482
493
schema = format_obj .generate_schema ()
483
494
call_genotype_spec = next (s for s in schema .fields if s .name == "call_genotype" )
484
495
assert call_genotype_spec .dtype == "i1"
485
496
497
+ def test_genotype_dtype_i4 (self , tmp_path ):
486
498
tables = tskit .TableCollection (sequence_length = 100 )
487
499
for _ in range (4 ):
488
500
tables .nodes .add_row (flags = tskit .NODE_IS_SAMPLE , time = 0 )
@@ -498,7 +510,7 @@ def test_genotype_dtype_selection(self, tmp_path):
498
510
ts_path = tmp_path / "large_alleles.trees"
499
511
tree_sequence .dump (ts_path )
500
512
501
- format_obj = tsk .TskitFormat (ts_path , individuals_nodes = ind_nodes )
513
+ format_obj = tsk .TskitFormat (ts_path )
502
514
schema = format_obj .generate_schema ()
503
515
call_genotype_spec = next (s for s in schema .fields if s .name == "call_genotype" )
504
516
assert call_genotype_spec .dtype == "i4"
@@ -508,6 +520,7 @@ def test_genotype_dtype_selection(self, tmp_path):
508
520
"ts" ,
509
521
[
510
522
simple_ts (add_individuals = True ),
523
+ simple_ts (add_individuals = False ),
511
524
],
512
525
)
513
526
def test_against_tskit_vcf_output (ts , tmp_path ):
0 commit comments