5
5
import numpy as np
6
6
import zarr
7
7
8
- from bio2zarr import constants , vcz
8
+ from bio2zarr import constants , core , vcz
9
9
10
10
logger = logging .getLogger (__name__ )
11
11
@@ -18,6 +18,9 @@ def __init__(self, path):
18
18
self .samples = [vcz .Sample (id = sample ) for sample in self .bed .iid ]
19
19
self .num_samples = len (self .samples )
20
20
self .root_attrs = {}
21
+ self .contigs = [
22
+ vcz .Contig (id = str (chrom )) for chrom in np .unique (self .bed .chromosome )
23
+ ]
21
24
22
25
def iter_alleles (self , start , stop , num_alleles ):
23
26
ref_field = self .bed .allele_1
@@ -32,6 +35,11 @@ def iter_alleles(self, start, stop, num_alleles):
32
35
alleles [1 : 1 + len (alt )] = alt
33
36
yield alleles
34
37
38
+ def iter_contig (self , start , stop ):
39
+ chrom_to_contig_index = {contig .id : i for i , contig in enumerate (self .contigs )}
40
+ for chrom in self .bed .chromosome [start :stop ]:
41
+ yield chrom_to_contig_index [str (chrom )]
42
+
35
43
def iter_field (self , field_name , shape , start , stop ):
36
44
assert field_name == "position" # Only position field is supported from plink
37
45
yield from self .bed .bp_position [start :stop ]
@@ -88,6 +96,15 @@ def generate_schema(
88
96
chunks = [schema_instance .variants_chunk_size , 2 ],
89
97
description = None ,
90
98
),
99
+ vcz .ZarrArraySpec .new (
100
+ vcf_field = None ,
101
+ name = "variant_contig" ,
102
+ dtype = core .min_int_dtype (0 , len (np .unique (self .bed .chromosome ))),
103
+ shape = [m ],
104
+ dimensions = ["variants" ],
105
+ chunks = [schema_instance .variants_chunk_size ],
106
+ description = "Contig/chromosome index for each variant" ,
107
+ ),
91
108
vcz .ZarrArraySpec .new (
92
109
vcf_field = None ,
93
110
name = "call_genotype_phased" ,
@@ -159,9 +176,7 @@ def convert(
159
176
show_progress = show_progress ,
160
177
)
161
178
vzw .finalise (show_progress )
162
-
163
- # TODO - index code needs variant_contig
164
- # vzw.create_index()
179
+ vzw .create_index ()
165
180
166
181
167
182
# FIXME do this more efficiently - currently reading the whole thing
0 commit comments