1+ import dataclasses
12import logging
23import pathlib
34
910logger = logging .getLogger (__name__ )
1011
1112
@dataclasses.dataclass
class PlinkPaths:
    """Paths of the three files that together make up one PLINK fileset."""

    # Path of the ``.bed`` file.
    bed_path: pathlib.Path
    # Path of the ``.bim`` file.
    bim_path: pathlib.Path
    # Path of the ``.fam`` file.
    fam_path: pathlib.Path
18+
19+
1220class PlinkFormat (vcz .Source ):
@core.requires_optional_dependency("bed_reader", "plink")
def __init__(self, prefix):
    """Open the PLINK fileset ``<prefix>.bed`` / ``.bim`` / ``.fam``.

    ``prefix`` is the path shared by the three files, without any of
    the ``.bed`` / ``.bim`` / ``.fam`` extensions. It is stored on
    ``self.prefix`` as a ``pathlib.Path``; the opened reader is stored
    on ``self.bed``.
    """
    import bed_reader

    # TODO: we will need to support multiple chromosomes here to join
    # plinks into one big zarr. So, these will require multiple
    # bed and bim files, but should share a .fam
    self.prefix = pathlib.Path(prefix)

    # Append each extension to the final path component instead of
    # using with_suffix(): with_suffix() replaces everything after the
    # last dot, so a prefix such as "study.chr1" would wrongly resolve
    # to "study.bed" rather than "study.chr1.bed".
    base_name = self.prefix.name
    paths = PlinkPaths(
        self.prefix.with_name(base_name + ".bed"),
        self.prefix.with_name(base_name + ".bim"),
        self.prefix.with_name(base_name + ".fam"),
    )

    self.bed = bed_reader.open_bed(
        paths.bed_path,
        bim_location=paths.bim_path,
        fam_location=paths.fam_path,
        num_threads=1,
        count_A1=False,
    )
1942
@property
def path(self):
    """Return the path prefix stored on this instance as ``self.prefix``."""
    plink_prefix = self.prefix
    return plink_prefix
2346
2447 @property
2548 def num_records (self ):
@@ -46,6 +69,9 @@ def iter_field(self, field_name, shape, start, stop):
4669 assert field_name == "position" # Only position field is supported from plink
4770 yield from self .bed .bp_position [start :stop ]
4871
def iter_id(self, start, stop):
    """Yield the entries of ``self.bed.sid`` in the range ``[start, stop)``.

    The range is applied as an ordinary slice, so ``stop`` is exclusive.
    """
    for variant_id in self.bed.sid[start:stop]:
        yield variant_id
74+
4975 def iter_alleles_and_genotypes (self , start , stop , shape , num_alleles ):
5076 ref_field = self .bed .allele_1
5177 alt_field = self .bed .allele_2
@@ -107,6 +133,18 @@ def generate_schema(
107133 dimensions = ["variants" , "alleles" ],
108134 description = None ,
109135 ),
136+ vcz .ZarrArraySpec (
137+ name = "variant_id" ,
138+ dtype = "O" ,
139+ dimensions = ["variants" ],
140+ description = None ,
141+ ),
142+ vcz .ZarrArraySpec (
143+ name = "variant_id_mask" ,
144+ dtype = "bool" ,
145+ dimensions = ["variants" ],
146+ description = None ,
147+ ),
110148 vcz .ZarrArraySpec (
111149 source = None ,
112150 name = "variant_length" ,
@@ -147,20 +185,20 @@ def generate_schema(
147185
148186
149187def convert (
150- bed_path ,
151- zarr_path ,
188+ prefix ,
189+ out ,
152190 * ,
153191 variants_chunk_size = None ,
154192 samples_chunk_size = None ,
155193 worker_processes = 1 ,
156194 show_progress = False ,
157195):
158- plink_format = PlinkFormat (bed_path )
196+ plink_format = PlinkFormat (prefix )
159197 schema_instance = plink_format .generate_schema (
160198 variants_chunk_size = variants_chunk_size ,
161199 samples_chunk_size = samples_chunk_size ,
162200 )
163- zarr_path = pathlib .Path (zarr_path )
201+ zarr_path = pathlib .Path (out )
164202 vzw = vcz .VcfZarrWriter (PlinkFormat , zarr_path )
165203 # Rough heuristic to split work up enough to keep utilisation high
166204 target_num_partitions = max (1 , worker_processes * 4 )
0 commit comments