5
5
import warnings
6
6
7
7
import numpy as np
8
+ import pandas as pd
8
9
9
10
from bio2zarr import constants , core , vcz
10
11
11
12
logger = logging .getLogger (__name__ )
12
13
13
14
15
# Columns of a PLINK .fam file, in file order. Each entry is
# (column name, dtype used when parsing with pandas, array dtype string).
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
FAM_FIELDS = [
    ("family_id", str, "U"),
    ("member_id", str, "U"),
    ("paternal_id", str, "U"),
    ("maternal_id", str, "U"),
    ("sex", str, "int8"),
    ("phenotype", str, "int8"),
]
# Column name -> dtype handed to pandas when reading the file.
FAM_DF_DTYPE = {name: df_dtype for name, df_dtype, _ in FAM_FIELDS}
# Column name -> dtype string for the corresponding array.
FAM_ARRAY_DTYPE = {name: array_dtype for name, _, array_dtype in FAM_FIELDS}
25
+
26
# Columns of a PLINK .bim file, in file order. Each entry is
# (column name, dtype used when parsing with pandas, array dtype string).
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
BIM_FIELDS = [
    ("contig", str, "U"),
    ("variant_id", str, "U"),
    ("cm_position", "float32", "float32"),
    ("position", "int32", "int32"),
    ("allele_1", str, "S"),
    ("allele_2", str, "S"),
]
# Column name -> dtype handed to pandas when reading the file.
BIM_DF_DTYPE = {name: df_dtype for name, df_dtype, _ in BIM_FIELDS}
# Column name -> dtype string for the corresponding array.
BIM_ARRAY_DTYPE = {name: array_dtype for name, _, array_dtype in BIM_FIELDS}
36
+
37
+
38
def read_fam(path, sep=None):
    """Read a PLINK .fam file into a pandas DataFrame.

    The file is parsed without a header row; columns are named after
    FAM_FIELDS and typed according to FAM_DF_DTYPE.

    Args:
        path: Location of the .fam file, passed straight to pandas.read_csv.
        sep: Field separator passed to pandas.read_csv. When None, fields
            are split on any run of whitespace, so both space- and
            tab-delimited files are accepted.

    Returns:
        A DataFrame with one row per line of the file.
    """
    if sep is None:
        # A single literal space would put every field of a tab-delimited
        # file into one column; splitting on whitespace handles both layouts.
        sep = r"\s+"
    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
    names = [field[0] for field in FAM_FIELDS]
    return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
44
+
45
+
46
def read_bim(path, sep=None):
    """Read a PLINK .bim file into a pandas DataFrame.

    The file is parsed without a header row; columns are named after
    BIM_FIELDS and typed according to BIM_DF_DTYPE.

    Args:
        path: Location of the .bim file, passed straight to pandas.read_csv.
        sep: Field separator passed to pandas.read_csv. When None, fields
            are split on any run of whitespace, so both tab- and
            space-delimited files are accepted.

    Returns:
        A DataFrame with one row per variant line of the file.
    """
    if sep is None:
        # Split on any run of whitespace so that tab- and space-delimited
        # files both parse into the six expected columns.
        sep = r"\s+"
    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
    names = [field[0] for field in BIM_FIELDS]
    # NOTE: contig codes are returned exactly as written in the file; a
    # contig of "0" is not converted to a missing value here.
    return pd.read_csv(path, sep=sep, names=names, dtype=BIM_DF_DTYPE)
54
+
55
+
14
56
@dataclasses .dataclass
15
57
class PlinkPaths :
16
58
bed_path : str
@@ -24,16 +66,6 @@ class FamData:
24
66
sid_count : int
25
67
26
68
27
- @dataclasses .dataclass
28
- class BimData :
29
- chromosome : np .ndarray
30
- vid : np .ndarray
31
- bp_position : np .ndarray
32
- allele_1 : np .ndarray
33
- allele_2 : np .ndarray
34
- vid_count : int
35
-
36
-
37
69
class PlinkFormat (vcz .Source ):
38
70
def __init__ (self , prefix ):
39
71
# TODO we will need support multiple chromosomes here to join
@@ -56,40 +88,8 @@ def __init__(self, prefix):
56
88
self .fam = FamData (sid = np .array (samples ), sid_count = len (samples ))
57
89
self .n_samples = len (samples )
58
90
59
- # Read variant information from .bim file
60
- chromosomes = []
61
- vids = []
62
- positions = []
63
- allele1 = []
64
- allele2 = []
65
-
66
- with open (self .paths .bim_path ) as f :
67
- for line in f :
68
- fields = line .strip ().split ()
69
- if len (fields ) >= 6 :
70
- chrom , vid , _ , pos , a1 , a2 = (
71
- fields [0 ],
72
- fields [1 ],
73
- fields [2 ],
74
- fields [3 ],
75
- fields [4 ],
76
- fields [5 ],
77
- )
78
- chromosomes .append (chrom )
79
- vids .append (vid )
80
- positions .append (int (pos ))
81
- allele1 .append (a1 )
82
- allele2 .append (a2 )
83
-
84
- self .bim = BimData (
85
- chromosome = np .array (chromosomes ),
86
- vid = np .array (vids ),
87
- bp_position = np .array (positions ),
88
- allele_1 = np .array (allele1 ),
89
- allele_2 = np .array (allele2 ),
90
- vid_count = len (vids ),
91
- )
92
- self .n_variants = len (vids )
91
+ self .bim = read_bim (self .paths .bim_path )
92
+ self .n_variants = self .bim .shape [0 ]
93
93
94
94
# Calculate bytes per SNP: 1 byte per 4 samples, rounded up
95
95
self .bytes_per_snp = (self .n_samples + 3 ) // 4
@@ -144,35 +144,35 @@ def path(self):
144
144
145
145
@property
def num_records(self):
    """Number of variant records held by this source (``self.n_variants``)."""
    return self.n_variants
148
148
149
149
@property
def samples(self):
    """List of ``vcz.Sample`` objects, one per sample ID in ``self.fam.sid``."""
    sample_ids = self.fam.sid
    return [vcz.Sample(id=sample_id) for sample_id in sample_ids]
152
152
153
153
@property
def contigs(self):
    """List of ``vcz.Contig`` objects, one per distinct contig in the .bim data.

    Contigs are listed in order of first appearance in ``self.bim.contig``
    and their IDs are the string form of the values found there.
    """
    distinct_contigs = self.bim.contig.unique()
    return [vcz.Contig(id=str(chrom)) for chrom in distinct_contigs]
156
156
157
157
@property
def num_samples(self):
    """Number of samples, i.e. the number of IDs in ``self.fam.sid``."""
    # ``samples`` yields exactly one entry per sample ID, so counting the IDs
    # directly gives the same answer without building the Sample objects.
    return len(self.fam.sid)
160
160
161
161
def iter_contig(self, start, stop):
    """Yield the contig index of each variant in the range [start, stop).

    The index is the position of the variant's contig within
    ``self.contigs``, matched on the string form of the contig value.
    """
    chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
    for chrom in self.bim.contig[start:stop]:
        yield chrom_to_contig_index[str(chrom)]
165
165
166
166
def iter_field(self, field_name, shape, start, stop):
    """Yield the values of ``field_name`` for variants in [start, stop).

    Only the "position" field is available; its values come from the
    ``position`` column of ``self.bim``. ``shape`` is not used. Because
    this is a generator, the field-name check runs when iteration starts,
    not when the method is called.
    """
    assert field_name == "position"  # Only position field is supported from plink
    yield from self.bim.position[start:stop]
169
169
170
170
def iter_id(self, start, stop):
    """Yield the variant ID of each variant in the range [start, stop).

    IDs come from the ``variant_id`` column of ``self.bim``.
    """
    yield from self.bim.variant_id[start:stop]
172
172
173
173
def iter_alleles_and_genotypes (self , start , stop , shape , num_alleles ):
174
- alt_field = self .bim .allele_1
175
- ref_field = self .bim .allele_2
174
+ alt_field = self .bim .allele_1 . values
175
+ ref_field = self .bim .allele_2 . values
176
176
177
177
chunk_size = stop - start
178
178
@@ -218,7 +218,7 @@ def generate_schema(
218
218
samples_chunk_size = None ,
219
219
):
220
220
n = self .fam .sid_count
221
- m = self .bim . vid_count
221
+ m = self .num_records
222
222
logging .info (f"Scanned plink with { n } samples and { m } variants" )
223
223
dimensions = vcz .standard_dimensions (
224
224
variants_size = m ,
@@ -241,7 +241,7 @@ def generate_schema(
241
241
)
242
242
# If we don't have SVLEN or END annotations, the rlen field is defined
243
243
# as the length of the REF
244
- max_len = self .bim .allele_2 .itemsize
244
+ max_len = self .bim .allele_2 .values . itemsize
245
245
246
246
array_specs = [
247
247
vcz .ZarrArraySpec (
@@ -278,7 +278,7 @@ def generate_schema(
278
278
),
279
279
vcz .ZarrArraySpec (
280
280
name = "variant_contig" ,
281
- dtype = core .min_int_dtype (0 , len (np .unique (self .bim .chromosome ))),
281
+ dtype = core .min_int_dtype (0 , len (np .unique (self .bim .contig ))),
282
282
dimensions = ["variants" ],
283
283
description = "Contig/chromosome index for each variant" ,
284
284
),
0 commit comments