14
14
15
15
FAM_FIELDS = [
16
16
("family_id" , str , "U" ),
17
- ("member_id " , str , "U" ),
17
+ ("individual_id " , str , "U" ),
18
18
("paternal_id" , str , "U" ),
19
19
("maternal_id" , str , "U" ),
20
20
("sex" , str , "int8" ),
36
36
37
37
38
38
def read_fam(path, sep=None):
    """Read a PLINK ``.fam`` file into a pandas DataFrame.

    Column names are taken from ``FAM_FIELDS`` and column dtypes from
    ``FAM_DF_DTYPE``.

    Args:
        path: Location of the ``.fam`` file, handed to ``pd.read_csv``.
        sep: Field separator. When ``None`` (the default) pandas sniffs the
            delimiter from the file contents.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
    names = [field[0] for field in FAM_FIELDS]
    # Delimiter sniffing (sep=None) is only implemented by the Python parser.
    # Asking for it explicitly avoids the ParserWarning pandas emits when it
    # has to fall back from the default C engine.
    engine = "python" if sep is None else None
    return pd.read_csv(
        path, sep=sep, names=names, dtype=FAM_DF_DTYPE, engine=engine
    )
44
43
45
44
46
45
def read_bim(path, sep=None):
    """Read a PLINK ``.bim`` file into a pandas DataFrame.

    Column names are taken from ``BIM_FIELDS`` and column dtypes from
    ``BIM_DF_DTYPE``.

    Args:
        path: Location of the ``.bim`` file; converted with ``str`` before
            being handed to ``pd.read_csv``.
        sep: Field separator. When ``None`` (the default) pandas sniffs the
            delimiter from the file contents.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
    names = [field[0] for field in BIM_FIELDS]
    # Delimiter sniffing (sep=None) is only implemented by the Python parser.
    # Asking for it explicitly avoids the ParserWarning pandas emits when it
    # has to fall back from the default C engine.
    engine = "python" if sep is None else None
    return pd.read_csv(
        str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE, engine=engine
    )
54
50
55
51
@@ -78,28 +74,21 @@ def __init__(self, prefix):
78
74
self .prefix + ".fam" ,
79
75
)
80
76
81
- # Read sample information from .fam file
82
- samples = []
83
- with open (self .paths .fam_path ) as f :
84
- for line in f :
85
- fields = line .strip ().split ()
86
- if len (fields ) >= 2 : # At minimum, we need FID and IID
87
- samples .append (fields [1 ])
88
- self .fam = FamData (sid = np .array (samples ), sid_count = len (samples ))
89
- self .n_samples = len (samples )
90
-
91
77
self .bim = read_bim (self .paths .bim_path )
92
- self .n_variants = self .bim .shape [0 ]
78
+ self .fam = read_fam (self .paths .fam_path )
79
+
80
+ self ._num_records = self .bim .shape [0 ]
81
+ self ._num_samples = self .fam .shape [0 ]
93
82
94
83
# Calculate bytes per SNP: 1 byte per 4 samples, rounded up
95
- self .bytes_per_snp = (self .n_samples + 3 ) // 4
84
+ self .bytes_per_snp = (self ._num_samples + 3 ) // 4
96
85
97
86
# Verify BED file has correct magic bytes
98
87
with open (self .paths .bed_path , "rb" ) as f :
99
88
magic = f .read (3 )
100
89
assert magic == b"\x6c \x1b \x01 " , "Invalid BED file format"
101
90
102
- expected_size = self .n_variants * self .bytes_per_snp + 3 # +3 for magic bytes
91
+ expected_size = self .num_records * self .bytes_per_snp + 3 # +3 for magic bytes
103
92
actual_size = os .path .getsize (self .paths .bed_path )
104
93
if actual_size < expected_size :
105
94
raise ValueError (
@@ -144,20 +133,20 @@ def path(self):
144
133
145
134
@property
def num_records(self):
    """Return the variant count stored in ``self._num_records``."""
    return self._num_records
137
+
138
@property
def num_samples(self):
    """Return the sample count stored in ``self._num_samples``."""
    return self._num_samples
148
141
149
142
@property
def samples(self):
    """Return one ``vcz.Sample`` per entry of ``self.fam.individual_id``.

    The samples are built in the order the ``individual_id`` values are
    iterated, each value becoming the ``id`` of its ``vcz.Sample``.
    """
    return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
152
145
153
146
@property
def contigs(self):
    """Return one ``vcz.Contig`` per distinct value of ``self.bim.contig``.

    Each distinct value is converted with ``str`` and used as the contig
    ``id``; order follows whatever ``self.bim.contig.unique()`` yields.
    """
    return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
156
149
157
- @property
158
- def num_samples (self ):
159
- return len (self .samples )
160
-
161
150
def iter_contig (self , start , stop ):
162
151
chrom_to_contig_index = {contig .id : i for i , contig in enumerate (self .contigs )}
163
152
for chrom in self .bim .contig [start :stop ]:
@@ -198,9 +187,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
198
187
samples_padded = self .bytes_per_snp * 4
199
188
genotypes_reshaped = all_genotypes .reshape (chunk_size , samples_padded , 2 )
200
189
201
- gt = genotypes_reshaped [:, : self .n_samples ]
190
+ gt = genotypes_reshaped [:, : self ._num_samples ]
202
191
203
- phased = np .zeros ((chunk_size , self .n_samples ), dtype = bool )
192
+ phased = np .zeros ((chunk_size , self ._num_samples ), dtype = bool )
204
193
205
194
for i , (ref , alt ) in enumerate (
206
195
zip (ref_field [start :stop ], alt_field [start :stop ])
@@ -217,7 +206,7 @@ def generate_schema(
217
206
variants_chunk_size = None ,
218
207
samples_chunk_size = None ,
219
208
):
220
- n = self .fam . sid_count
209
+ n = self .num_samples
221
210
m = self .num_records
222
211
logging .info (f"Scanned plink with { n } samples and { m } variants" )
223
212
dimensions = vcz .standard_dimensions (
0 commit comments