Skip to content

Commit ac9f1d9

Browse files
Use the pandas-based FAM reader
1 parent 7fe383a commit ac9f1d9

File tree

2 files changed

+43
-29
lines changed

2 files changed

+43
-29
lines changed

bio2zarr/plink.py

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
FAM_FIELDS = [
1616
("family_id", str, "U"),
17-
("member_id", str, "U"),
17+
("individual_id", str, "U"),
1818
("paternal_id", str, "U"),
1919
("maternal_id", str, "U"),
2020
("sex", str, "int8"),
@@ -36,20 +36,16 @@
3636

3737

3838
def read_fam(path, sep=None):
39-
if sep is None:
40-
sep = " "
4139
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
4240
names = [f[0] for f in FAM_FIELDS]
43-
return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
41+
df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
42+
return df
4443

4544

4645
def read_bim(path, sep=None):
47-
if sep is None:
48-
sep = "\t"
4946
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
5047
names = [f[0] for f in BIM_FIELDS]
5148
df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
52-
# df["contig"] = df["contig"].where(df["contig"] != "0", None)
5349
return df
5450

5551

@@ -78,28 +74,21 @@ def __init__(self, prefix):
7874
self.prefix + ".fam",
7975
)
8076

81-
# Read sample information from .fam file
82-
samples = []
83-
with open(self.paths.fam_path) as f:
84-
for line in f:
85-
fields = line.strip().split()
86-
if len(fields) >= 2: # At minimum, we need FID and IID
87-
samples.append(fields[1])
88-
self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
89-
self.n_samples = len(samples)
90-
9177
self.bim = read_bim(self.paths.bim_path)
92-
self.n_variants = self.bim.shape[0]
78+
self.fam = read_fam(self.paths.fam_path)
79+
80+
self._num_records = self.bim.shape[0]
81+
self._num_samples = self.fam.shape[0]
9382

9483
# Calculate bytes per SNP: 1 byte per 4 samples, rounded up
95-
self.bytes_per_snp = (self.n_samples + 3) // 4
84+
self.bytes_per_snp = (self._num_samples + 3) // 4
9685

9786
# Verify BED file has correct magic bytes
9887
with open(self.paths.bed_path, "rb") as f:
9988
magic = f.read(3)
10089
assert magic == b"\x6c\x1b\x01", "Invalid BED file format"
10190

102-
expected_size = self.n_variants * self.bytes_per_snp + 3 # +3 for magic bytes
91+
expected_size = self.num_records * self.bytes_per_snp + 3 # +3 for magic bytes
10392
actual_size = os.path.getsize(self.paths.bed_path)
10493
if actual_size < expected_size:
10594
raise ValueError(
@@ -144,20 +133,20 @@ def path(self):
144133

145134
@property
146135
def num_records(self):
147-
return self.n_variants
136+
return self._num_records
137+
138+
@property
139+
def num_samples(self):
140+
return self._num_samples
148141

149142
@property
150143
def samples(self):
151-
return [vcz.Sample(id=sample) for sample in self.fam.sid]
144+
return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
152145

153146
@property
154147
def contigs(self):
155148
return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
156149

157-
@property
158-
def num_samples(self):
159-
return len(self.samples)
160-
161150
def iter_contig(self, start, stop):
162151
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
163152
for chrom in self.bim.contig[start:stop]:
@@ -198,9 +187,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
198187
samples_padded = self.bytes_per_snp * 4
199188
genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
200189

201-
gt = genotypes_reshaped[:, : self.n_samples]
190+
gt = genotypes_reshaped[:, : self._num_samples]
202191

203-
phased = np.zeros((chunk_size, self.n_samples), dtype=bool)
192+
phased = np.zeros((chunk_size, self._num_samples), dtype=bool)
204193

205194
for i, (ref, alt) in enumerate(
206195
zip(ref_field[start:stop], alt_field[start:stop])
@@ -217,7 +206,7 @@ def generate_schema(
217206
variants_chunk_size=None,
218207
samples_chunk_size=None,
219208
):
220-
n = self.fam.sid_count
209+
n = self.num_samples
221210
m = self.num_records
222211
logging.info(f"Scanned plink with {n} samples and {m} variants")
223212
dimensions = vcz.standard_dimensions(

tests/test_plink.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,31 @@ def test_example(self):
2424
nt.assert_array_equal(df["allele_2"].values, ["GG", "C"])
2525

2626

27+
class TestReadFam:
28+
def test_example(self):
29+
path = "tests/data/plink/example.fam"
30+
df = plink.read_fam(path)
31+
# FID IID FATHER MOTHER SEX PHENOTYPE
32+
# ind0 ind0 0 0 0 -9
33+
# ind1 ind1 0 0 0 -9
34+
# ind2 ind2 0 0 0 -9
35+
# ind3 ind3 0 0 0 -9
36+
# ind4 ind4 0 0 0 -9
37+
# ind5 ind5 0 0 0 -9
38+
# ind6 ind6 0 0 0 -9
39+
# ind7 ind7 0 0 0 -9
40+
# ind8 ind8 0 0 0 -9
41+
# ind9 ind9 0 0 0 -9
42+
nt.assert_array_equal(df["family_id"].values, [f"ind{j}" for j in range(10)])
43+
nt.assert_array_equal(
44+
df["individual_id"].values, [f"ind{j}" for j in range(10)]
45+
)
46+
nt.assert_array_equal(df["paternal_id"].values, ["0" for j in range(10)])
47+
nt.assert_array_equal(df["maternal_id"].values, ["0" for j in range(10)])
48+
nt.assert_array_equal(df["sex"].values, ["0" for j in range(10)])
49+
nt.assert_array_equal(df["phenotype"].values, ["-9" for j in range(10)])
50+
51+
2752
class TestSmallExample:
2853
@pytest.fixture(scope="class")
2954
def bed_path(self, tmp_path_factory):

0 commit comments

Comments
 (0)