Skip to content

Commit 7fe383a

Browse files
Add read_bim function based on sgkit
1 parent 81443fe commit 7fe383a

File tree

2 files changed

+69
-54
lines changed

2 files changed

+69
-54
lines changed

bio2zarr/plink.py

Lines changed: 54 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,54 @@
55
import warnings
66

77
import numpy as np
8+
import pandas as pd
89

910
from bio2zarr import constants, core, vcz
1011

1112
logger = logging.getLogger(__name__)
1213

1314

15+
FAM_FIELDS = [
16+
("family_id", str, "U"),
17+
("member_id", str, "U"),
18+
("paternal_id", str, "U"),
19+
("maternal_id", str, "U"),
20+
("sex", str, "int8"),
21+
("phenotype", str, "int8"),
22+
]
23+
FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
24+
FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
25+
26+
BIM_FIELDS = [
27+
("contig", str, "U"),
28+
("variant_id", str, "U"),
29+
("cm_position", "float32", "float32"),
30+
("position", "int32", "int32"),
31+
("allele_1", str, "S"),
32+
("allele_2", str, "S"),
33+
]
34+
BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
35+
BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
36+
37+
38+
def read_fam(path, sep=None):
39+
if sep is None:
40+
sep = " "
41+
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
42+
names = [f[0] for f in FAM_FIELDS]
43+
return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
44+
45+
46+
def read_bim(path, sep=None):
47+
if sep is None:
48+
sep = "\t"
49+
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
50+
names = [f[0] for f in BIM_FIELDS]
51+
df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
52+
# df["contig"] = df["contig"].where(df["contig"] != "0", None)
53+
return df
54+
55+
1456
@dataclasses.dataclass
1557
class PlinkPaths:
1658
bed_path: str
@@ -24,16 +66,6 @@ class FamData:
2466
sid_count: int
2567

2668

27-
@dataclasses.dataclass
28-
class BimData:
29-
chromosome: np.ndarray
30-
vid: np.ndarray
31-
bp_position: np.ndarray
32-
allele_1: np.ndarray
33-
allele_2: np.ndarray
34-
vid_count: int
35-
36-
3769
class PlinkFormat(vcz.Source):
3870
def __init__(self, prefix):
3971
# TODO we will need support multiple chromosomes here to join
@@ -56,40 +88,8 @@ def __init__(self, prefix):
5688
self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
5789
self.n_samples = len(samples)
5890

59-
# Read variant information from .bim file
60-
chromosomes = []
61-
vids = []
62-
positions = []
63-
allele1 = []
64-
allele2 = []
65-
66-
with open(self.paths.bim_path) as f:
67-
for line in f:
68-
fields = line.strip().split()
69-
if len(fields) >= 6:
70-
chrom, vid, _, pos, a1, a2 = (
71-
fields[0],
72-
fields[1],
73-
fields[2],
74-
fields[3],
75-
fields[4],
76-
fields[5],
77-
)
78-
chromosomes.append(chrom)
79-
vids.append(vid)
80-
positions.append(int(pos))
81-
allele1.append(a1)
82-
allele2.append(a2)
83-
84-
self.bim = BimData(
85-
chromosome=np.array(chromosomes),
86-
vid=np.array(vids),
87-
bp_position=np.array(positions),
88-
allele_1=np.array(allele1),
89-
allele_2=np.array(allele2),
90-
vid_count=len(vids),
91-
)
92-
self.n_variants = len(vids)
91+
self.bim = read_bim(self.paths.bim_path)
92+
self.n_variants = self.bim.shape[0]
9393

9494
# Calculate bytes per SNP: 1 byte per 4 samples, rounded up
9595
self.bytes_per_snp = (self.n_samples + 3) // 4
@@ -144,35 +144,35 @@ def path(self):
144144

145145
@property
146146
def num_records(self):
147-
return self.bim.vid_count
147+
return self.n_variants
148148

149149
@property
150150
def samples(self):
151151
return [vcz.Sample(id=sample) for sample in self.fam.sid]
152152

153153
@property
154154
def contigs(self):
155-
return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bim.chromosome)]
155+
return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
156156

157157
@property
158158
def num_samples(self):
159159
return len(self.samples)
160160

161161
def iter_contig(self, start, stop):
162162
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
163-
for chrom in self.bim.chromosome[start:stop]:
163+
for chrom in self.bim.contig[start:stop]:
164164
yield chrom_to_contig_index[str(chrom)]
165165

166166
def iter_field(self, field_name, shape, start, stop):
167167
assert field_name == "position" # Only position field is supported from plink
168-
yield from self.bim.bp_position[start:stop]
168+
yield from self.bim.position[start:stop]
169169

170170
def iter_id(self, start, stop):
171-
yield from self.bim.vid[start:stop]
171+
yield from self.bim.variant_id[start:stop]
172172

173173
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
174-
alt_field = self.bim.allele_1
175-
ref_field = self.bim.allele_2
174+
alt_field = self.bim.allele_1.values
175+
ref_field = self.bim.allele_2.values
176176

177177
chunk_size = stop - start
178178

@@ -218,7 +218,7 @@ def generate_schema(
218218
samples_chunk_size=None,
219219
):
220220
n = self.fam.sid_count
221-
m = self.bim.vid_count
221+
m = self.num_records
222222
logging.info(f"Scanned plink with {n} samples and {m} variants")
223223
dimensions = vcz.standard_dimensions(
224224
variants_size=m,
@@ -241,7 +241,7 @@ def generate_schema(
241241
)
242242
# If we don't have SVLEN or END annotations, the rlen field is defined
243243
# as the length of the REF
244-
max_len = self.bim.allele_2.itemsize
244+
max_len = self.bim.allele_2.values.itemsize
245245

246246
array_specs = [
247247
vcz.ZarrArraySpec(
@@ -278,7 +278,7 @@ def generate_schema(
278278
),
279279
vcz.ZarrArraySpec(
280280
name="variant_contig",
281-
dtype=core.min_int_dtype(0, len(np.unique(self.bim.chromosome))),
281+
dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))),
282282
dimensions=["variants"],
283283
description="Contig/chromosome index for each variant",
284284
),

tests/test_plink.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,21 @@
99
from bio2zarr import plink, vcf
1010

1111

12+
class TestReadBim:
13+
def test_example(self):
14+
path = "tests/data/plink/example.bim"
15+
df = plink.read_bim(path)
16+
# chrom, vid, cm_pos, pos, a1, a2
17+
# 1 1_10 0 10 A GG
18+
# 1 1_20 0 20 TTT C
19+
nt.assert_array_equal(df["contig"].values, ["1", "1"])
20+
nt.assert_array_equal(df["variant_id"].values, ["1_10", "1_20"])
21+
nt.assert_array_equal(df["cm_position"].values, [0, 0])
22+
nt.assert_array_equal(df["position"].values, [10, 20])
23+
nt.assert_array_equal(df["allele_1"].values, ["A", "TTT"])
24+
nt.assert_array_equal(df["allele_2"].values, ["GG", "C"])
25+
26+
1227
class TestSmallExample:
1328
@pytest.fixture(scope="class")
1429
def bed_path(self, tmp_path_factory):

0 commit comments

Comments
 (0)