From 36def1b6483e5136ac6a55423baf3f3b15c266cc Mon Sep 17 00:00:00 2001
From: Ben Jeffery <ben.jeffery@bdi.ox.ac.uk>
Date: Thu, 22 May 2025 14:50:04 +0100
Subject: [PATCH 1/7] Remove bed_reader

---
 bio2zarr/plink.py   | 182 ++++++++++++++++++++++++++++++++++++--------
 tests/test_plink.py |  18 -----
 2 files changed, 152 insertions(+), 48 deletions(-)

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index e170ddea..d6f5e438 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -1,6 +1,8 @@
 import dataclasses
 import logging
+import os
 import pathlib
+import warnings
 
 import numpy as np
 
@@ -16,27 +18,125 @@ class PlinkPaths:
     fam_path: str
 
 
+@dataclasses.dataclass
+class FamData:
+    sid: np.ndarray
+    sid_count: int
+
+
+@dataclasses.dataclass
+class BimData:
+    chromosome: np.ndarray
+    vid: np.ndarray
+    bp_position: np.ndarray
+    allele_1: np.ndarray
+    allele_2: np.ndarray
+    vid_count: int
+
+
 class PlinkFormat(vcz.Source):
-    @core.requires_optional_dependency("bed_reader", "plink")
     def __init__(self, prefix):
-        import bed_reader
-
         # TODO we will need support multiple chromosomes here to join
         # plinks into on big zarr. So, these will require multiple
         # bed and bim files, but should share a .fam
         self.prefix = str(prefix)
-        paths = PlinkPaths(
+        self.paths = PlinkPaths(
             self.prefix + ".bed",
             self.prefix + ".bim",
             self.prefix + ".fam",
         )
-        self.bed = bed_reader.open_bed(
-            paths.bed_path,
-            bim_location=paths.bim_path,
-            fam_location=paths.fam_path,
-            num_threads=1,
-            count_A1=True,
+
+        # Read sample information from .fam file
+        samples = []
+        with open(self.paths.fam_path) as f:
+            for line in f:
+                fields = line.strip().split()
+                if len(fields) >= 2:  # At minimum, we need FID and IID
+                    samples.append(fields[1])
+        self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
+        self.n_samples = len(samples)
+
+        # Read variant information from .bim file
+        chromosomes = []
+        vids = []
+        positions = []
+        allele1 = []
+        allele2 = []
+
+        with open(self.paths.bim_path) as f:
+            for line in f:
+                fields = line.strip().split()
+                if len(fields) >= 6:
+                    chrom, vid, _, pos, a1, a2 = (
+                        fields[0],
+                        fields[1],
+                        fields[2],
+                        fields[3],
+                        fields[4],
+                        fields[5],
+                    )
+                    chromosomes.append(chrom)
+                    vids.append(vid)
+                    positions.append(int(pos))
+                    allele1.append(a1)
+                    allele2.append(a2)
+
+        self.bim = BimData(
+            chromosome=np.array(chromosomes),
+            vid=np.array(vids),
+            bp_position=np.array(positions),
+            allele_1=np.array(allele1),
+            allele_2=np.array(allele2),
+            vid_count=len(vids),
         )
+        self.n_variants = len(vids)
+
+        # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
+        self.bytes_per_snp = (self.n_samples + 3) // 4
+
+        # Verify BED file has correct magic bytes
+        with open(self.paths.bed_path, "rb") as f:
+            magic = f.read(3)
+            assert magic == b"\x6c\x1b\x01", "Invalid BED file format"
+
+        expected_size = self.n_variants * self.bytes_per_snp + 3  # +3 for magic bytes
+        actual_size = os.path.getsize(self.paths.bed_path)
+        if actual_size < expected_size:
+            raise ValueError(
+                f"BED file is truncated: expected at least {expected_size} bytes, "
+                f"but only found {actual_size} bytes. "
+                f"Check that .bed, .bim, and .fam files match."
+            )
+        elif actual_size > expected_size:
+            # Warn if there's extra data (might indicate file mismatch)
+            warnings.warn(
+                f"BED file contains {actual_size} bytes but only expected "
+                f"{expected_size}. "
+                f"Using first {expected_size} bytes only.",
+                stacklevel=1,
+            )
+
+        # Initialize the lookup table with shape (256, 4, 2)
+        # 256 possible byte values, 4 samples per byte, 2 alleles per sample
+        lookup = np.zeros((256, 4, 2), dtype=np.int8)
+
+        # For each possible byte value (0-255)
+        for byte in range(256):
+            # For each of the 4 samples encoded in this byte
+            for sample in range(4):
+                # Extract the 2 bits for this sample
+                bits = (byte >> (sample * 2)) & 0b11
+                # Convert PLINK's bit encoding to genotype values
+                if bits == 0b00:
+                    lookup[byte, sample] = [1, 1]
+                elif bits == 0b01:
+                    lookup[byte, sample] = [-1, -1]
+                elif bits == 0b10:
+                    lookup[byte, sample] = [0, 1]
+                elif bits == 0b11:
+                    lookup[byte, sample] = [0, 0]
+
+        self.byte_lookup = lookup
 
     @property
     def path(self):
@@ -44,15 +144,15 @@ def path(self):
 
     @property
     def num_records(self):
-        return self.bed.sid_count
+        return self.bim.vid_count
 
     @property
     def samples(self):
-        return [vcz.Sample(id=sample) for sample in self.bed.iid]
+        return [vcz.Sample(id=sample) for sample in self.fam.sid]
 
     @property
     def contigs(self):
-        return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)]
+        return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bim.chromosome)]
 
     @property
     def num_samples(self):
@@ -60,43 +160,65 @@ def num_samples(self):
 
     def iter_contig(self, start, stop):
         chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
-        for chrom in self.bed.chromosome[start:stop]:
+        for chrom in self.bim.chromosome[start:stop]:
             yield chrom_to_contig_index[str(chrom)]
 
     def iter_field(self, field_name, shape, start, stop):
         assert field_name == "position"  # Only position field is supported from plink
-        yield from self.bed.bp_position[start:stop]
+        yield from self.bim.bp_position[start:stop]
 
     def iter_id(self, start, stop):
-        yield from self.bed.sid[start:stop]
+        yield from self.bim.vid[start:stop]
 
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
-        alt_field = self.bed.allele_1
-        ref_field = self.bed.allele_2
-        bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
-        gt = np.zeros(shape, dtype=np.int8)
-        phased = np.zeros(shape[:-1], dtype=bool)
+        alt_field = self.bim.allele_1
+        ref_field = self.bim.allele_2
+
+        chunk_size = stop - start
+
+        # Calculate file offsets for the required data
+        # 3 bytes for the magic number at the beginning of the file
+        start_offset = 3 + (start * self.bytes_per_snp)
+        bytes_to_read = chunk_size * self.bytes_per_snp
+
+        # Read only the needed portion of the BED file
+        with open(self.paths.bed_path, "rb") as f:
+            f.seek(start_offset)
+            chunk_data = f.read(bytes_to_read)
+
+        data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
+        data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_snp)
+
+        # Apply lookup table to get genotypes
+        # Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
+        all_genotypes = self.byte_lookup[data_matrix]
+
+        # Reshape to get all samples in one dimension
+        # (chunk_size, bytes_per_snp*4, 2)
+        samples_padded = self.bytes_per_snp * 4
+        genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
+
+        gt = genotypes_reshaped[:, : self.n_samples]
+
+        phased = np.zeros((chunk_size, self.n_samples), dtype=bool)
+
         for i, (ref, alt) in enumerate(
             zip(ref_field[start:stop], alt_field[start:stop])
         ):
             alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
             alleles[0] = ref
             alleles[1 : 1 + len(alt)] = alt
-            gt[:] = 0
-            gt[bed_chunk[i] == -127] = -1
-            gt[bed_chunk[i] == 2] = 1
-            gt[bed_chunk[i] == 1, 1] = 1
 
             # rlen is the length of the REF in PLINK as there's no END annotations
-            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
+            yield vcz.VariantData(len(alleles[0]), alleles, gt[i], phased[i])
 
     def generate_schema(
         self,
         variants_chunk_size=None,
         samples_chunk_size=None,
     ):
-        n = self.bed.iid_count
-        m = self.bed.sid_count
+        n = self.fam.sid_count
+        m = self.bim.vid_count
         logging.info(f"Scanned plink with {n} samples and {m} variants")
         dimensions = vcz.standard_dimensions(
             variants_size=m,
@@ -119,7 +241,7 @@ def generate_schema(
         )
         # If we don't have SVLEN or END annotations, the rlen field is defined
         # as the length of the REF
-        max_len = self.bed.allele_2.itemsize
+        max_len = self.bim.allele_2.itemsize
 
         array_specs = [
             vcz.ZarrArraySpec(
@@ -156,7 +278,7 @@ def generate_schema(
             ),
             vcz.ZarrArraySpec(
                 name="variant_contig",
-                dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),
+                dtype=core.min_int_dtype(0, len(np.unique(self.bim.chromosome))),
                 dimensions=["variants"],
                 description="Contig/chromosome index for each variant",
             ),
diff --git a/tests/test_plink.py b/tests/test_plink.py
index 661b62f6..f116020a 100644
--- a/tests/test_plink.py
+++ b/tests/test_plink.py
@@ -1,5 +1,3 @@
-from unittest import mock
-
 import bed_reader
 import numpy as np
 import numpy.testing as nt
@@ -11,22 +9,6 @@
 from bio2zarr import plink, vcf
 
 
-def test_missing_dependency():
-    with mock.patch(
-        "importlib.import_module",
-        side_effect=ImportError("No module named 'bed_reader'"),
-    ):
-        with pytest.raises(ImportError) as exc_info:
-            plink.convert(
-                "UNUSED_PATH",
-                "UNUSED_PATH",
-            )
-        assert (
-            "This process requires the optional bed_reader module. "
-            "Install it with: pip install bio2zarr[plink]" in str(exc_info.value)
-        )
-
-
 class TestSmallExample:
     @pytest.fixture(scope="class")
     def bed_path(self, tmp_path_factory):

From 81443fe5fc7358390b909335eddd080574c47e4e Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Thu, 22 May 2025 20:38:23 +0100
Subject: [PATCH 2/7] Remove bed_reader/plink from CI tests

---
 .github/workflows/ci.yml | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 911489d2..da2c4d98 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -101,16 +101,6 @@ jobs:
           python -m bio2zarr tskit2zarr convert tests/data/ts/example.trees ts.vcz
           deactivate
 
-          python -m venv env-plink
-          source env-plink/bin/activate
-          python -m pip install .
-          python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt
-          test "$(cat plink_exit.txt)" = "1"
-          grep -q "This process requires the optional bed_reader module. Install it with: pip install bio2zarr\[plink\]" plink.txt
-          python -m pip install '.[plink]'
-          python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz
-          deactivate
-
           python -m venv env-vcf
           source env-vcf/bin/activate
           python -m pip install .

From 7fe383a7a532ad58a4c7a7de68d428d41c4e0a91 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Thu, 22 May 2025 21:07:23 +0100
Subject: [PATCH 3/7] Add read_bim function based on sgkit

---
 bio2zarr/plink.py   | 108 ++++++++++++++++++++++----------------------
 tests/test_plink.py |  15 ++++++
 2 files changed, 69 insertions(+), 54 deletions(-)

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index d6f5e438..5effa162 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -5,12 +5,54 @@
 import warnings
 
 import numpy as np
+import pandas as pd
 
 from bio2zarr import constants, core, vcz
 
 logger = logging.getLogger(__name__)
 
 
+FAM_FIELDS = [
+    ("family_id", str, "U"),
+    ("member_id", str, "U"),
+    ("paternal_id", str, "U"),
+    ("maternal_id", str, "U"),
+    ("sex", str, "int8"),
+    ("phenotype", str, "int8"),
+]
+FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
+FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
+
+BIM_FIELDS = [
+    ("contig", str, "U"),
+    ("variant_id", str, "U"),
+    ("cm_position", "float32", "float32"),
+    ("position", "int32", "int32"),
+    ("allele_1", str, "S"),
+    ("allele_2", str, "S"),
+]
+BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
+BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
+
+
+def read_fam(path, sep=None):
+    if sep is None:
+        sep = " "
+    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
+    names = [f[0] for f in FAM_FIELDS]
+    return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
+
+
+def read_bim(path, sep=None):
+    if sep is None:
+        sep = "\t"
+    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
+    names = [f[0] for f in BIM_FIELDS]
+    df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
+    # df["contig"] = df["contig"].where(df["contig"] != "0", None)
+    return df
+
+
 @dataclasses.dataclass
 class PlinkPaths:
     bed_path: str
@@ -24,16 +66,6 @@ class FamData:
     sid_count: int
 
 
-@dataclasses.dataclass
-class BimData:
-    chromosome: np.ndarray
-    vid: np.ndarray
-    bp_position: np.ndarray
-    allele_1: np.ndarray
-    allele_2: np.ndarray
-    vid_count: int
-
-
 class PlinkFormat(vcz.Source):
     def __init__(self, prefix):
         # TODO we will need support multiple chromosomes here to join
@@ -56,40 +88,8 @@ def __init__(self, prefix):
         self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
         self.n_samples = len(samples)
 
-        # Read variant information from .bim file
-        chromosomes = []
-        vids = []
-        positions = []
-        allele1 = []
-        allele2 = []
-
-        with open(self.paths.bim_path) as f:
-            for line in f:
-                fields = line.strip().split()
-                if len(fields) >= 6:
-                    chrom, vid, _, pos, a1, a2 = (
-                        fields[0],
-                        fields[1],
-                        fields[2],
-                        fields[3],
-                        fields[4],
-                        fields[5],
-                    )
-                    chromosomes.append(chrom)
-                    vids.append(vid)
-                    positions.append(int(pos))
-                    allele1.append(a1)
-                    allele2.append(a2)
-
-        self.bim = BimData(
-            chromosome=np.array(chromosomes),
-            vid=np.array(vids),
-            bp_position=np.array(positions),
-            allele_1=np.array(allele1),
-            allele_2=np.array(allele2),
-            vid_count=len(vids),
-        )
-        self.n_variants = len(vids)
+        self.bim = read_bim(self.paths.bim_path)
+        self.n_variants = self.bim.shape[0]
 
         # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
         self.bytes_per_snp = (self.n_samples + 3) // 4
@@ -144,7 +144,7 @@ def path(self):
 
     @property
     def num_records(self):
-        return self.bim.vid_count
+        return self.n_variants
 
     @property
     def samples(self):
@@ -152,7 +152,7 @@ def samples(self):
 
     @property
     def contigs(self):
-        return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bim.chromosome)]
+        return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
 
     @property
     def num_samples(self):
@@ -160,19 +160,19 @@ def num_samples(self):
 
     def iter_contig(self, start, stop):
         chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
-        for chrom in self.bim.chromosome[start:stop]:
+        for chrom in self.bim.contig[start:stop]:
             yield chrom_to_contig_index[str(chrom)]
 
     def iter_field(self, field_name, shape, start, stop):
         assert field_name == "position"  # Only position field is supported from plink
-        yield from self.bim.bp_position[start:stop]
+        yield from self.bim.position[start:stop]
 
     def iter_id(self, start, stop):
-        yield from self.bim.vid[start:stop]
+        yield from self.bim.variant_id[start:stop]
 
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
-        alt_field = self.bim.allele_1
-        ref_field = self.bim.allele_2
+        alt_field = self.bim.allele_1.values
+        ref_field = self.bim.allele_2.values
 
         chunk_size = stop - start
 
@@ -218,7 +218,7 @@ def generate_schema(
         samples_chunk_size=None,
     ):
         n = self.fam.sid_count
-        m = self.bim.vid_count
+        m = self.num_records
         logging.info(f"Scanned plink with {n} samples and {m} variants")
         dimensions = vcz.standard_dimensions(
             variants_size=m,
@@ -241,7 +241,7 @@ def generate_schema(
         )
         # If we don't have SVLEN or END annotations, the rlen field is defined
         # as the length of the REF
-        max_len = self.bim.allele_2.itemsize
+        max_len = self.bim.allele_2.values.itemsize
 
         array_specs = [
             vcz.ZarrArraySpec(
@@ -278,7 +278,7 @@ def generate_schema(
             ),
             vcz.ZarrArraySpec(
                 name="variant_contig",
-                dtype=core.min_int_dtype(0, len(np.unique(self.bim.chromosome))),
+                dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))),
                 dimensions=["variants"],
                 description="Contig/chromosome index for each variant",
             ),
diff --git a/tests/test_plink.py b/tests/test_plink.py
index f116020a..e3aa425e 100644
--- a/tests/test_plink.py
+++ b/tests/test_plink.py
@@ -9,6 +9,21 @@
 from bio2zarr import plink, vcf
 
 
+class TestReadBim:
+    def test_example(self):
+        path = "tests/data/plink/example.bim"
+        df = plink.read_bim(path)
+        # chrom, vid, cm_pos, pos, a1, a2
+        # 1       1_10    0       10      A       GG
+        # 1       1_20    0       20      TTT     C
+        nt.assert_array_equal(df["contig"].values, ["1", "1"])
+        nt.assert_array_equal(df["variant_id"].values, ["1_10", "1_20"])
+        nt.assert_array_equal(df["cm_position"].values, [0, 0])
+        nt.assert_array_equal(df["position"].values, [10, 20])
+        nt.assert_array_equal(df["allele_1"].values, ["A", "TTT"])
+        nt.assert_array_equal(df["allele_2"].values, ["GG", "C"])
+
+
 class TestSmallExample:
     @pytest.fixture(scope="class")
     def bed_path(self, tmp_path_factory):

From ac9f1d942ab22a0847dd8a73db3fc3d8547a651e Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Thu, 22 May 2025 21:29:01 +0100
Subject: [PATCH 4/7] Use the pandas-based FAM reader

---
 bio2zarr/plink.py   | 47 +++++++++++++++++----------------------------
 tests/test_plink.py | 25 ++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index 5effa162..347ada0e 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -14,7 +14,7 @@
 
 FAM_FIELDS = [
     ("family_id", str, "U"),
-    ("member_id", str, "U"),
+    ("individual_id", str, "U"),
     ("paternal_id", str, "U"),
     ("maternal_id", str, "U"),
     ("sex", str, "int8"),
@@ -36,20 +36,16 @@
 
 
 def read_fam(path, sep=None):
-    if sep is None:
-        sep = " "
     # See: https://www.cog-genomics.org/plink/1.9/formats#fam
     names = [f[0] for f in FAM_FIELDS]
-    return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
+    df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
+    return df
 
 
 def read_bim(path, sep=None):
-    if sep is None:
-        sep = "\t"
     # See: https://www.cog-genomics.org/plink/1.9/formats#bim
     names = [f[0] for f in BIM_FIELDS]
     df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
-    # df["contig"] = df["contig"].where(df["contig"] != "0", None)
     return df
 
 
@@ -78,28 +74,21 @@ def __init__(self, prefix):
             self.prefix + ".fam",
         )
 
-        # Read sample information from .fam file
-        samples = []
-        with open(self.paths.fam_path) as f:
-            for line in f:
-                fields = line.strip().split()
-                if len(fields) >= 2:  # At minimum, we need FID and IID
-                    samples.append(fields[1])
-        self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
-        self.n_samples = len(samples)
-
         self.bim = read_bim(self.paths.bim_path)
-        self.n_variants = self.bim.shape[0]
+        self.fam = read_fam(self.paths.fam_path)
+
+        self._num_records = self.bim.shape[0]
+        self._num_samples = self.fam.shape[0]
 
         # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
-        self.bytes_per_snp = (self.n_samples + 3) // 4
+        self.bytes_per_snp = (self._num_samples + 3) // 4
 
         # Verify BED file has correct magic bytes
         with open(self.paths.bed_path, "rb") as f:
             magic = f.read(3)
             assert magic == b"\x6c\x1b\x01", "Invalid BED file format"
 
-        expected_size = self.n_variants * self.bytes_per_snp + 3  # +3 for magic bytes
+        expected_size = self.num_records * self.bytes_per_snp + 3  # +3 for magic bytes
         actual_size = os.path.getsize(self.paths.bed_path)
         if actual_size < expected_size:
             raise ValueError(
@@ -144,20 +133,20 @@ def path(self):
 
     @property
     def num_records(self):
-        return self.n_variants
+        return self._num_records
+
+    @property
+    def num_samples(self):
+        return self._num_samples
 
     @property
     def samples(self):
-        return [vcz.Sample(id=sample) for sample in self.fam.sid]
+        return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
 
     @property
     def contigs(self):
         return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
 
-    @property
-    def num_samples(self):
-        return len(self.samples)
-
     def iter_contig(self, start, stop):
         chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
         for chrom in self.bim.contig[start:stop]:
@@ -198,9 +187,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
         samples_padded = self.bytes_per_snp * 4
         genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
 
-        gt = genotypes_reshaped[:, : self.n_samples]
+        gt = genotypes_reshaped[:, : self._num_samples]
 
-        phased = np.zeros((chunk_size, self.n_samples), dtype=bool)
+        phased = np.zeros((chunk_size, self._num_samples), dtype=bool)
 
         for i, (ref, alt) in enumerate(
             zip(ref_field[start:stop], alt_field[start:stop])
@@ -217,7 +206,7 @@ def generate_schema(
         variants_chunk_size=None,
         samples_chunk_size=None,
     ):
-        n = self.fam.sid_count
+        n = self.num_samples
         m = self.num_records
         logging.info(f"Scanned plink with {n} samples and {m} variants")
         dimensions = vcz.standard_dimensions(
diff --git a/tests/test_plink.py b/tests/test_plink.py
index e3aa425e..4e8b5ce5 100644
--- a/tests/test_plink.py
+++ b/tests/test_plink.py
@@ -24,6 +24,31 @@ def test_example(self):
         nt.assert_array_equal(df["allele_2"].values, ["GG", "C"])
 
 
+class TestReadFam:
+    def test_example(self):
+        path = "tests/data/plink/example.fam"
+        df = plink.read_fam(path)
+        # FID IID FATHER MOTHER SEX PHENOTYPE
+        # ind0 ind0 0 0 0 -9
+        # ind1 ind1 0 0 0 -9
+        # ind2 ind2 0 0 0 -9
+        # ind3 ind3 0 0 0 -9
+        # ind4 ind4 0 0 0 -9
+        # ind5 ind5 0 0 0 -9
+        # ind6 ind6 0 0 0 -9
+        # ind7 ind7 0 0 0 -9
+        # ind8 ind8 0 0 0 -9
+        # ind9 ind9 0 0 0 -9
+        nt.assert_array_equal(df["family_id"].values, [f"ind{j}" for j in range(10)])
+        nt.assert_array_equal(
+            df["individual_id"].values, [f"ind{j}" for j in range(10)]
+        )
+        nt.assert_array_equal(df["paternal_id"].values, ["0" for j in range(10)])
+        nt.assert_array_equal(df["maternal_id"].values, ["0" for j in range(10)])
+        nt.assert_array_equal(df["sex"].values, ["0" for j in range(10)])
+        nt.assert_array_equal(df["phenotype"].values, ["-9" for j in range(10)])
+
+
 class TestSmallExample:
     @pytest.fixture(scope="class")
     def bed_path(self, tmp_path_factory):

From bf56b86fa03e3cb5487f30a3f932b7320b94909a Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Thu, 22 May 2025 21:47:26 +0100
Subject: [PATCH 5/7] Separate out the BedReader code

---
 bio2zarr/plink.py   | 139 +++++++++++++++++++++-----------------------
 tests/test_plink.py |  13 +++++
 2 files changed, 78 insertions(+), 74 deletions(-)

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index 347ada0e..382489fb 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -1,8 +1,6 @@
 import dataclasses
 import logging
-import os
 import pathlib
-import warnings
 
 import numpy as np
 import pandas as pd
@@ -56,54 +54,21 @@ class PlinkPaths:
     fam_path: str
 
 
-@dataclasses.dataclass
-class FamData:
-    sid: np.ndarray
-    sid_count: int
-
-
-class PlinkFormat(vcz.Source):
-    def __init__(self, prefix):
-        # TODO we will need support multiple chromosomes here to join
-        # plinks into on big zarr. So, these will require multiple
-        # bed and bim files, but should share a .fam
-        self.prefix = str(prefix)
-        self.paths = PlinkPaths(
-            self.prefix + ".bed",
-            self.prefix + ".bim",
-            self.prefix + ".fam",
-        )
-
-        self.bim = read_bim(self.paths.bim_path)
-        self.fam = read_fam(self.paths.fam_path)
-
-        self._num_records = self.bim.shape[0]
-        self._num_samples = self.fam.shape[0]
-
-        # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
-        self.bytes_per_snp = (self._num_samples + 3) // 4
+class BedReader:
+    def __init__(self, path, num_variants, num_samples):
+        self.num_variants = num_variants
+        self.num_samples = num_samples
+        self.path = path
+        # bytes per variant: 1 byte per 4 samples, rounded up
+        self.bytes_per_variant = (self.num_samples + 3) // 4
 
-        # Verify BED file has correct magic bytes
-        with open(self.paths.bed_path, "rb") as f:
+        with open(self.path, "rb") as f:
             magic = f.read(3)
-            assert magic == b"\x6c\x1b\x01", "Invalid BED file format"
-
-        expected_size = self.num_records * self.bytes_per_snp + 3  # +3 for magic bytes
-        actual_size = os.path.getsize(self.paths.bed_path)
-        if actual_size < expected_size:
-            raise ValueError(
-                f"BED file is truncated: expected at least {expected_size} bytes, "
-                f"but only found {actual_size} bytes. "
-                f"Check that .bed, .bim, and .fam files match."
-            )
-        elif actual_size > expected_size:
-            # Warn if there's extra data (might indicate file mismatch)
-            warnings.warn(
-                f"BED file contains {actual_size} bytes but only expected "
-                f"{expected_size}. "
-                f"Using first {expected_size} bytes only.",
-                stacklevel=1,
-            )
+            if magic != b"\x6c\x1b\x01":
+                raise ValueError("Invalid BED file magic bytes")
+
+        # We could check the size of the bed file here, but that would
+        # mean we can't work with streams.
 
         # Initialize the lookup table with shape (256, 4, 2)
         # 256 possible byte values, 4 samples per byte, 2 alleles per sample
@@ -127,6 +92,56 @@ def __init__(self, prefix):
 
         self.byte_lookup = lookup
 
+    def decode(self, start, stop):
+        chunk_size = stop - start
+
+        # Calculate file offsets for the required data
+        # 3 bytes for the magic number at the beginning of the file
+        start_offset = 3 + (start * self.bytes_per_variant)
+        bytes_to_read = chunk_size * self.bytes_per_variant
+
+        # TODO make it possible to read sequentially from the same file handle,
+        # seeking only when necessary.
+        with open(self.path, "rb") as f:
+            f.seek(start_offset)
+            chunk_data = f.read(bytes_to_read)
+
+        data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
+        data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_variant)
+
+        # Apply lookup table to get genotypes
+        # Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
+        all_genotypes = self.byte_lookup[data_matrix]
+
+        # Reshape to get all samples in one dimension
+        # (chunk_size, bytes_per_variant*4, 2)
+        samples_padded = self.bytes_per_variant * 4
+        genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
+
+        return genotypes_reshaped[:, : self.num_samples]
+
+
+class PlinkFormat(vcz.Source):
+    def __init__(self, prefix):
+        # TODO we will need support multiple chromosomes here to join
+        # plinks into on big zarr. So, these will require multiple
+        # bed and bim files, but should share a .fam
+        self.prefix = str(prefix)
+        self.paths = PlinkPaths(
+            self.prefix + ".bed",
+            self.prefix + ".bim",
+            self.prefix + ".fam",
+        )
+
+        self.bim = read_bim(self.paths.bim_path)
+        self.fam = read_fam(self.paths.fam_path)
+
+        self._num_records = self.bim.shape[0]
+        self._num_samples = self.fam.shape[0]
+        self.bed_reader = BedReader(
+            self.paths.bed_path, self.num_records, self.num_samples
+        )
+
     @property
     def path(self):
         return self.prefix
@@ -163,33 +178,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
         alt_field = self.bim.allele_1.values
         ref_field = self.bim.allele_2.values
 
-        chunk_size = stop - start
-
-        # Calculate file offsets for the required data
-        # 3 bytes for the magic number at the beginning of the file
-        start_offset = 3 + (start * self.bytes_per_snp)
-        bytes_to_read = chunk_size * self.bytes_per_snp
-
-        # Read only the needed portion of the BED file
-        with open(self.paths.bed_path, "rb") as f:
-            f.seek(start_offset)
-            chunk_data = f.read(bytes_to_read)
-
-        data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
-        data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_snp)
-
-        # Apply lookup table to get genotypes
-        # Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
-        all_genotypes = self.byte_lookup[data_matrix]
-
-        # Reshape to get all samples in one dimension
-        # (chunk_size, bytes_per_snp*4, 2)
-        samples_padded = self.bytes_per_snp * 4
-        genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
-
-        gt = genotypes_reshaped[:, : self._num_samples]
+        gt = self.bed_reader.decode(start, stop)
 
-        phased = np.zeros((chunk_size, self._num_samples), dtype=bool)
+        phased = np.zeros(gt.shape[:2], dtype=bool)
 
         for i, (ref, alt) in enumerate(
             zip(ref_field[start:stop], alt_field[start:stop])
diff --git a/tests/test_plink.py b/tests/test_plink.py
index 4e8b5ce5..9b63ad0f 100644
--- a/tests/test_plink.py
+++ b/tests/test_plink.py
@@ -49,6 +49,19 @@ def test_example(self):
         nt.assert_array_equal(df["phenotype"].values, ["-9" for j in range(10)])
 
 
+class TestBedReader:
+    @pytest.mark.parametrize(
+        "path",
+        [
+            "tests/data/plink/example.bim",
+            "tests/data/vcf/sample.vcf.gz",
+        ],
+    )
+    def test_bad_file_type(self, path):
+        with pytest.raises(ValueError, match="Invalid BED file magic bytes"):
+            plink.BedReader(path, 1, 1)
+
+
 class TestSmallExample:
     @pytest.fixture(scope="class")
     def bed_path(self, tmp_path_factory):

From 766dbd9c731cfa3cfa442afdc382bfdf1039c9b0 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Thu, 22 May 2025 21:50:06 +0100
Subject: [PATCH 6/7] Remove bed_reader dep and add pandas

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 17898968..4831cf35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,8 @@ dependencies = [
   # cyvcf2 also pulls in coloredlogs and click
   "coloredlogs",
   "click",
+  # Using pandas for reading plink files, but will be useful more generally
+  "pandas"
 ]
 requires-python = ">=3.10"
 classifiers = [
@@ -68,11 +70,9 @@ dev = [
 ]
 # TODO Using dev version of tskit for CI, FIXME before release
 tskit = ["tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python"]
-plink = ["bed_reader"]
 vcf = ["cyvcf2"]
 all = [
   "tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python", 
-  "bed_reader", 
   "cyvcf2"
   ]
 

From 08ceb749fb4b13ba11f5da3612a4e30e9b7c5cf6 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Thu, 22 May 2025 22:20:12 +0100
Subject: [PATCH 7/7] Test on generated bed files written by bed_reader

---
 bio2zarr/plink.py   |  7 ++-----
 tests/test_plink.py | 47 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index 382489fb..6ae2b873 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -62,6 +62,8 @@ def __init__(self, path, num_variants, num_samples):
         # bytes per variant: 1 byte per 4 samples, rounded up
         self.bytes_per_variant = (self.num_samples + 3) // 4
 
+        # TODO open this as a persistent file and support reading from a
+        # stream
         with open(self.path, "rb") as f:
             magic = f.read(3)
             if magic != b"\x6c\x1b\x01":
@@ -132,10 +134,8 @@ def __init__(self, prefix):
             self.prefix + ".bim",
             self.prefix + ".fam",
         )
-
         self.bim = read_bim(self.paths.bim_path)
         self.fam = read_fam(self.paths.fam_path)
-
         self._num_records = self.bim.shape[0]
         self._num_samples = self.fam.shape[0]
         self.bed_reader = BedReader(
@@ -177,11 +177,8 @@ def iter_id(self, start, stop):
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
         alt_field = self.bim.allele_1.values
         ref_field = self.bim.allele_2.values
-
         gt = self.bed_reader.decode(start, stop)
-
         phased = np.zeros(gt.shape[:2], dtype=bool)
-
         for i, (ref, alt) in enumerate(
             zip(ref_field[start:stop], alt_field[start:stop])
         ):
diff --git a/tests/test_plink.py b/tests/test_plink.py
index 9b63ad0f..0f863232 100644
--- a/tests/test_plink.py
+++ b/tests/test_plink.py
@@ -1,3 +1,5 @@
+import os.path
+
 import bed_reader
 import numpy as np
 import numpy.testing as nt
@@ -61,6 +63,51 @@ def test_bad_file_type(self, path):
         with pytest.raises(ValueError, match="Invalid BED file magic bytes"):
             plink.BedReader(path, 1, 1)
 
+    @pytest.mark.parametrize(
+        ("num_variants", "num_samples"),
+        [
+            (1, 1),
+            (1, 2),
+            (1, 3),
+            (1, 4),
+            (1, 5),
+            (1, 6),
+            (1, 7),
+            (1, 8),
+            (1, 9),
+            (2, 1),
+            (3, 1),
+            (10, 1),
+            (100, 1),
+            (10, 2),
+            (10, 3),
+            (10, 4),
+            (10, 5),
+            (20, 20),
+            (30, 3),
+        ],
+    )
+    def test_generated_bed_files(self, tmp_path, num_variants, num_samples):
+        bed_file = tmp_path / "a_file.bed"
+        # Generate a regular pattern of all possible values
+        data = np.arange(num_variants * num_samples, dtype=int) % 4
+        data[data == 3] = -127
+        data = data.reshape((num_variants, num_samples))
+
+        bed_reader.to_bed(bed_file, data.T, num_threads=1)
+
+        bytes_per_variant = (num_samples + 3) // 4
+        expected_size = 3 + bytes_per_variant * num_variants
+        assert os.path.getsize(bed_file) == expected_size
+
+        br_map = {0: (0, 0), 1: (0, 1), 2: (1, 1), -127: (-1, -1)}
+        reader = plink.BedReader(bed_file, num_variants, num_samples)
+        g = reader.decode(0, num_variants)
+        assert g.shape == (num_variants, num_samples, 2)
+        for j in range(num_variants):
+            for k in range(num_samples):
+                assert br_map[data[j, k]] == tuple(g[j, k])
+
 
 class TestSmallExample:
     @pytest.fixture(scope="class")