From 36def1b6483e5136ac6a55423baf3f3b15c266cc Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 22 May 2025 14:50:04 +0100 Subject: [PATCH 1/7] Remove bed_reader --- bio2zarr/plink.py | 182 ++++++++++++++++++++++++++++++++++++-------- tests/test_plink.py | 18 ----- 2 files changed, 152 insertions(+), 48 deletions(-) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index e170ddea..d6f5e438 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -1,6 +1,8 @@ import dataclasses import logging +import os import pathlib +import warnings import numpy as np @@ -16,27 +18,125 @@ class PlinkPaths: fam_path: str +@dataclasses.dataclass +class FamData: + sid: np.ndarray + sid_count: int + + +@dataclasses.dataclass +class BimData: + chromosome: np.ndarray + vid: np.ndarray + bp_position: np.ndarray + allele_1: np.ndarray + allele_2: np.ndarray + vid_count: int + + class PlinkFormat(vcz.Source): - @core.requires_optional_dependency("bed_reader", "plink") def __init__(self, prefix): - import bed_reader - # TODO we will need support multiple chromosomes here to join # plinks into on big zarr. So, these will require multiple # bed and bim files, but should share a .fam self.prefix = str(prefix) - paths = PlinkPaths( + self.paths = PlinkPaths( self.prefix + ".bed", self.prefix + ".bim", self.prefix + ".fam", ) - self.bed = bed_reader.open_bed( - paths.bed_path, - bim_location=paths.bim_path, - fam_location=paths.fam_path, - num_threads=1, - count_A1=True, + + # Read sample information from .fam file + samples = [] + with open(self.paths.fam_path) as f: + for line in f: + fields = line.strip().split() + if len(fields) >= 2: # At minimum, we need FID and IID + samples.append(fields[1]) + self.fam = FamData(sid=np.array(samples), sid_count=len(samples)) + self.n_samples = len(samples) + + # Read variant information from .bim file + chromosomes = [] + vids = [] + positions = [] + allele1 = [] + allele2 = [] + + with open(self.paths.bim_path) as f: + for line in f: + fields = line.strip().split() + if len(fields) >= 6: + chrom, vid, _, pos, a1, a2 = ( + fields[0], + fields[1], + fields[2], + fields[3], + fields[4], + fields[5], + ) + chromosomes.append(chrom) + vids.append(vid) + positions.append(int(pos)) + allele1.append(a1) + allele2.append(a2) + + self.bim = BimData( + chromosome=np.array(chromosomes), + vid=np.array(vids), + bp_position=np.array(positions), + allele_1=np.array(allele1), + allele_2=np.array(allele2), + vid_count=len(vids), ) + self.n_variants = len(vids) + + # Calculate bytes per SNP: 1 byte per 4 samples, rounded up + self.bytes_per_snp = (self.n_samples + 3) // 4 + + # Verify BED file has correct magic bytes + with open(self.paths.bed_path, "rb") as f: + magic = f.read(3) + assert magic == b"\x6c\x1b\x01", "Invalid BED file format" + + expected_size = self.n_variants * self.bytes_per_snp + 3 # +3 for magic bytes + actual_size = os.path.getsize(self.paths.bed_path) + if actual_size < expected_size: + raise ValueError( + f"BED file is truncated: expected at least {expected_size} bytes, " + f"but only found {actual_size} bytes. " + f"Check that .bed, .bim, and .fam files match." + ) + elif actual_size > expected_size: + # Warn if there's extra data (might indicate file mismatch) + warnings.warn( + f"BED file contains {actual_size} bytes but only expected " + f"{expected_size}. " + f"Using first {expected_size} bytes only.", + stacklevel=1, + ) + + # Initialize the lookup table with shape (256, 4, 2) + # 256 possible byte values, 4 samples per byte, 2 alleles per sample + lookup = np.zeros((256, 4, 2), dtype=np.int8) + + # For each possible byte value (0-255) + for byte in range(256): + # For each of the 4 samples encoded in this byte + for sample in range(4): + # Extract the 2 bits for this sample + bits = (byte >> (sample * 2)) & 0b11 + # Convert PLINK's bit encoding to genotype values + if bits == 0b00: + lookup[byte, sample] = [1, 1] + elif bits == 0b01: + lookup[byte, sample] = [-1, -1] + elif bits == 0b10: + lookup[byte, sample] = [0, 1] + elif bits == 0b11: + lookup[byte, sample] = [0, 0] + + self.byte_lookup = lookup @property def path(self): @@ -44,15 +144,15 @@ def path(self): @property def num_records(self): - return self.bed.sid_count + return self.bim.vid_count @property def samples(self): - return [vcz.Sample(id=sample) for sample in self.bed.iid] + return [vcz.Sample(id=sample) for sample in self.fam.sid] @property def contigs(self): - return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)] + return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bim.chromosome)] @property def num_samples(self): @@ -60,43 +160,65 @@ def num_samples(self): def iter_contig(self, start, stop): chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)} - for chrom in self.bed.chromosome[start:stop]: + for chrom in self.bim.chromosome[start:stop]: yield chrom_to_contig_index[str(chrom)] def iter_field(self, field_name, shape, start, stop): assert field_name == "position" # Only position field is supported from plink - yield from self.bed.bp_position[start:stop] + yield from self.bim.bp_position[start:stop] def iter_id(self, start, stop): - yield from self.bed.sid[start:stop] + yield from self.bim.vid[start:stop] def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): - alt_field = self.bed.allele_1 - ref_field = self.bed.allele_2 - bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T - gt = np.zeros(shape, dtype=np.int8) - phased = np.zeros(shape[:-1], dtype=bool) + alt_field = self.bim.allele_1 + ref_field = self.bim.allele_2 + + chunk_size = stop - start + + # Calculate file offsets for the required data + # 3 bytes for the magic number at the beginning of the file + start_offset = 3 + (start * self.bytes_per_snp) + bytes_to_read = chunk_size * self.bytes_per_snp + + # Read only the needed portion of the BED file + with open(self.paths.bed_path, "rb") as f: + f.seek(start_offset) + chunk_data = f.read(bytes_to_read) + + data_bytes = np.frombuffer(chunk_data, dtype=np.uint8) + data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_snp) + + # Apply lookup table to get genotypes + # Shape becomes: (chunk_size, bytes_per_snp, 4, 2) + all_genotypes = self.byte_lookup[data_matrix] + + # Reshape to get all samples in one dimension + # (chunk_size, bytes_per_snp*4, 2) + samples_padded = self.bytes_per_snp * 4 + genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2) + + gt = genotypes_reshaped[:, : self.n_samples] + + phased = np.zeros((chunk_size, self.n_samples), dtype=bool) + for i, (ref, alt) in enumerate( zip(ref_field[start:stop], alt_field[start:stop]) ): alleles = np.full(num_alleles, constants.STR_FILL, dtype="O") alleles[0] = ref alleles[1 : 1 + len(alt)] = alt - gt[:] = 0 - gt[bed_chunk[i] == -127] = -1 - gt[bed_chunk[i] == 2] = 1 - gt[bed_chunk[i] == 1, 1] = 1 # rlen is the length of the REF in PLINK as there's no END annotations - yield vcz.VariantData(len(alleles[0]), alleles, gt, phased) + yield vcz.VariantData(len(alleles[0]), alleles, gt[i], phased[i]) def generate_schema( self, variants_chunk_size=None, samples_chunk_size=None, ): - n = self.bed.iid_count - m = self.bed.sid_count + n = self.fam.sid_count + m = self.bim.vid_count logging.info(f"Scanned plink with {n} samples and {m} variants") dimensions = vcz.standard_dimensions( variants_size=m, @@ -119,7 +241,7 @@ def generate_schema( ) # If we don't have SVLEN or END annotations, the rlen field is defined # as the length of the REF - max_len = self.bed.allele_2.itemsize + max_len = self.bim.allele_2.itemsize array_specs = [ vcz.ZarrArraySpec( @@ -156,7 +278,7 @@ def generate_schema( ), vcz.ZarrArraySpec( name="variant_contig", - dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))), + dtype=core.min_int_dtype(0, len(np.unique(self.bim.chromosome))), dimensions=["variants"], description="Contig/chromosome index for each variant", ), diff --git a/tests/test_plink.py b/tests/test_plink.py index 661b62f6..f116020a 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -1,5 +1,3 @@ -from unittest import mock - import bed_reader import numpy as np import numpy.testing as nt @@ -11,22 +9,6 @@ from bio2zarr import plink, vcf -def test_missing_dependency(): - with mock.patch( - "importlib.import_module", - side_effect=ImportError("No module named 'bed_reader'"), - ): - with pytest.raises(ImportError) as exc_info: - plink.convert( - "UNUSED_PATH", - "UNUSED_PATH", - ) - assert ( - "This process requires the optional bed_reader module. " - "Install it with: pip install bio2zarr[plink]" in str(exc_info.value) - ) - - class TestSmallExample: @pytest.fixture(scope="class") def bed_path(self, tmp_path_factory): From 81443fe5fc7358390b909335eddd080574c47e4e Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 22 May 2025 20:38:23 +0100 Subject: [PATCH 2/7] Remove bed_reader/plink from CI tests --- .github/workflows/ci.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 911489d2..da2c4d98 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -101,16 +101,6 @@ jobs: python -m bio2zarr tskit2zarr convert tests/data/ts/example.trees ts.vcz deactivate - python -m venv env-plink - source env-plink/bin/activate - python -m pip install . - python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz > plink.txt 2>&1 || echo $? > plink_exit.txt - test "$(cat plink_exit.txt)" = "1" - grep -q "This process requires the optional bed_reader module. Install it with: pip install bio2zarr\[plink\]" plink.txt - python -m pip install '.[plink]' - python -m bio2zarr plink2zarr convert tests/data/plink/example plink.vcz - deactivate - python -m venv env-vcf source env-vcf/bin/activate python -m pip install . From 7fe383a7a532ad58a4c7a7de68d428d41c4e0a91 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 22 May 2025 21:07:23 +0100 Subject: [PATCH 3/7] Add read_bim function based on sgkit --- bio2zarr/plink.py | 108 ++++++++++++++++++++++---------------------- tests/test_plink.py | 15 ++++++ 2 files changed, 69 insertions(+), 54 deletions(-) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index d6f5e438..5effa162 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -5,12 +5,54 @@ import warnings import numpy as np +import pandas as pd from bio2zarr import constants, core, vcz logger = logging.getLogger(__name__) +FAM_FIELDS = [ + ("family_id", str, "U"), + ("member_id", str, "U"), + ("paternal_id", str, "U"), + ("maternal_id", str, "U"), + ("sex", str, "int8"), + ("phenotype", str, "int8"), +] +FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS]) +FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS]) + +BIM_FIELDS = [ + ("contig", str, "U"), + ("variant_id", str, "U"), + ("cm_position", "float32", "float32"), + ("position", "int32", "int32"), + ("allele_1", str, "S"), + ("allele_2", str, "S"), +] +BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS]) +BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS]) + + +def read_fam(path, sep=None): + if sep is None: + sep = " " + # See: https://www.cog-genomics.org/plink/1.9/formats#fam + names = [f[0] for f in FAM_FIELDS] + return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE) + + +def read_bim(path, sep=None): + if sep is None: + sep = "\t" + # See: https://www.cog-genomics.org/plink/1.9/formats#bim + names = [f[0] for f in BIM_FIELDS] + df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE) + # df["contig"] = df["contig"].where(df["contig"] != "0", None) + return df + + @dataclasses.dataclass class PlinkPaths: bed_path: str @@ -24,16 +66,6 @@ class FamData: sid_count: int -@dataclasses.dataclass -class BimData: - chromosome: np.ndarray - vid: np.ndarray - bp_position: np.ndarray - allele_1: np.ndarray - allele_2: np.ndarray - vid_count: int - - class PlinkFormat(vcz.Source): def __init__(self, prefix): # TODO we will need support multiple chromosomes here to join @@ -56,40 +88,8 @@ def __init__(self, prefix): self.fam = FamData(sid=np.array(samples), sid_count=len(samples)) self.n_samples = len(samples) - # Read variant information from .bim file - chromosomes = [] - vids = [] - positions = [] - allele1 = [] - allele2 = [] - - with open(self.paths.bim_path) as f: - for line in f: - fields = line.strip().split() - if len(fields) >= 6: - chrom, vid, _, pos, a1, a2 = ( - fields[0], - fields[1], - fields[2], - fields[3], - fields[4], - fields[5], - ) - chromosomes.append(chrom) - vids.append(vid) - positions.append(int(pos)) - allele1.append(a1) - allele2.append(a2) - - self.bim = BimData( - chromosome=np.array(chromosomes), - vid=np.array(vids), - bp_position=np.array(positions), - allele_1=np.array(allele1), - allele_2=np.array(allele2), - vid_count=len(vids), - ) - self.n_variants = len(vids) + self.bim = read_bim(self.paths.bim_path) + self.n_variants = self.bim.shape[0] # Calculate bytes per SNP: 1 byte per 4 samples, rounded up self.bytes_per_snp = (self.n_samples + 3) // 4 @@ -144,7 +144,7 @@ def path(self): @property def num_records(self): - return self.bim.vid_count + return self.n_variants @property def samples(self): @@ -152,7 +152,7 @@ def samples(self): @property def contigs(self): - return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bim.chromosome)] + return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()] @property def num_samples(self): @@ -160,19 +160,19 @@ def num_samples(self): def iter_contig(self, start, stop): chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)} - for chrom in self.bim.chromosome[start:stop]: + for chrom in self.bim.contig[start:stop]: yield chrom_to_contig_index[str(chrom)] def iter_field(self, field_name, shape, start, stop): assert field_name == "position" # Only position field is supported from plink - yield from self.bim.bp_position[start:stop] + yield from self.bim.position[start:stop] def iter_id(self, start, stop): - yield from self.bim.vid[start:stop] + yield from self.bim.variant_id[start:stop] def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): - alt_field = self.bim.allele_1 - ref_field = self.bim.allele_2 + alt_field = self.bim.allele_1.values + ref_field = self.bim.allele_2.values chunk_size = stop - start @@ -218,7 +218,7 @@ def generate_schema( samples_chunk_size=None, ): n = self.fam.sid_count - m = self.bim.vid_count + m = self.num_records logging.info(f"Scanned plink with {n} samples and {m} variants") dimensions = vcz.standard_dimensions( variants_size=m, @@ -241,7 +241,7 @@ def generate_schema( ) # If we don't have SVLEN or END annotations, the rlen field is defined # as the length of the REF - max_len = self.bim.allele_2.itemsize + max_len = self.bim.allele_2.values.itemsize array_specs = [ vcz.ZarrArraySpec( @@ -278,7 +278,7 @@ def generate_schema( ), vcz.ZarrArraySpec( name="variant_contig", - dtype=core.min_int_dtype(0, len(np.unique(self.bim.chromosome))), + dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))), dimensions=["variants"], description="Contig/chromosome index for each variant", ), diff --git a/tests/test_plink.py b/tests/test_plink.py index f116020a..e3aa425e 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -9,6 +9,21 @@ from bio2zarr import plink, vcf +class TestReadBim: + def test_example(self): + path = "tests/data/plink/example.bim" + df = plink.read_bim(path) + # chrom, vid, cm_pos, pos, a1, a2 + # 1 1_10 0 10 A GG + # 1 1_20 0 20 TTT C + nt.assert_array_equal(df["contig"].values, ["1", "1"]) + nt.assert_array_equal(df["variant_id"].values, ["1_10", "1_20"]) + nt.assert_array_equal(df["cm_position"].values, [0, 0]) + nt.assert_array_equal(df["position"].values, [10, 20]) + nt.assert_array_equal(df["allele_1"].values, ["A", "TTT"]) + nt.assert_array_equal(df["allele_2"].values, ["GG", "C"]) + + class TestSmallExample: @pytest.fixture(scope="class") def bed_path(self, tmp_path_factory): From ac9f1d942ab22a0847dd8a73db3fc3d8547a651e Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 22 May 2025 21:29:01 +0100 Subject: [PATCH 4/7] Use the pandas-based FAM reader --- bio2zarr/plink.py | 47 +++++++++++++++++---------------------------- tests/test_plink.py | 25 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 5effa162..347ada0e 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -14,7 +14,7 @@ FAM_FIELDS = [ ("family_id", str, "U"), - ("member_id", str, "U"), + ("individual_id", str, "U"), ("paternal_id", str, "U"), ("maternal_id", str, "U"), ("sex", str, "int8"), @@ -36,20 +36,16 @@ def read_fam(path, sep=None): - if sep is None: - sep = " " # See: https://www.cog-genomics.org/plink/1.9/formats#fam names = [f[0] for f in FAM_FIELDS] - return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE) + df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE) + return df def read_bim(path, sep=None): - if sep is None: - sep = "\t" # See: https://www.cog-genomics.org/plink/1.9/formats#bim names = [f[0] for f in BIM_FIELDS] df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE) - # df["contig"] = df["contig"].where(df["contig"] != "0", None) return df @@ -78,28 +74,21 @@ def __init__(self, prefix): self.prefix + ".fam", ) - # Read sample information from .fam file - samples = [] - with open(self.paths.fam_path) as f: - for line in f: - fields = line.strip().split() - if len(fields) >= 2: # At minimum, we need FID and IID - samples.append(fields[1]) - self.fam = FamData(sid=np.array(samples), sid_count=len(samples)) - self.n_samples = len(samples) - self.bim = read_bim(self.paths.bim_path) - self.n_variants = self.bim.shape[0] + self.fam = read_fam(self.paths.fam_path) + + self._num_records = self.bim.shape[0] + self._num_samples = self.fam.shape[0] # Calculate bytes per SNP: 1 byte per 4 samples, rounded up - self.bytes_per_snp = (self.n_samples + 3) // 4 + self.bytes_per_snp = (self._num_samples + 3) // 4 # Verify BED file has correct magic bytes with open(self.paths.bed_path, "rb") as f: magic = f.read(3) assert magic == b"\x6c\x1b\x01", "Invalid BED file format" - expected_size = self.n_variants * self.bytes_per_snp + 3 # +3 for magic bytes + expected_size = self.num_records * self.bytes_per_snp + 3 # +3 for magic bytes actual_size = os.path.getsize(self.paths.bed_path) if actual_size < expected_size: raise ValueError( @@ -144,20 +133,20 @@ def path(self): @property def num_records(self): - return self.n_variants + return self._num_records + + @property + def num_samples(self): + return self._num_samples @property def samples(self): - return [vcz.Sample(id=sample) for sample in self.fam.sid] + return [vcz.Sample(id=iid) for iid in self.fam.individual_id] @property def contigs(self): return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()] - @property - def num_samples(self): - return len(self.samples) - def iter_contig(self, start, stop): chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)} for chrom in self.bim.contig[start:stop]: @@ -198,9 +187,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): samples_padded = self.bytes_per_snp * 4 genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2) - gt = genotypes_reshaped[:, : self.n_samples] + gt = genotypes_reshaped[:, : self._num_samples] - phased = np.zeros((chunk_size, self.n_samples), dtype=bool) + phased = np.zeros((chunk_size, self._num_samples), dtype=bool) for i, (ref, alt) in enumerate( zip(ref_field[start:stop], alt_field[start:stop]) @@ -217,7 +206,7 @@ def generate_schema( variants_chunk_size=None, samples_chunk_size=None, ): - n = self.fam.sid_count + n = self.num_samples m = self.num_records logging.info(f"Scanned plink with {n} samples and {m} variants") dimensions = vcz.standard_dimensions( diff --git a/tests/test_plink.py b/tests/test_plink.py index e3aa425e..4e8b5ce5 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -24,6 +24,31 @@ def test_example(self): nt.assert_array_equal(df["allele_2"].values, ["GG", "C"]) +class TestReadFam: + def test_example(self): + path = "tests/data/plink/example.fam" + df = plink.read_fam(path) + # FID IID FATHER MOTHER SEX PHENOTYPE + # ind0 ind0 0 0 0 -9 + # ind1 ind1 0 0 0 -9 + # ind2 ind2 0 0 0 -9 + # ind3 ind3 0 0 0 -9 + # ind4 ind4 0 0 0 -9 + # ind5 ind5 0 0 0 -9 + # ind6 ind6 0 0 0 -9 + # ind7 ind7 0 0 0 -9 + # ind8 ind8 0 0 0 -9 + # ind9 ind9 0 0 0 -9 + nt.assert_array_equal(df["family_id"].values, [f"ind{j}" for j in range(10)]) + nt.assert_array_equal( + df["individual_id"].values, [f"ind{j}" for j in range(10)] + ) + nt.assert_array_equal(df["paternal_id"].values, ["0" for j in range(10)]) + nt.assert_array_equal(df["maternal_id"].values, ["0" for j in range(10)]) + nt.assert_array_equal(df["sex"].values, ["0" for j in range(10)]) + nt.assert_array_equal(df["phenotype"].values, ["-9" for j in range(10)]) + + class TestSmallExample: @pytest.fixture(scope="class") def bed_path(self, tmp_path_factory): From bf56b86fa03e3cb5487f30a3f932b7320b94909a Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 22 May 2025 21:47:26 +0100 Subject: [PATCH 5/7] Separate out the BedReader code --- bio2zarr/plink.py | 139 +++++++++++++++++++++----------------------- tests/test_plink.py | 13 +++++ 2 files changed, 78 insertions(+), 74 deletions(-) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 347ada0e..382489fb 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -1,8 +1,6 @@ import dataclasses import logging -import os import pathlib -import warnings import numpy as np import pandas as pd @@ -56,54 +54,21 @@ class PlinkPaths: fam_path: str -@dataclasses.dataclass -class FamData: - sid: np.ndarray - sid_count: int - - -class PlinkFormat(vcz.Source): - def __init__(self, prefix): - # TODO we will need support multiple chromosomes here to join - # plinks into on big zarr. So, these will require multiple - # bed and bim files, but should share a .fam - self.prefix = str(prefix) - self.paths = PlinkPaths( - self.prefix + ".bed", - self.prefix + ".bim", - self.prefix + ".fam", - ) - - self.bim = read_bim(self.paths.bim_path) - self.fam = read_fam(self.paths.fam_path) - - self._num_records = self.bim.shape[0] - self._num_samples = self.fam.shape[0] - - # Calculate bytes per SNP: 1 byte per 4 samples, rounded up - self.bytes_per_snp = (self._num_samples + 3) // 4 +class BedReader: + def __init__(self, path, num_variants, num_samples): + self.num_variants = num_variants + self.num_samples = num_samples + self.path = path + # bytes per variant: 1 byte per 4 samples, rounded up + self.bytes_per_variant = (self.num_samples + 3) // 4 - # Verify BED file has correct magic bytes - with open(self.paths.bed_path, "rb") as f: + with open(self.path, "rb") as f: magic = f.read(3) - assert magic == b"\x6c\x1b\x01", "Invalid BED file format" - - expected_size = self.num_records * self.bytes_per_snp + 3 # +3 for magic bytes - actual_size = os.path.getsize(self.paths.bed_path) - if actual_size < expected_size: - raise ValueError( - f"BED file is truncated: expected at least {expected_size} bytes, " - f"but only found {actual_size} bytes. " - f"Check that .bed, .bim, and .fam files match." - ) - elif actual_size > expected_size: - # Warn if there's extra data (might indicate file mismatch) - warnings.warn( - f"BED file contains {actual_size} bytes but only expected " - f"{expected_size}. " - f"Using first {expected_size} bytes only.", - stacklevel=1, - ) + if magic != b"\x6c\x1b\x01": + raise ValueError("Invalid BED file magic bytes") + + # We could check the size of the bed file here, but that would + # mean we can't work with streams. # Initialize the lookup table with shape (256, 4, 2) # 256 possible byte values, 4 samples per byte, 2 alleles per sample @@ -127,6 +92,56 @@ def __init__(self, prefix): self.byte_lookup = lookup + def decode(self, start, stop): + chunk_size = stop - start + + # Calculate file offsets for the required data + # 3 bytes for the magic number at the beginning of the file + start_offset = 3 + (start * self.bytes_per_variant) + bytes_to_read = chunk_size * self.bytes_per_variant + + # TODO make it possible to read sequentially from the same file handle, + # seeking only when necessary. + with open(self.path, "rb") as f: + f.seek(start_offset) + chunk_data = f.read(bytes_to_read) + + data_bytes = np.frombuffer(chunk_data, dtype=np.uint8) + data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_variant) + + # Apply lookup table to get genotypes + # Shape becomes: (chunk_size, bytes_per_variant, 4, 2) + all_genotypes = self.byte_lookup[data_matrix] + + # Reshape to get all samples in one dimension + # (chunk_size, bytes_per_variant*4, 2) + samples_padded = self.bytes_per_variant * 4 + genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2) + + return genotypes_reshaped[:, : self.num_samples] + + +class PlinkFormat(vcz.Source): + def __init__(self, prefix): + # TODO we will need support multiple chromosomes here to join + # plinks into on big zarr. So, these will require multiple + # bed and bim files, but should share a .fam + self.prefix = str(prefix) + self.paths = PlinkPaths( + self.prefix + ".bed", + self.prefix + ".bim", + self.prefix + ".fam", + ) + + self.bim = read_bim(self.paths.bim_path) + self.fam = read_fam(self.paths.fam_path) + + self._num_records = self.bim.shape[0] + self._num_samples = self.fam.shape[0] + self.bed_reader = BedReader( + self.paths.bed_path, self.num_records, self.num_samples + ) + @property def path(self): return self.prefix @@ -163,33 +178,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): alt_field = self.bim.allele_1.values ref_field = self.bim.allele_2.values - chunk_size = stop - start - - # Calculate file offsets for the required data - # 3 bytes for the magic number at the beginning of the file - start_offset = 3 + (start * self.bytes_per_snp) - bytes_to_read = chunk_size * self.bytes_per_snp - - # Read only the needed portion of the BED file - with open(self.paths.bed_path, "rb") as f: - f.seek(start_offset) - chunk_data = f.read(bytes_to_read) - - data_bytes = np.frombuffer(chunk_data, dtype=np.uint8) - data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_snp) - - # Apply lookup table to get genotypes - # Shape becomes: (chunk_size, bytes_per_snp, 4, 2) - all_genotypes = self.byte_lookup[data_matrix] - - # Reshape to get all samples in one dimension - # (chunk_size, bytes_per_snp*4, 2) - samples_padded = self.bytes_per_snp * 4 - genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2) - - gt = genotypes_reshaped[:, : self._num_samples] + gt = self.bed_reader.decode(start, stop) - phased = np.zeros((chunk_size, self._num_samples), dtype=bool) + phased = np.zeros(gt.shape[:2], dtype=bool) for i, (ref, alt) in enumerate( zip(ref_field[start:stop], alt_field[start:stop]) diff --git a/tests/test_plink.py b/tests/test_plink.py index 4e8b5ce5..9b63ad0f 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -49,6 +49,19 @@ def test_example(self): nt.assert_array_equal(df["phenotype"].values, ["-9" for j in range(10)]) +class TestBedReader: + @pytest.mark.parametrize( + "path", + [ + "tests/data/plink/example.bim", + "tests/data/vcf/sample.vcf.gz", + ], + ) + def test_bad_file_type(self, path): + with pytest.raises(ValueError, match="Invalid BED file magic bytes"): + plink.BedReader(path, 1, 1) + + class TestSmallExample: @pytest.fixture(scope="class") def bed_path(self, tmp_path_factory): From 766dbd9c731cfa3cfa442afdc382bfdf1039c9b0 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 22 May 2025 21:50:06 +0100 Subject: [PATCH 6/7] Remove bed_reader dep and add pandas --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17898968..4831cf35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,8 @@ dependencies = [ # cyvcf2 also pulls in coloredlogs and click "coloredlogs", "click", + # Using pandas for reading plink files, but will be useful more generally + "pandas" ] requires-python = ">=3.10" classifiers = [ @@ -68,11 +70,9 @@ dev = [ ] # TODO Using dev version of tskit for CI, FIXME before release tskit = ["tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python"] -plink = ["bed_reader"] vcf = ["cyvcf2"] all = [ "tskit @ git+https://github.com/tskit-dev/tskit.git@main#subdirectory=python", - "bed_reader", "cyvcf2" ] From 08ceb749fb4b13ba11f5da3612a4e30e9b7c5cf6 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 22 May 2025 22:20:12 +0100 Subject: [PATCH 7/7] Test on generated bed files written by bed_reader --- bio2zarr/plink.py | 7 ++----- tests/test_plink.py | 47 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 382489fb..6ae2b873 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -62,6 +62,8 @@ def __init__(self, path, num_variants, num_samples): # bytes per variant: 1 byte per 4 samples, rounded up self.bytes_per_variant = (self.num_samples + 3) // 4 + # TODO open this as a persistent file and support reading from a + # stream with open(self.path, "rb") as f: magic = f.read(3) if magic != b"\x6c\x1b\x01": @@ -132,10 +134,8 @@ def __init__(self, prefix): self.prefix + ".bim", self.prefix + ".fam", ) - self.bim = read_bim(self.paths.bim_path) self.fam = read_fam(self.paths.fam_path) - self._num_records = self.bim.shape[0] self._num_samples = self.fam.shape[0] self.bed_reader = BedReader( @@ -177,11 +177,8 @@ def iter_id(self, start, stop): def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): alt_field = self.bim.allele_1.values ref_field = self.bim.allele_2.values - gt = self.bed_reader.decode(start, stop) - phased = np.zeros(gt.shape[:2], dtype=bool) - for i, (ref, alt) in enumerate( zip(ref_field[start:stop], alt_field[start:stop]) ): diff --git a/tests/test_plink.py b/tests/test_plink.py index 9b63ad0f..0f863232 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -1,3 +1,5 @@ +import os.path + import bed_reader import numpy as np import numpy.testing as nt @@ -61,6 +63,51 @@ def test_bad_file_type(self, path): with pytest.raises(ValueError, match="Invalid BED file magic bytes"): plink.BedReader(path, 1, 1) + @pytest.mark.parametrize( + ("num_variants", "num_samples"), + [ + (1, 1), + (1, 2), + (1, 3), + (1, 4), + (1, 5), + (1, 6), + (1, 7), + (1, 8), + (1, 9), + (2, 1), + (3, 1), + (10, 1), + (100, 1), + (10, 2), + (10, 3), + (10, 4), + (10, 5), + (20, 20), + (30, 3), + ], + ) + def test_generated_bed_files(self, tmp_path, num_variants, num_samples): + bed_file = tmp_path / "a_file.bed" + # Generate a regular pattern of all possible values + data = np.arange(num_variants * num_samples, dtype=int) % 4 + data[data == 3] = -127 + data = data.reshape((num_variants, num_samples)) + + bed_reader.to_bed(bed_file, data.T, num_threads=1) + + bytes_per_variant = (num_samples + 3) // 4 + expected_size = 3 + bytes_per_variant * num_variants + assert os.path.getsize(bed_file) == expected_size + + br_map = {0: (0, 0), 1: (0, 1), 2: (1, 1), -127: (-1, -1)} + reader = plink.BedReader(bed_file, num_variants, num_samples) + g = reader.decode(0, num_variants) + assert g.shape == (num_variants, num_samples, 2) + for j in range(num_variants): + for k in range(num_samples): + assert br_map[data[j, k]] == tuple(g[j, k]) + class TestSmallExample: @pytest.fixture(scope="class")