Add prefix argument and variant_id to plink conversion

jeromekelleher · jeromekelleher · commit bf958f641c3d · 2025-05-20T17:01:52.000+01:00
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -1,3 +1,4 @@
+import dataclasses
 import logging
 import pathlib
 
@@ -9,17 +10,39 @@
 logger = logging.getLogger(__name__)
 
 
+@dataclasses.dataclass
+class PlinkPaths:
+    bed_path: pathlib.Path
+    bim_path: pathlib.Path
+    fam_path: pathlib.Path
+
+
 class PlinkFormat(vcz.Source):
     @core.requires_optional_dependency("bed_reader", "plink")
-    def __init__(self, path):
+    def __init__(self, prefix):
         import bed_reader
 
-        self._path = pathlib.Path(path)
-        self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
+        # TODO we will need to support multiple chromosomes here to join
+        # plinks into one big zarr. So, these will require multiple
+        # bed and bim files, but should share a .fam
+        self.prefix = pathlib.Path(prefix)
+        paths = PlinkPaths(
+            self.prefix.with_suffix(".bed"),
+            self.prefix.with_suffix(".bim"),
+            self.prefix.with_suffix(".fam"),
+        )
+
+        self.bed = bed_reader.open_bed(
+            paths.bed_path,
+            bim_location=paths.bim_path,
+            fam_location=paths.fam_path,
+            num_threads=1,
+            count_A1=False,
+        )
 
     @property
     def path(self):
-        return self._path
+        return self.prefix
 
     @property
     def num_records(self):
@@ -46,6 +69,9 @@ def iter_field(self, field_name, shape, start, stop):
         assert field_name == "position"  # Only position field is supported from plink
         yield from self.bed.bp_position[start:stop]
 
+    def iter_id(self, start, stop):
+        yield from self.bed.sid[start:stop]
+
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
         ref_field = self.bed.allele_1
         alt_field = self.bed.allele_2
@@ -107,6 +133,18 @@ def generate_schema(
                 dimensions=["variants", "alleles"],
                 description=None,
             ),
+            vcz.ZarrArraySpec(
+                name="variant_id",
+                dtype="O",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id_mask",
+                dtype="bool",
+                dimensions=["variants"],
+                description=None,
+            ),
             vcz.ZarrArraySpec(
                 source=None,
                 name="variant_length",
@@ -147,20 +185,20 @@ def generate_schema(
 
 
 def convert(
-    bed_path,
-    zarr_path,
+    prefix,
+    out,
     *,
     variants_chunk_size=None,
     samples_chunk_size=None,
     worker_processes=1,
     show_progress=False,
 ):
-    plink_format = PlinkFormat(bed_path)
+    plink_format = PlinkFormat(prefix)
     schema_instance = plink_format.generate_schema(
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
     )
-    zarr_path = pathlib.Path(zarr_path)
+    zarr_path = pathlib.Path(out)
     vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path)
     # Rough heuristic to split work up enough to keep utilisation high
     target_num_partitions = max(1, worker_processes * 4)
diff --git a/tests/test_plink.py b/tests/test_plink.py
@@ -79,15 +79,43 @@ def test_simulated_example(self, tmp_path):
         bed_path = data_path + "plink_sim_10s_100v_10pmiss.bed"
         fam_path = data_path + "plink_sim_10s_100v_10pmiss.fam"
         bim_path = data_path + "plink_sim_10s_100v_10pmiss.bim"
-        # print(bed_path)
-        # print(fam_path)
         sg_ds = sgkit.io.plink.read_plink(
             bed_path=bed_path, fam_path=fam_path, bim_path=bim_path
         )
         out = tmp_path / "example.plink.zarr"
-        plink.convert(bed_path, out)
+        plink.convert(prefix=data_path + "/plink_sim_10s_100v_10pmiss", out=out)
         ds = sg.load_dataset(out)
         nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values)
+        nt.assert_array_equal(
+            ds.call_genotype_mask.values, sg_ds.call_genotype_mask.values
+        )
+        # sgkit doesn't have phased
+        nt.assert_array_equal(ds.variant_position.values, sg_ds.variant_position.values)
+        nt.assert_array_equal(
+            ds.variant_allele.values, sg_ds.variant_allele.values.astype("U")
+        )
+        nt.assert_array_equal(ds.variant_contig.values, sg_ds.variant_contig.values)
+        nt.assert_array_equal(ds.variant_id.values, sg_ds.variant_id.values)
+        # print(sg_ds.variant_id.values)
+
+        # Can't compare to sgkit because of
+        # https://github.com/sgkit-dev/sgkit/issues/1314
+        nt.assert_array_equal(
+            ds.sample_id.values,
+            [
+                "000",
+                "001",
+                "002",
+                "003",
+                "004",
+                "005",
+                "006",
+                "007",
+                "008",
+                "009",
+            ],
+        )
+        # We don't do the additional sample_ fields yet
 
 
 class TestExample: