sgkit-dev · jeromekelleher · May 14, 2025 · May 13, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 - Add contigs to plink output (#344)
 
+- Add variant_length and indexing to plink output (#382)
+
 Breaking changes
 
 - Remove explicit sample, contig and filter lists from the schema.

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -63,7 +63,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
             gt[bed_chunk[i] == 2] = 1
             gt[bed_chunk[i] == 1, 0] = 1
 
-            yield alleles, (gt, phased)
+            yield vcz.VariantData(max(len(a) for a in alleles), alleles, gt, phased)
 
     def generate_schema(
         self,
@@ -112,6 +112,13 @@ def generate_schema(
                 dimensions=["variants", "alleles"],
                 description=None,
             ),
+            vcz.ZarrArraySpec(
+                source=None,
+                name="variant_length",
+                dtype="i4",
+                dimensions=["variants"],
+                description="Length of each variant",
+            ),
             vcz.ZarrArraySpec(
                 name="variant_contig",
                 dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),

diff --git a/bio2zarr/tskit.py b/bio2zarr/tskit.py
@@ -97,21 +97,23 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
             left=self.positions[start],
             right=self.positions[stop] if stop < self.num_records else None,
             samples=self.tskit_samples,
+            copy=False,
         ):
             gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
             alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            variant_length = 0
             for i, allele in enumerate(variant.alleles):
                 # None is returned by tskit in the case of a missing allele
                 if allele is None:
                     continue
                 assert i < num_alleles
                 alleles[i] = allele
-
+                variant_length = max(variant_length, len(allele))
             gt[self.sample_indices, self.ploidy_indices] = variant.genotypes[
                 self.genotype_indices
             ]
 
-            yield alleles, (gt, phased)
+            yield vcz.VariantData(variant_length, alleles, gt, phased)
 
     def generate_schema(
         self,
@@ -164,6 +166,16 @@ def generate_schema(
             min_position = np.min(self.ts.sites_position)
             max_position = np.max(self.ts.sites_position)
 
+        tables = self.ts.tables
+        ancestral_state_offsets = tables.sites.ancestral_state_offset
+        derived_state_offsets = tables.mutations.derived_state_offset
+        ancestral_lengths = ancestral_state_offsets[1:] - ancestral_state_offsets[:-1]
+        derived_lengths = derived_state_offsets[1:] - derived_state_offsets[:-1]
+        max_variant_length = max(
+            np.max(ancestral_lengths) if len(ancestral_lengths) > 0 else 0,
+            np.max(derived_lengths) if len(derived_lengths) > 0 else 0,
+        )
+
         array_specs = [
             vcz.ZarrArraySpec(
                 source="position",
@@ -179,6 +191,13 @@ def generate_schema(
                 dimensions=["variants", "alleles"],
                 description="Alleles for each variant",
             ),
+            vcz.ZarrArraySpec(
+                source=None,
+                name="variant_length",
+                dtype=core.min_int_dtype(0, max_variant_length),
+                dimensions=["variants"],
+                description="Length of each variant",
+            ),
             vcz.ZarrArraySpec(
                 source=None,
                 name="variant_contig",

diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -1040,14 +1040,19 @@ def iter_genotypes(self, shape, start, stop):
             yield sanitised_genotypes, sanitised_phased
 
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
         if self.gt_field is None or shape is None:
-            for alleles in self.iter_alleles(start, stop, num_alleles):
-                yield alleles, (None, None)
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                yield vcz.VariantData(variant_length, alleles, None, None)
         else:
-            yield from zip(
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
                 self.iter_alleles(start, stop, num_alleles),
                 self.iter_genotypes(shape, start, stop),
-            )
+            ):
+                yield vcz.VariantData(variant_length, alleles, gt, phased)
 
     def generate_schema(
         self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
@@ -1121,6 +1126,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
                 compressor=compressor,
             )
 
+        name_map = {field.full_name: field for field in self.metadata.fields}
         array_specs = [
             fixed_field_spec(
                 name="variant_contig",
@@ -1136,6 +1142,11 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
                 dtype="O",
                 dimensions=["variants", "alleles"],
             ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
             fixed_field_spec(
                 name="variant_id",
                 dtype="O",
@@ -1145,14 +1156,12 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
                 dtype="bool",
             ),
         ]
-        name_map = {field.full_name: field for field in self.metadata.fields}
 
-        # Only three of the fixed fields have a direct one-to-one mapping.
+        # Only two of the fixed fields have a direct one-to-one mapping.
         array_specs.extend(
             [
                 spec_from_field(name_map["QUAL"], array_name="variant_quality"),
                 spec_from_field(name_map["POS"], array_name="variant_position"),
-                spec_from_field(name_map["rlen"], array_name="variant_length"),
             ]
         )
         array_specs.extend(

diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py
@@ -37,6 +37,16 @@
 }
 
 
+@dataclasses.dataclass
+class VariantData:
+    """Represents variant data returned by iter_alleles_and_genotypes."""
+
+    variant_length: int
+    alleles: np.ndarray
+    genotypes: np.ndarray
+    phased: np.ndarray
+
+
 class Source(abc.ABC):
     @property
     @abc.abstractmethod
@@ -794,6 +804,7 @@ def encode_array_partition(self, array_spec, partition_index):
     def encode_alleles_and_genotypes_partition(self, partition_index):
         partition = self.metadata.partitions[partition_index]
         alleles = self.init_partition_array(partition_index, "variant_allele")
+        variant_lengths = self.init_partition_array(partition_index, "variant_length")
         has_gt = self.has_genotypes()
         shape = None
         if has_gt:
@@ -802,18 +813,21 @@ def encode_alleles_and_genotypes_partition(self, partition_index):
                 partition_index, "call_genotype_phased"
             )
             shape = gt.buff.shape[1:]
-        for alleles_value, (genotype, phased) in self.source.iter_alleles_and_genotypes(
+        for variant_data in self.source.iter_alleles_and_genotypes(
             partition.start, partition.stop, shape, alleles.array.shape[1]
         ):
             j_alleles = alleles.next_buffer_row()
-            alleles.buff[j_alleles] = alleles_value
+            alleles.buff[j_alleles] = variant_data.alleles
+            j_variant_length = variant_lengths.next_buffer_row()
+            variant_lengths.buff[j_variant_length] = variant_data.variant_length
             if has_gt:
                 j = gt.next_buffer_row()
-                gt.buff[j] = genotype
+                gt.buff[j] = variant_data.genotypes
                 j_phased = gt_phased.next_buffer_row()
-                gt_phased.buff[j_phased] = phased
+                gt_phased.buff[j_phased] = variant_data.phased
 
         self.finalise_partition_array(partition_index, alleles)
+        self.finalise_partition_array(partition_index, variant_lengths)
         if has_gt:
             self.finalise_partition_array(partition_index, gt)
             self.finalise_partition_array(partition_index, gt_phased)
@@ -1103,14 +1117,16 @@ def __init__(self, path):
     def create_index(self):
         """Create an index to support efficient region queries."""
         root = zarr.open_group(store=self.path, mode="r+")
-
+        print(list(root.keys()))
         if (
             "variant_contig" not in root
             or "variant_position" not in root
             or "variant_length" not in root
         ):
-            logger.warning("Cannot create index: required arrays not found")
-            return
+            raise ValueError(
+                "Cannot create index: variant_contig, "
+                "variant_position and variant_length arrays are required"
+            )
 
         contig = root["variant_contig"]
         pos = root["variant_position"]

diff --git a/tests/data/plink/example.bim b/tests/data/plink/example.bim
@@ -1,2 +1,2 @@
-1	1_10	0	10	A	G
-1	1_20	0	20	T	C
+1	1_10	0	10	A	GG
+1	1_20	0	20	TTT	C
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -237,7 +237,7 @@ def test_examples(self, chunk_size, size, start, stop):
         # It works in CI on Linux, but it'll probably break at some point.
         # It's also necessary to update these numbers each time a new data
         # file gets added
-        ("tests/data", 5045029),
+        ("tests/data", 5045032),
         ("tests/data/vcf", 5018640),
         ("tests/data/vcf/sample.vcf.gz", 1089),
     ],

diff --git a/tests/test_plink.py b/tests/test_plink.py
@@ -94,8 +94,8 @@ class TestExample:
     """
     .bim file looks like this:
 
-    1       1_10    0       10      A       G
-    1       1_20    0       20      T       C
+    1       1_10    0       10      A       GG
+    1       1_20    0       20      TTT       C
 
     Definition: https://www.cog-genomics.org/plink/1.9/formats#bim
     Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0'
@@ -121,7 +121,10 @@ def test_variant_position(self, ds):
         nt.assert_array_equal(ds.variant_position, [10, 20])
 
     def test_variant_allele(self, ds):
-        nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
+        nt.assert_array_equal(ds.variant_allele, [["A", "GG"], ["TTT", "C"]])
+
+    def test_variant_length(self, ds):
+        nt.assert_array_equal(ds.variant_length, [2, 3])
 
     def test_contig_id(self, ds):
         """Test that contig identifiers are correctly extracted and stored."""
@@ -266,6 +269,9 @@ def test_chunk_size(
             worker_processes=worker_processes,
         )
         ds2 = sg.load_dataset(out)
+        # Drop the region_index as it is chunk dependent
+        ds = ds.drop_vars("region_index")
+        ds2 = ds2.drop_vars("region_index")
         xt.assert_equal(ds, ds2)
         # TODO check array chunks
 
@@ -372,3 +378,9 @@ def test_genotypes(self, ds):
 
     def test_variant_position(self, ds):
         nt.assert_array_equal(ds.variant_position, [10, 20, 10, 10, 20, 10])
+
+    def test_variant_length(self, ds):
+        nt.assert_array_equal(
+            ds.variant_length,
+            [1, 1, 1, 1, 1, 1],
+        )