|
2 | 2 |
|
3 | 3 | import bed_reader |
4 | 4 | import humanfriendly |
5 | | -import numcodecs |
6 | 5 | import numpy as np |
7 | 6 | import zarr |
8 | 7 |
|
| 8 | +from bio2zarr import schema, writer |
9 | 9 | from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS |
10 | 10 |
|
11 | 11 | from . import core |
12 | 12 |
|
13 | 13 | logger = logging.getLogger(__name__) |
14 | 14 |
|
15 | 15 |
|
def generate_schema(bed_path, variants_chunk_size=None, samples_chunk_size=None):
    """
    Generate a VcfZarrSchema for the PLINK dataset at ``bed_path``.

    Parameters
    ----------
    bed_path:
        Path to the PLINK .bed file (with accompanying .fam/.bim files).
    variants_chunk_size:
        Zarr chunk size along the variants dimension (default 10_000).
    samples_chunk_size:
        Zarr chunk size along the samples dimension (default 1000).

    Returns
    -------
    A ``schema.VcfZarrSchema`` describing the arrays to be written.
    """
    # Open the bed file only long enough to read the dimensions and sample
    # metadata; the context manager guarantees the underlying files are
    # released even on error (the previous code left the handle open).
    with bed_reader.open_bed(bed_path, num_threads=1) as bed:
        num_samples = bed.iid_count
        num_variants = bed.sid_count
        samples = [{"id": sample_id} for sample_id in bed.iid]

    if samples_chunk_size is None:
        samples_chunk_size = 1000
    if variants_chunk_size is None:
        variants_chunk_size = 10_000

    logger.info(
        f"Generating PLINK schema with chunks={variants_chunk_size, samples_chunk_size}"
    )

    ploidy = 2
    shape = [num_variants, num_samples]
    chunks = [variants_chunk_size, samples_chunk_size]
    dimensions = ["variants", "samples"]

    array_specs = [
        # Sample information
        schema.ZarrArraySpec.new(
            vcf_field=None,
            name="sample_id",
            dtype="O",
            shape=(num_samples,),
            chunks=(samples_chunk_size,),
            dimensions=["samples"],
            description="Sample identifiers",
        ),
        # Variant information
        schema.ZarrArraySpec.new(
            vcf_field=None,
            name="variant_position",
            dtype=np.int32,
            shape=(num_variants,),
            chunks=(variants_chunk_size,),
            dimensions=["variants"],
            description="The reference position",
        ),
        schema.ZarrArraySpec.new(
            vcf_field=None,
            name="variant_allele",
            dtype="O",
            # PLINK always has exactly two alleles per variant (allele_1/allele_2).
            shape=(num_variants, 2),
            chunks=(variants_chunk_size, 2),
            dimensions=["variants", "alleles"],
            description="List of the reference and alternate alleles",
        ),
        # Genotype information
        schema.ZarrArraySpec.new(
            vcf_field=None,
            name="call_genotype_phased",
            dtype="bool",
            shape=list(shape),
            chunks=list(chunks),
            dimensions=list(dimensions),
            description="Boolean flag indicating if genotypes are phased",
        ),
    ]

    # Genotype call arrays carry a trailing ploidy dimension.
    shape_with_ploidy = shape + [ploidy]
    chunks_with_ploidy = chunks + [ploidy]
    dimensions_with_ploidy = dimensions + ["ploidy"]

    array_specs.extend(
        [
            schema.ZarrArraySpec.new(
                vcf_field=None,
                name="call_genotype",
                dtype="i1",
                shape=list(shape_with_ploidy),
                chunks=list(chunks_with_ploidy),
                dimensions=list(dimensions_with_ploidy),
                description="Genotype calls coded as allele indices",
            ),
            schema.ZarrArraySpec.new(
                vcf_field=None,
                name="call_genotype_mask",
                dtype="bool",
                shape=list(shape_with_ploidy),
                chunks=list(chunks_with_ploidy),
                dimensions=list(dimensions_with_ploidy),
                description="Mask indicating missing genotype calls",
            ),
        ]
    )

    # PLINK has no per-variant contig records or FILTER column in the VCF
    # sense, so those metadata lists are empty.
    contigs = []
    filters = []

    return schema.VcfZarrSchema(
        format_version=schema.ZARR_SCHEMA_FORMAT_VERSION,
        samples_chunk_size=samples_chunk_size,
        variants_chunk_size=variants_chunk_size,
        fields=array_specs,
        samples=samples,
        contigs=contigs,
        filters=filters,
    )
| 122 | + |
| 123 | + |
16 | 124 | def encode_genotypes_slice(bed_path, zarr_path, start, stop): |
17 | 125 | # We need to count the A2 alleles here if we want to keep the |
18 | 126 | # alleles reported as allele_1, allele_2. It's obvious here what |
@@ -63,115 +171,88 @@ def convert( |
63 | 171 | variants_chunk_size=None, |
64 | 172 | samples_chunk_size=None, |
65 | 173 | ): |
66 | | - bed = bed_reader.open_bed(bed_path, num_threads=1) |
67 | | - n = bed.iid_count |
68 | | - m = bed.sid_count |
69 | | - logging.info(f"Scanned plink with {n} samples and {m} variants") |
| 174 | + """ |
| 175 | + Convert PLINK data to zarr format using the shared writer infrastructure. |
| 176 | + """ |
| 177 | + # Generate schema from the PLINK data |
| 178 | + plink_schema = generate_schema( |
| 179 | + bed_path, |
| 180 | + variants_chunk_size=variants_chunk_size, |
| 181 | + samples_chunk_size=samples_chunk_size, |
| 182 | + ) |
70 | 183 |
|
71 | | - # FIXME |
72 | | - if samples_chunk_size is None: |
73 | | - samples_chunk_size = 1000 |
74 | | - if variants_chunk_size is None: |
75 | | - variants_chunk_size = 10_000 |
| 184 | + # Create a data source adapter for PLINK |
| 185 | + plink_adapter = PlinkDataAdapter(bed_path) |
76 | 186 |
|
77 | | - root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS) |
| 187 | + # Use the general writer |
| 188 | + writer_instance = writer.GenericZarrWriter(zarr_path) |
| 189 | + writer_instance.init_from_schema(plink_schema) |
78 | 190 |
|
79 | | - ploidy = 2 |
80 | | - shape = [m, n] |
81 | | - chunks = [variants_chunk_size, samples_chunk_size] |
82 | | - dimensions = ["variants", "samples"] |
83 | | - |
84 | | - # TODO we should be reusing some logic from vcfzarr here on laying |
85 | | - # out the basic dataset, and using the schema generator. Currently |
86 | | - # we're not using the best Blosc settings for genotypes here. |
87 | | - default_compressor = numcodecs.Blosc(cname="zstd", clevel=7) |
88 | | - |
89 | | - a = root.array( |
90 | | - "sample_id", |
91 | | - data=bed.iid, |
92 | | - shape=bed.iid.shape, |
93 | | - dtype="str", |
94 | | - compressor=default_compressor, |
95 | | - chunks=(samples_chunk_size,), |
96 | | - ) |
97 | | - a.attrs["_ARRAY_DIMENSIONS"] = ["samples"] |
98 | | - logger.debug("Encoded samples") |
99 | | - |
100 | | - # TODO encode these in slices - but read them in one go to avoid |
101 | | - # fetching repeatedly from bim file |
102 | | - a = root.array( |
103 | | - "variant_position", |
104 | | - data=bed.bp_position, |
105 | | - shape=bed.bp_position.shape, |
106 | | - dtype=np.int32, |
107 | | - compressor=default_compressor, |
108 | | - chunks=(variants_chunk_size,), |
109 | | - ) |
110 | | - a.attrs["_ARRAY_DIMENSIONS"] = ["variants"] |
111 | | - logger.debug("encoded variant_position") |
112 | | - |
113 | | - alleles = np.stack([bed.allele_1, bed.allele_2], axis=1) |
114 | | - a = root.array( |
115 | | - "variant_allele", |
116 | | - data=alleles, |
117 | | - shape=alleles.shape, |
118 | | - dtype="str", |
119 | | - compressor=default_compressor, |
120 | | - chunks=(variants_chunk_size, alleles.shape[1]), |
| 191 | + # Encode data using the writer |
| 192 | + logger.info(f"Converting PLINK data to zarr at {zarr_path}") |
| 193 | + writer_instance.encode_data( |
| 194 | + plink_adapter, worker_processes=worker_processes, show_progress=show_progress |
121 | 195 | ) |
122 | | - a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"] |
123 | | - logger.debug("encoded variant_allele") |
124 | | - |
125 | | - # TODO remove this? |
126 | | - a = root.empty( |
127 | | - name="call_genotype_phased", |
128 | | - dtype="bool", |
129 | | - shape=list(shape), |
130 | | - chunks=list(chunks), |
131 | | - compressor=default_compressor, |
132 | | - **ZARR_FORMAT_KWARGS, |
133 | | - ) |
134 | | - a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions) |
135 | | - |
136 | | - shape += [ploidy] |
137 | | - dimensions += ["ploidy"] |
138 | | - a = root.empty( |
139 | | - name="call_genotype", |
140 | | - dtype="i1", |
141 | | - shape=list(shape), |
142 | | - chunks=list(chunks), |
143 | | - compressor=default_compressor, |
144 | | - **ZARR_FORMAT_KWARGS, |
145 | | - ) |
146 | | - a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions) |
147 | | - |
148 | | - a = root.empty( |
149 | | - name="call_genotype_mask", |
150 | | - dtype="bool", |
151 | | - shape=list(shape), |
152 | | - chunks=list(chunks), |
153 | | - compressor=default_compressor, |
154 | | - **ZARR_FORMAT_KWARGS, |
155 | | - ) |
156 | | - a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions) |
157 | 196 |
|
158 | | - del bed |
| 197 | + # Finalize the zarr store |
| 198 | + writer_instance.finalise(show_progress) |
| 199 | + zarr.consolidate_metadata(zarr_path) |
| 200 | + logger.info("PLINK conversion complete") |
159 | 201 |
|
160 | | - num_slices = max(1, worker_processes * 4) |
161 | | - slices = core.chunk_aligned_slices(a, num_slices) |
162 | 202 |
|
163 | | - total_chunks = sum(a.nchunks for _, a in root.arrays()) |
class PlinkDataAdapter:
    """
    Adapter exposing PLINK (.bed/.bim/.fam) data to the generic writer.

    Holds an open ``bed_reader`` handle for the lifetime of the adapter;
    call :meth:`close` when finished.
    """

    def __init__(self, bed_path):
        self.bed_path = bed_path
        self.bed = bed_reader.open_bed(bed_path, num_threads=1)
        self.n_samples = self.bed.iid_count
        self.n_variants = self.bed.sid_count

    def get_sample_ids(self):
        """Return the sample (individual) identifiers from the .fam file."""
        return self.bed.iid

    def get_variant_positions(self):
        """Return the base-pair positions from the .bim file."""
        return self.bed.bp_position

    def get_variant_alleles(self):
        """Return an (n_variants, 2) array of [allele_1, allele_2] pairs."""
        return np.stack([self.bed.allele_1, self.bed.allele_2], axis=1)

    def get_genotypes_slice(self, start, stop):
        """
        Read genotypes for variants in ``[start, stop)``.

        Returns a dict with three arrays:
        - call_genotype: (variants, samples, 2) int8 allele indices
        - call_genotype_phased: (variants, samples) bool, always False (PLINK
          carries no phase information)
        - call_genotype_mask: (variants, samples, 2) bool, True where missing
        """
        # Transpose so the chunk is (variants, samples), matching the zarr
        # layout used elsewhere in this module.
        bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
        n_variants = stop - start

        gt = np.zeros((n_variants, self.n_samples, 2), dtype=np.int8)
        gt_phased = np.zeros((n_variants, self.n_samples), dtype=bool)

        # Vectorised translation of the PLINK allele-count encoding
        # (replaces a per-variant Python loop doing the same masking):
        #   0 = hom ref  -> [0, 0]
        #   1 = het      -> [1, 0]
        #   2 = hom alt  -> [1, 1]
        #  -127 = missing -> [-1, -1]
        # The three masks are disjoint, so assignment order is irrelevant.
        gt[bed_chunk == -127] = -1
        gt[bed_chunk == 2] = 1
        gt[bed_chunk == 1, 0] = 1
        gt_mask = gt == -1

        return {
            "call_genotype": gt,
            "call_genotype_phased": gt_phased,
            "call_genotype_mask": gt_mask,
        }

    def close(self):
        # Drop the reference so the underlying file handles can be released
        # (bed_reader exposes no explicit close API here — TODO confirm).
        del self.bed
175 | 256 |
|
176 | 257 |
|
177 | 258 | # FIXME do this more efficiently - currently reading the whole thing |
|
0 commit comments