|
2 | 2 |
|
3 | 3 | import bed_reader |
4 | 4 | import humanfriendly |
| 5 | +import numcodecs |
5 | 6 | import numpy as np |
6 | 7 | import zarr |
7 | 8 |
|
8 | | -from bio2zarr import schema, writer |
9 | 9 | from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS |
10 | 10 |
|
11 | 11 | from . import core |
12 | 12 |
|
13 | 13 | logger = logging.getLogger(__name__) |
14 | 14 |
|
15 | 15 |
|
def generate_schema(bed_path, variants_chunk_size=None, samples_chunk_size=None):
    """
    Generate a schema for PLINK data based on the contents of the bed file.
    """
    bed = bed_reader.open_bed(bed_path, num_threads=1)
    num_samples = bed.iid_count
    num_variants = bed.sid_count

    # Fall back to the default chunk sizes when none were requested.
    samples_chunk_size = 1000 if samples_chunk_size is None else samples_chunk_size
    variants_chunk_size = 10_000 if variants_chunk_size is None else variants_chunk_size

    logger.info(
        f"Generating PLINK schema with chunks={variants_chunk_size, samples_chunk_size}"
    )

    def _spec(name, dtype, shape, chunks, dimensions, description):
        # Every array here is derived straight from the bed/bim/fam files,
        # so none of them map back to a VCF field.
        return schema.ZarrArraySpec.new(
            vcf_field=None,
            name=name,
            dtype=dtype,
            shape=shape,
            chunks=chunks,
            dimensions=dimensions,
            description=description,
        )

    diploid = 2
    core_shape = [num_variants, num_samples]
    core_chunks = [variants_chunk_size, samples_chunk_size]
    core_dims = ["variants", "samples"]
    # Genotype call arrays carry a trailing ploidy dimension.
    gt_shape = core_shape + [diploid]
    gt_chunks = core_chunks + [diploid]
    gt_dims = core_dims + ["ploidy"]

    array_specs = [
        # Sample information
        _spec(
            "sample_id",
            "O",
            (num_samples,),
            (samples_chunk_size,),
            ["samples"],
            "Sample identifiers",
        ),
        # Variant information
        _spec(
            "variant_position",
            np.int32,
            (num_variants,),
            (variants_chunk_size,),
            ["variants"],
            "The reference position",
        ),
        _spec(
            "variant_allele",
            "O",
            (num_variants, 2),
            (variants_chunk_size, 2),
            ["variants", "alleles"],
            "List of the reference and alternate alleles",
        ),
        # Genotype information
        _spec(
            "call_genotype_phased",
            "bool",
            list(core_shape),
            list(core_chunks),
            list(core_dims),
            "Boolean flag indicating if genotypes are phased",
        ),
        _spec(
            "call_genotype",
            "i1",
            list(gt_shape),
            list(gt_chunks),
            list(gt_dims),
            "Genotype calls coded as allele indices",
        ),
        _spec(
            "call_genotype_mask",
            "bool",
            list(gt_shape),
            list(gt_chunks),
            list(gt_dims),
            "Mask indicating missing genotype calls",
        ),
    ]

    return schema.VcfZarrSchema(
        format_version=schema.ZARR_SCHEMA_FORMAT_VERSION,
        samples_chunk_size=samples_chunk_size,
        variants_chunk_size=variants_chunk_size,
        fields=array_specs,
        samples=[{"id": sample_id} for sample_id in bed.iid],
        contigs=[],  # PLINK doesn't have contig information in the same way as VCF
        filters=[],  # PLINK doesn't use filters like VCF
    )
123 | | - |
124 | 16 | def encode_genotypes_slice(bed_path, zarr_path, start, stop): |
125 | 17 | # We need to count the A2 alleles here if we want to keep the |
126 | 18 | # alleles reported as allele_1, allele_2. It's obvious here what |
@@ -171,88 +63,115 @@ def convert( |
171 | 63 | variants_chunk_size=None, |
172 | 64 | samples_chunk_size=None, |
173 | 65 | ): |
174 | | - """ |
175 | | - Convert PLINK data to zarr format using the shared writer infrastructure. |
176 | | - """ |
177 | | - # Generate schema from the PLINK data |
178 | | - plink_schema = generate_schema( |
179 | | - bed_path, |
180 | | - variants_chunk_size=variants_chunk_size, |
181 | | - samples_chunk_size=samples_chunk_size, |
182 | | - ) |
183 | | - |
184 | | - # Create a data source adapter for PLINK |
185 | | - plink_adapter = PlinkDataAdapter(bed_path) |
186 | | - |
187 | | - # Use the general writer |
188 | | - writer_instance = writer.GenericZarrWriter(zarr_path) |
189 | | - writer_instance.init_from_schema(plink_schema) |
190 | | - |
191 | | - # Encode data using the writer |
192 | | - logger.info(f"Converting PLINK data to zarr at {zarr_path}") |
193 | | - writer_instance.encode_data( |
194 | | - plink_adapter, worker_processes=worker_processes, show_progress=show_progress |
195 | | - ) |
196 | | - |
197 | | - # Finalize the zarr store |
198 | | - writer_instance.finalise(show_progress) |
199 | | - zarr.consolidate_metadata(zarr_path) |
200 | | - logger.info("PLINK conversion complete") |
201 | | - |
202 | | - |
203 | | -class PlinkDataAdapter: |
204 | | - """ |
205 | | - Adapter class to provide PLINK data to the generic writer. |
206 | | - """ |
| 66 | + bed = bed_reader.open_bed(bed_path, num_threads=1) |
| 67 | + n = bed.iid_count |
| 68 | + m = bed.sid_count |
| 69 | + logging.info(f"Scanned plink with {n} samples and {m} variants") |
207 | 70 |
|
208 | | - def __init__(self, bed_path): |
209 | | - self.bed_path = bed_path |
210 | | - self.bed = bed_reader.open_bed(bed_path, num_threads=1) |
211 | | - self.n_samples = self.bed.iid_count |
212 | | - self.n_variants = self.bed.sid_count |
| 71 | + # FIXME |
| 72 | + if samples_chunk_size is None: |
| 73 | + samples_chunk_size = 1000 |
| 74 | + if variants_chunk_size is None: |
| 75 | + variants_chunk_size = 10_000 |
213 | 76 |
|
214 | | - def get_sample_ids(self): |
215 | | - return self.bed.iid |
| 77 | + root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS) |
216 | 78 |
|
217 | | - def get_variant_positions(self): |
218 | | - return self.bed.bp_position |
| 79 | + ploidy = 2 |
| 80 | + shape = [m, n] |
| 81 | + chunks = [variants_chunk_size, samples_chunk_size] |
| 82 | + dimensions = ["variants", "samples"] |
219 | 83 |
|
220 | | - def get_variant_alleles(self): |
221 | | - return np.stack([self.bed.allele_1, self.bed.allele_2], axis=1) |
| 84 | + # TODO we should be reusing some logic from vcfzarr here on laying |
| 85 | + # out the basic dataset, and using the schema generator. Currently |
| 86 | + # we're not using the best Blosc settings for genotypes here. |
| 87 | + default_compressor = numcodecs.Blosc(cname="zstd", clevel=7) |
| 88 | + |
| 89 | + a = root.array( |
| 90 | + "sample_id", |
| 91 | + data=bed.iid, |
| 92 | + shape=bed.iid.shape, |
| 93 | + dtype="str", |
| 94 | + compressor=default_compressor, |
| 95 | + chunks=(samples_chunk_size,), |
| 96 | + ) |
| 97 | + a.attrs["_ARRAY_DIMENSIONS"] = ["samples"] |
| 98 | + logger.debug("Encoded samples") |
| 99 | + |
| 100 | + # TODO encode these in slices - but read them in one go to avoid |
| 101 | + # fetching repeatedly from bim file |
| 102 | + a = root.array( |
| 103 | + "variant_position", |
| 104 | + data=bed.bp_position, |
| 105 | + shape=bed.bp_position.shape, |
| 106 | + dtype=np.int32, |
| 107 | + compressor=default_compressor, |
| 108 | + chunks=(variants_chunk_size,), |
| 109 | + ) |
| 110 | + a.attrs["_ARRAY_DIMENSIONS"] = ["variants"] |
| 111 | + logger.debug("encoded variant_position") |
| 112 | + |
| 113 | + alleles = np.stack([bed.allele_1, bed.allele_2], axis=1) |
| 114 | + a = root.array( |
| 115 | + "variant_allele", |
| 116 | + data=alleles, |
| 117 | + shape=alleles.shape, |
| 118 | + dtype="str", |
| 119 | + compressor=default_compressor, |
| 120 | + chunks=(variants_chunk_size, alleles.shape[1]), |
| 121 | + ) |
| 122 | + a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"] |
| 123 | + logger.debug("encoded variant_allele") |
| 124 | + |
| 125 | + # TODO remove this? |
| 126 | + a = root.empty( |
| 127 | + name="call_genotype_phased", |
| 128 | + dtype="bool", |
| 129 | + shape=list(shape), |
| 130 | + chunks=list(chunks), |
| 131 | + compressor=default_compressor, |
| 132 | + **ZARR_FORMAT_KWARGS, |
| 133 | + ) |
| 134 | + a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions) |
| 135 | + |
| 136 | + shape += [ploidy] |
| 137 | + dimensions += ["ploidy"] |
| 138 | + a = root.empty( |
| 139 | + name="call_genotype", |
| 140 | + dtype="i1", |
| 141 | + shape=list(shape), |
| 142 | + chunks=list(chunks), |
| 143 | + compressor=default_compressor, |
| 144 | + **ZARR_FORMAT_KWARGS, |
| 145 | + ) |
| 146 | + a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions) |
| 147 | + |
| 148 | + a = root.empty( |
| 149 | + name="call_genotype_mask", |
| 150 | + dtype="bool", |
| 151 | + shape=list(shape), |
| 152 | + chunks=list(chunks), |
| 153 | + compressor=default_compressor, |
| 154 | + **ZARR_FORMAT_KWARGS, |
| 155 | + ) |
| 156 | + a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions) |
222 | 157 |
|
223 | | - def get_genotypes_slice(self, start, stop): |
224 | | - """ |
225 | | - Read a slice of genotypes from the PLINK data. |
226 | | - Returns a dictionary with three arrays: |
227 | | - - genotypes: The actual genotype values |
228 | | - - phased: Whether genotypes are phased (always False for PLINK) |
229 | | - - mask: Which genotype values are missing |
230 | | - """ |
231 | | - bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T |
232 | | - n_variants = stop - start |
| 158 | + del bed |
233 | 159 |
|
234 | | - # Create return arrays |
235 | | - gt = np.zeros((n_variants, self.n_samples, 2), dtype=np.int8) |
236 | | - gt_phased = np.zeros((n_variants, self.n_samples), dtype=bool) |
237 | | - gt_mask = np.zeros((n_variants, self.n_samples, 2), dtype=bool) |
| 160 | + num_slices = max(1, worker_processes * 4) |
| 161 | + slices = core.chunk_aligned_slices(a, num_slices) |
238 | 162 |
|
239 | | - # Convert PLINK encoding to genotype encoding |
240 | | - # PLINK: 0=hom ref, 1=het, 2=hom alt, -127=missing |
241 | | - # Zarr: [0,0]=hom ref, [1,0]=het, [1,1]=hom alt, [-1,-1]=missing |
242 | | - for i, values in enumerate(bed_chunk): |
243 | | - gt[i, values == -127] = -1 |
244 | | - gt[i, values == 2, :] = 1 |
245 | | - gt[i, values == 1, 0] = 1 |
246 | | - gt_mask[i] = gt[i] == -1 |
| 163 | + total_chunks = sum(a.nchunks for _, a in root.arrays()) |
247 | 164 |
|
248 | | - return { |
249 | | - "call_genotype": gt, |
250 | | - "call_genotype_phased": gt_phased, |
251 | | - "call_genotype_mask": gt_mask, |
252 | | - } |
| 165 | + progress_config = core.ProgressConfig( |
| 166 | + total=total_chunks, title="Convert", units="chunks", show=show_progress |
| 167 | + ) |
| 168 | + with core.ParallelWorkManager(worker_processes, progress_config) as pwm: |
| 169 | + for start, stop in slices: |
| 170 | + pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop) |
253 | 171 |
|
254 | | - def close(self): |
255 | | - del self.bed |
| 172 | + # TODO also add atomic swap like VCF. Should be abstracted to |
| 173 | + # share basic code for setting up the variation dataset zarr |
| 174 | + zarr.consolidate_metadata(zarr_path) |
256 | 175 |
|
257 | 176 |
|
258 | 177 | # FIXME do this more efficiently - currently reading the whole thing |
|
0 commit comments