Remove SOURCES dict; pass source classes to VcfZarrWriter directly

benjeffery · benjeffery · commit 547decfe58d9 · 2025-04-02T20:27:11.000+01:00
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -5,6 +5,8 @@
 import numpy as np
 import zarr
 
+from bio2zarr import constants, schema, writer
+
 logger = logging.getLogger(__name__)
 
 
@@ -21,7 +23,6 @@ def iter_alleles(self, start, stop, num_alleles):
         ref_field = self.bed.allele_1
         alt_field = self.bed.allele_2
 
-        # TODO - should be doing whole chunks rather than one at a time
         for ref, alt in zip(
             ref_field[start:stop],
             alt_field[start:stop],
@@ -49,10 +50,6 @@ def iter_genotypes(self, shape, start, stop):
             yield gt, phased
 
 
-# Import here to avoid circular import
-from bio2zarr import constants, schema, writer  # noqa: E402
-
-
 def generate_schema(
     bed,
     variants_chunk_size=None,
@@ -147,7 +144,7 @@ def convert(
         samples_chunk_size=samples_chunk_size,
     )
     zarr_path = pathlib.Path(zarr_path)
-    vzw = writer.VcfZarrWriter("plink", zarr_path)
+    vzw = writer.VcfZarrWriter(PlinkFormat, zarr_path)
     # Rough heuristic to split work up enough to keep utilisation high
     target_num_partitions = max(1, worker_processes * 4)
     vzw.init(
@@ -168,146 +165,6 @@ def convert(
     # vzw.create_index()
 
 
-# def encode_genotypes_slice(bed_path, zarr_path, start, stop):
-#     # We need to count the A2 alleles here if we want to keep the
-#     # alleles reported as allele_1, allele_2. It's obvious here what
-#     # the correct approach is, but it is important to note that the
-#     # 0th allele is *not* necessarily the REF for these datasets.
-#     bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-#     root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
-#     gt = core.BufferedArray(root["call_genotype"], start)
-#     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
-#     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-#     variants_chunk_size = gt.array.chunks[0]
-#     assert start % variants_chunk_size == 0
-
-#     logger.debug(f"Reading slice {start}:{stop}")
-#     chunk_start = start
-#     while chunk_start < stop:
-#         chunk_stop = min(chunk_start + variants_chunk_size, stop)
-#         logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
-#         bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
-#         logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
-#         # Probably should do this without iterating over rows, but it's a bit
-#         # simpler and lines up better with the array buffering API. The bottleneck
-#         # is in the encoding anyway.
-#         for values in bed_chunk:
-#             j = gt.next_buffer_row()
-#             g = np.zeros_like(gt.buff[j])
-#             g[values == -127] = -1
-#             g[values == 2] = 1
-#             g[values == 1, 0] = 1
-#             gt.buff[j] = g
-#             j = gt_phased.next_buffer_row()
-#             gt_phased.buff[j] = False
-#             j = gt_mask.next_buffer_row()
-#             gt_mask.buff[j] = gt.buff[j] == -1
-#         chunk_start = chunk_stop
-#     gt.flush()
-#     gt_phased.flush()
-#     gt_mask.flush()
-#     logger.debug(f"GT slice {start}:{stop} done")
-
-# root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
-
-# ploidy = 2
-# shape = [m, n]
-# chunks = [variants_chunk_size, samples_chunk_size]
-# dimensions = ["variants", "samples"]
-
-# # TODO we should be reusing some logic from vcfzarr here on laying
-# # out the basic dataset, and using the schema generator. Currently
-# # we're not using the best Blosc settings for genotypes here.
-# default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
-
-# a = root.array(
-#     "sample_id",
-#     data=bed.iid,
-#     shape=bed.iid.shape,
-#     dtype="str",
-#     compressor=default_compressor,
-#     chunks=(samples_chunk_size,),
-# )
-# a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-# logger.debug("Encoded samples")
-
-# # TODO encode these in slices - but read them in one go to avoid
-# # fetching repeatedly from bim file
-# a = root.array(
-#     "variant_position",
-#     data=bed.bp_position,
-#     shape=bed.bp_position.shape,
-#     dtype=np.int32,
-#     compressor=default_compressor,
-#     chunks=(variants_chunk_size,),
-# )
-# a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-# logger.debug("encoded variant_position")
-
-# alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
-# a = root.array(
-#     "variant_allele",
-#     data=alleles,
-#     shape=alleles.shape,
-#     dtype="str",
-#     compressor=default_compressor,
-#     chunks=(variants_chunk_size, alleles.shape[1]),
-# )
-# a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-# logger.debug("encoded variant_allele")
-
-# # TODO remove this?
-# a = root.empty(
-#     name="call_genotype_phased",
-#     dtype="bool",
-#     shape=list(shape),
-#     chunks=list(chunks),
-#     compressor=default_compressor,
-#     **ZARR_FORMAT_KWARGS,
-# )
-# a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-# shape += [ploidy]
-# dimensions += ["ploidy"]
-# a = root.empty(
-#     name="call_genotype",
-#     dtype="i1",
-#     shape=list(shape),
-#     chunks=list(chunks),
-#     compressor=default_compressor,
-#     **ZARR_FORMAT_KWARGS,
-# )
-# a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-# a = root.empty(
-#     name="call_genotype_mask",
-#     dtype="bool",
-#     shape=list(shape),
-#     chunks=list(chunks),
-#     compressor=default_compressor,
-#     **ZARR_FORMAT_KWARGS,
-# )
-# a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-# del bed
-
-# num_slices = max(1, worker_processes * 4)
-# slices = core.chunk_aligned_slices(a, num_slices)
-
-# total_chunks = sum(a.nchunks for _, a in root.arrays())
-
-# progress_config = core.ProgressConfig(
-#     total=total_chunks, title="Convert", units="chunks", show=show_progress
-# )
-# with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-#     for start, stop in slices:
-#         pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)
-
-# # TODO also add atomic swap like VCF. Should be abstracted to
-# # share basic code for setting up the variation dataset zarr
-# zarr.consolidate_metadata(zarr_path)
-
-
 # FIXME do this more efficiently - currently reading the whole thing
 # in for convenience, and also comparing call-by-call
 def validate(bed_path, zarr_path):
diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py
@@ -276,7 +276,7 @@ def encode(
         max_variant_chunks=max_variant_chunks,
         dimension_separator=dimension_separator,
     )
-    vzw = writer.VcfZarrWriter("icf", zarr_path)
+    vzw = writer.VcfZarrWriter(icf.IntermediateColumnarFormat, zarr_path)
     vzw.encode_all_partitions(
         worker_processes=worker_processes,
         show_progress=show_progress,
@@ -329,12 +329,12 @@ def encode_init(
 
 
 def encode_partition(zarr_path, partition):
-    writer_instance = writer.VcfZarrWriter("icf", zarr_path)
+    writer_instance = writer.VcfZarrWriter(icf.IntermediateColumnarFormat, zarr_path)
     writer_instance.encode_partition(partition)
 
 
 def encode_finalise(zarr_path, show_progress=False):
-    writer_instance = writer.VcfZarrWriter("icf", zarr_path)
+    writer_instance = writer.VcfZarrWriter(icf.IntermediateColumnarFormat, zarr_path)
     writer_instance.finalise(show_progress=show_progress)
 
 
diff --git a/bio2zarr/writer.py b/bio2zarr/writer.py
@@ -9,12 +9,21 @@
 import numpy as np
 import zarr
 
-from bio2zarr import constants, core, plink, provenance, schema, zarr_utils
-from bio2zarr.vcf2zarr import icf
+from bio2zarr import constants, core, provenance, schema, zarr_utils
 
 logger = logging.getLogger(__name__)
 
-SOURCES = {"icf": icf.IntermediateColumnarFormat, "plink": plink.PlinkFormat}
+
+def sanitise_int_array(value, ndmin, dtype):
+    if isinstance(value, tuple):
+        value = [
+            constants.VCF_INT_MISSING if x is None else x for x in value
+        ]  # NEEDS TEST: tuple input containing None
+    value = np.array(value, ndmin=ndmin, copy=True)
+    value[value == constants.VCF_INT_MISSING] = -1
+    value[value == constants.VCF_INT_FILL] = -2
+    # TODO watch out for overflow here! astype() does not range-check the values.
+    return value.astype(dtype)
 
 
 def compute_la_field(genotypes):
@@ -87,10 +96,10 @@ class LocalisableFieldDescriptor:
 
 localisable_fields = [
     LocalisableFieldDescriptor(
-        "call_LAD", "FORMAT/AD", icf.sanitise_int_array, compute_lad_field
+        "call_LAD", "FORMAT/AD", sanitise_int_array, compute_lad_field
     ),
     LocalisableFieldDescriptor(
-        "call_LPL", "FORMAT/PL", icf.sanitise_int_array, compute_lpl_field
+        "call_LPL", "FORMAT/PL", sanitise_int_array, compute_lpl_field
     ),
 ]
 
@@ -344,8 +353,7 @@ def load_metadata(self):
         if self.metadata is None:
             with open(self.wip_path / "metadata.json") as f:
                 self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
-            source_loader = SOURCES[self.source_type]
-            self.source = source_loader(self.metadata.source_path)
+            self.source = self.source_type(self.metadata.source_path)
 
     def partition_path(self, partition_index):
         return self.partitions_path / f"p{partition_index}"