sgkit-dev
diff --git a/‎bio2zarr/__main__.py‎
Lines changed: 1 addition & 0 deletions b/‎bio2zarr/__main__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎bio2zarr/bed2zarr.py‎
Lines changed: 116 additions & 0 deletions b/‎bio2zarr/bed2zarr.py‎
Lines changed: 116 additions & 0 deletions
diff --git a/‎bio2zarr/cli.py‎
Lines changed: 36 additions & 1 deletion b/‎bio2zarr/cli.py‎
Lines changed: 36 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz‎
95 Bytes b/‎tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz‎
95 Bytes
diff --git a/‎tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi‎
122 Bytes b/‎tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi‎
122 Bytes
diff --git a/‎tests/data/bed/sample_mask.bed.gz‎
84 Bytes b/‎tests/data/bed/sample_mask.bed.gz‎
84 Bytes
diff --git a/‎tests/data/bed/sample_mask.bed.gz.csi‎
149 Bytes b/‎tests/data/bed/sample_mask.bed.gz.csi‎
149 Bytes
diff --git a/‎tests/test_bed.py‎
Lines changed: 50 additions & 0 deletions b/‎tests/test_bed.py‎
Lines changed: 50 additions & 0 deletions
@@ -15,6 +15,7 @@ def bio2zarr():
 # is handy for development and for those whose PATHs aren't set
 # up in the right way.
 bio2zarr.add_command(cli.vcf2zarr_main)
+bio2zarr.add_command(cli.bed2zarr_main)
 bio2zarr.add_command(cli.plink2zarr)
 bio2zarr.add_command(cli.vcfpartition)
 
 
@@ -0,0 +1,116 @@
+import dataclasses
+import numpy as np
+import pathlib
+import gzip
+import zarr
+from . import core
+
+
+# see https://samtools.github.io/hts-specs/BEDv1.pdf
+@dataclasses.dataclass
+class Bed3:
+    """A three-column BED record: a chromosome name together with a
+    0-based, half-open ``[start, end)`` interval."""
+
+    chrom: str
+    start: int
+    end: int
+
+    @property
+    def width(self):
+        """Number of positions covered, i.e. ``end - start``."""
+        span = self.end - self.start
+        return span
+
+    def __len__(self):
+        # len(region) is defined as the interval width.
+        return self.width
+
+    def mask(self, invert=False):
+        """Return a ``uint8`` array with one element per covered
+        position: all 1's, or all 0's when ``invert`` is true."""
+        if invert:
+            return np.zeros(self.width, dtype=np.uint8)
+        return np.ones(self.width, dtype=np.uint8)
+
+
+class BedReader:
+    """Context manager that opens a BED file for text reading. Paths
+    whose last suffix is ``.gz`` are opened through ``gzip``; anything
+    else is opened as plain text. The open handle is available as
+    ``self.fh`` inside the ``with`` block and is closed on exit."""
+
+    def __init__(self, bed_path):
+        self.bed_path = pathlib.Path(bed_path)
+
+    def __enter__(self):
+        is_gzipped = self.bed_path.suffix == ".gz"
+        # Text mode in both branches so callers always read str lines.
+        if is_gzipped:
+            handle = gzip.open(self.bed_path, "rt")
+        else:
+            handle = self.bed_path.open("rt")
+        self.fh = handle
+        return self
+
+    def __exit__(self, *args):
+        self.fh.close()
+
+
+# Here we are assuming that we write a mask. However, the BED file
+# could represent other things, such as scores, and there can be up
+# to 9 optional columns beyond the first three, in which case more
+# fields (aka data arrays?) would be needed.
+def bed2zarr(
+    bed_path,
+    zarr_path,
+    bed_array="bed_mask",  # More generic name?
+    show_progress=False,
+):
+    """Write the intervals of a BED file as a per-base mask in a Zarr store.
+
+    All contigs of the store are laid out end to end in ``contig_id``
+    order. The array ``bed_array`` has one element per base of that
+    concatenated genome; bases covered by a BED interval are set to 1
+    and all others keep the fill value 0. A companion array
+    ``f"{bed_array}_contig"`` holds the contig index of every base.
+    Both arrays are created if ``bed_array`` is not yet in the store;
+    otherwise the intervals are added to the existing array.
+
+    Only the first three BED columns (chrom, start, end) are used.
+    Blank lines and ``#`` / ``track`` / ``browser`` header lines are
+    skipped.
+
+    Parameters
+    ----------
+    bed_path : path to a gzipped BED file with a ``.csi`` or ``.tbi``
+        index next to it.
+    zarr_path : path of the Zarr store; it must contain ``contig_id``
+        and ``contig_length``.
+    bed_array : name of the mask array inside the store.
+    show_progress : accepted for interface compatibility; currently unused.
+
+    Raises
+    ------
+    ValueError
+        If the BED file is not gzipped, has no index, the store has no
+        ``contig_length``, or an interval does not fit inside its contig.
+    KeyError
+        If a BED chromosome is not present in ``contig_id``.
+    """
+    # 1. Make sure the bed file is gzipped and indexed
+    bed_path = pathlib.Path(bed_path)
+
+    if bed_path.suffix != ".gz":
+        raise ValueError("BED file must be gzipped.")
+    # Either index flavour is sufficient; fail only when neither exists.
+    index_paths = [
+        bed_path.with_name(bed_path.name + suffix) for suffix in (".csi", ".tbi")
+    ]
+    if not any(path.exists() for path in index_paths):
+        raise ValueError("BED file must be indexed.")
+
+    # 2. Make sure there are contig lengths
+    store = zarr.open(zarr_path)
+    if "contig_length" not in store:
+        raise ValueError(
+            (
+                "No contig lengths in Zarr store. Contig lengths must be"
+                " present in the Zarr store before writing Bed entries."
+            )
+        )
+    # Read the (small) contig arrays once instead of hitting the store
+    # every time they are needed.
+    contig_ids = store["contig_id"][:]
+    contig_lengths = store["contig_length"][:]
+    n_contigs = len(contig_ids)
+    # 2b. Make chromosome to integer mapping
+    chrom_d = {contig: index for index, contig in enumerate(contig_ids)}
+    # 2c. Offset of the first base of each contig in the concatenated genome
+    contig_indices = np.insert(np.cumsum(contig_lengths)[:-1], 0, 0)
+
+    # 3. Init the zarr group with the contig lengths
+    # bed_array and bed_array_contig are of equal lengths = total genome
+    if bed_array not in store:
+        bed_array_contig = f"{bed_array}_contig"
+        dtype = core.min_int_dtype(0, n_contigs)
+        n_bases = int(np.sum(contig_lengths))
+
+        store.create_dataset(bed_array, fill_value=0, dtype=dtype, shape=(n_bases,))
+        store.create_dataset(
+            bed_array_contig,
+            # Build the index array directly in the small dtype so the
+            # genome-length temporary is not allocated as int64 first.
+            data=np.repeat(np.arange(n_contigs, dtype=dtype), contig_lengths),
+            dtype=dtype,
+            shape=(n_bases,),
+        )
+
+    # 4. Read the bed file and write the mask to the zarr dataset,
+    # updating for each entry; many I/O operations; better read entire
+    # file, store regions by chromosomes and generate index by
+    # chromosome for all regions?
+    mask_array = store[bed_array]
+    with BedReader(bed_path) as br:
+        for line in br.fh:
+            line = line.strip()
+            if not line or line.startswith(("#", "track ", "browser ")):
+                continue
+            # Columns beyond the third (name, score, ...) are ignored.
+            chrom, start, end = line.split("\t", 3)[:3]
+            i = chrom_d[chrom]
+            start, end = int(start), int(end)
+            # An interval running past the contig end would otherwise
+            # spill into the next contig of the concatenated layout.
+            if not 0 <= start <= end <= contig_lengths[i]:
+                raise ValueError(
+                    f"Invalid BED interval {chrom}:{start}-{end} for contig"
+                    f" of length {contig_lengths[i]}."
+                )
+            if start == end:
+                continue
+            offset = int(contig_indices[i])
+            mask_array[offset + start : offset + end] = 1
@@ -8,7 +8,7 @@
 import numcodecs
 import tabulate
 
-from . import plink, provenance, vcf2zarr, vcf_utils
+from . import plink, provenance, vcf2zarr, vcf_utils, bed2zarr
 from .vcf2zarr import icf as icf_mod
 
 logger = logging.getLogger(__name__)
@@ -574,6 +574,41 @@ def plink2zarr():
 plink2zarr.add_command(convert_plink)
 
 
+@click.command
+@version
+@click.argument(
+    "bed_path",
+    type=click.Path(exists=True, dir_okay=False),
+)
+@zarr_path
+# click passes arguments to the callback by keyword, so this name must
+# match the ``bed_array`` parameter of the function below.
+@click.argument(
+    "bed_array",
+    type=str,
+)
+@verbose
+@force
+@progress
+def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress):
+    """
+    Convert BED file to the Zarr format. The BED regions will be
+    converted to binary-encoded arrays whose length is equal to the
+    length of the reference genome. The BED file regions are used to
+    mask the reference genome, where the masked regions are set to 1
+    and the unmasked regions are set to 0.
+
+    The BED file must be compressed and tabix-indexed.
+    """
+    setup_logging(verbose)
+    # The mask is stored as the array ``bed_array`` inside the store, so
+    # the overwrite check is applied to that sub-path only.
+    path = pathlib.Path(zarr_path) / bed_array
+    check_overwrite_dir(path, force)
+    # ``bed2zarr`` is the imported module here; the conversion function
+    # of the same name lives inside it.
+    bed2zarr.bed2zarr(
+        bed_path,
+        zarr_path,
+        bed_array,
+        show_progress=progress,
+    )
+
+
 @click.command
 @version
 @vcfs
 
@@ -45,6 +45,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr"
 documentation = "https://sgkit-dev.github.io/bio2zarr/"
 
 [project.scripts]
+# Must target the click command; bio2zarr.cli.bed2zarr is the imported module.
+bed2zarr = "bio2zarr.cli:bed2zarr_main"
 vcf2zarr = "bio2zarr.cli:vcf2zarr_main"
 vcfpartition = "bio2zarr.cli:vcfpartition"
 
 
@@ -0,0 +1,50 @@
+import pytest
+import sgkit as sg
+import xarray.testing as xt
+import zarr
+from bio2zarr import bed2zarr, vcf2zarr
+
+
+class Test1kgBed:
+    """Add a BED mask to the store encoded from the 1kg chr20 BCF."""
+
+    data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf"
+    bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz"
+    csi_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi"
+
+    @pytest.fixture(scope="module")
+    def icf(self, tmp_path_factory):
+        """Run ``vcf2zarr.explode`` on the BCF and return the output path."""
+        out = tmp_path_factory.mktemp("data") / "1kg_2020.exploded"
+        vcf2zarr.explode(out, [self.data_path])
+        return out
+
+    # Named ``zarr_path`` rather than ``zarr`` so the test can still
+    # reach the imported ``zarr`` module to inspect the result.
+    @pytest.fixture(scope="module")
+    def zarr_path(self, icf, tmp_path_factory):
+        """Run ``vcf2zarr.encode`` on the exploded data and return the
+        output path."""
+        out = tmp_path_factory.mktemp("data") / "1kg_2020.zarr"
+        vcf2zarr.encode(icf, out)
+        return out
+
+    def test_add_mask_chr20(self, zarr_path):
+        bed2zarr.bed2zarr(
+            bed_path=self.bed_path, zarr_path=zarr_path, show_progress=True
+        )
+        # The mask and its contig-index companion must both exist and
+        # span the concatenated genome.
+        root = zarr.open(zarr_path, mode="r")
+        n_bases = int(root["contig_length"][:].sum())
+        for name in ("bed_mask", "bed_mask_contig"):
+            assert name in root
+            assert root[name].shape == (n_bases,)
+
+
+class TestSampleBed:
+    """Mask insertion against the store encoded from ``sample.bcf``,
+    which is expected to fail with ``ValueError``."""
+
+    data_path = "tests/data/vcf/sample.bcf"
+    bed_path = "tests/data/bed/sample_mask.bed.gz"
+    csi_path = "tests/data/bed/sample_mask.bed.gz.csi"
+
+    @pytest.fixture(scope="module")
+    def icf(self, tmp_path_factory):
+        """Run ``vcf2zarr.explode`` on the BCF and return the output path."""
+        workdir = tmp_path_factory.mktemp("data")
+        exploded = workdir / "sample.exploded"
+        vcf2zarr.explode(exploded, [self.data_path])
+        return exploded
+
+    @pytest.fixture(scope="module")
+    def zarr(self, icf, tmp_path_factory):
+        """Run ``vcf2zarr.encode`` on the exploded data and return the
+        output path."""
+        workdir = tmp_path_factory.mktemp("data")
+        encoded = workdir / "sample.zarr"
+        vcf2zarr.encode(icf, encoded)
+        return encoded
+
+    def test_add_mask_sample(self, zarr):
+        call_kwargs = dict(
+            bed_path=self.bed_path, zarr_path=zarr, show_progress=True
+        )
+        with pytest.raises(ValueError):
+            bed2zarr.bed2zarr(**call_kwargs)