Skip to content

Commit 8e97bc3

Browse files
committed
Draft bed2zarr code
Draft code to convert Bed3-format to Zarr.
1 parent 6aaee4f commit 8e97bc3

9 files changed

+204
-1
lines changed

bio2zarr/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def bio2zarr():
1515
# is handy for development and for those whose PATHs aren't set
1616
# up in the right way.
1717
bio2zarr.add_command(cli.vcf2zarr_main)
18+
bio2zarr.add_command(cli.bed2zarr_main)
1819
bio2zarr.add_command(cli.plink2zarr)
1920
bio2zarr.add_command(cli.vcfpartition)
2021

bio2zarr/bed2zarr.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import dataclasses
2+
import numpy as np
3+
import pathlib
4+
import gzip
5+
import zarr
6+
from . import core
7+
8+
9+
# see https://samtools.github.io/hts-specs/BEDv1.pdf
10+
@dataclasses.dataclass
11+
class Bed3:
12+
"""BED3 genomic region with chromosome, start, and end. Intervals
13+
are 0-based, half-open."""
14+
15+
chrom: str
16+
start: int
17+
end: int
18+
19+
@property
20+
def width(self):
21+
"""Width of the region."""
22+
return self.end - self.start
23+
24+
def __len__(self):
25+
return self.width
26+
27+
def mask(self, invert=False):
28+
"""Create a mask for the region. The mask is an array of 1's
29+
(0's if inverted)."""
30+
func = np.zeros if invert else np.ones
31+
return func(self.width, dtype=np.uint8)
32+
33+
34+
class BedReader:
35+
def __init__(self, bed_path):
36+
self.bed_path = pathlib.Path(bed_path)
37+
38+
def __enter__(self):
39+
if self.bed_path.suffix == ".gz":
40+
self.fh = gzip.open(self.bed_path, "rt")
41+
else:
42+
self.fh = self.bed_path.open("rt")
43+
44+
return self
45+
46+
def __exit__(self, *args):
47+
self.fh.close()
48+
49+
50+
# Here we are assuming that we write a mask. However, the BED file
51+
# could represent other things, such as scores, and there could be up
52+
# to 9 columns, in which case more fields (aka data arrays?) would be
53+
# needed.
54+
def bed2zarr(
55+
bed_path,
56+
zarr_path,
57+
bed_array="bed_mask", # More generic name?
58+
show_progress=False,
59+
):
60+
# 1. Make sure the bed file is gzipped and indexed
61+
bed_path = pathlib.Path(bed_path)
62+
63+
if bed_path.suffix != ".gz":
64+
raise ValueError("BED file must be gzipped.")
65+
if (
66+
not bed_path.with_suffix(".gz.csi").exists()
67+
or not bed_path.with_suffix(".gz.tbi").exists()
68+
):
69+
raise ValueError("BED file must be indexed.")
70+
71+
# 2. Make sure there are contig lengths
72+
store = zarr.open(zarr_path)
73+
if "contig_length" not in store:
74+
raise ValueError(
75+
(
76+
"No contig lengths in Zarr store. Contig lengths must be"
77+
" present in the Zarr store before writing Bed entries."
78+
)
79+
)
80+
# 2b. Make chromosome to integer mapping
81+
chrom_d = {
82+
k: v for k, v in zip(store["contig_id"], np.arange(len(store["contig_id"])))
83+
}
84+
# 2c. Make cumulative index of contig lengths
85+
contig_indices = np.insert(np.cumsum(store["contig_length"])[:-1], 0, 0)
86+
87+
# 3. Init the zarr group with the contig lengths
88+
# bed_array and bed_array_contig are of equal lengths = total genome
89+
if bed_array not in store:
90+
bed_array_contig = f"{bed_array}_contig"
91+
dtype = core.min_int_dtype(0, len(store["contig_id"]))
92+
n_bases = np.sum(store["contig_length"])
93+
94+
store.create_dataset(bed_array, fill_value=0, dtype=dtype, shape=(n_bases,))
95+
store.create_dataset(
96+
bed_array_contig,
97+
data=np.repeat(
98+
np.arange(len(store["contig_id"])), store["contig_length"]
99+
).astype(dtype),
100+
dtype=dtype,
101+
shape=(n_bases,),
102+
)
103+
104+
# 4. Read the bed file and write the mask to the zarr dataset,
105+
# updating for each entry; many I/O operations; better read entire
106+
# file, store regions by chromosomes and generate index by
107+
# chromosome for all regions?
108+
with BedReader(bed_path) as br:
109+
for line in br.fh:
110+
chrom, start, end = line.strip().split("\t")
111+
i = chrom_d[chrom]
112+
start = int(start) + contig_indices[i]
113+
end = int(end) + contig_indices[i]
114+
bed = Bed3(chrom, start, end)
115+
mask = bed.mask()
116+
store[bed_array][start:end] = mask

bio2zarr/cli.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numcodecs
99
import tabulate
1010

11-
from . import plink, provenance, vcf2zarr, vcf_utils
11+
from . import plink, provenance, vcf2zarr, vcf_utils, bed2zarr
1212
from .vcf2zarr import icf as icf_mod
1313

1414
logger = logging.getLogger(__name__)
@@ -574,6 +574,41 @@ def plink2zarr():
574574
plink2zarr.add_command(convert_plink)
575575

576576

577+
@click.command
578+
@version
579+
@click.argument(
580+
"bed_path",
581+
type=click.Path(exists=True, dir_okay=False),
582+
)
583+
@zarr_path
584+
@click.argument(
585+
"zarr_field",
586+
type=str,
587+
)
588+
@verbose
589+
@force
590+
@progress
591+
def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress):
592+
"""
593+
Convert BED file to the Zarr format. The BED regions will be
594+
converted to binary-encoded arrays whose length is equal to the
595+
length of the reference genome. The BED file regions are used to
596+
mask the reference genome, where the masked regions are set to 1
597+
and the unmasked regions are set to 0.
598+
599+
The BED file must be compressed and tabix-indexed.
600+
"""
601+
setup_logging(verbose)
602+
path = pathlib.Path(zarr_path) / bed_array
603+
check_overwrite_dir(path, force)
604+
bed2zarr(
605+
bed_path,
606+
zarr_path,
607+
bed_array,
608+
show_progress=progress,
609+
)
610+
611+
577612
@click.command
578613
@version
579614
@vcfs

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr"
4545
documentation = "https://sgkit-dev.github.io/bio2zarr/"
4646

4747
[project.scripts]
48+
bed2zarr = "bio2zarr.cli:bed2zarr"
4849
vcf2zarr = "bio2zarr.cli:vcf2zarr_main"
4950
vcfpartition = "bio2zarr.cli:vcfpartition"
5051

95 Bytes
Binary file not shown.
122 Bytes
Binary file not shown.

tests/data/bed/sample_mask.bed.gz

84 Bytes
Binary file not shown.
149 Bytes
Binary file not shown.

tests/test_bed.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import pytest
2+
import sgkit as sg
3+
import xarray.testing as xt
4+
import zarr
5+
from bio2zarr import bed2zarr, vcf2zarr
6+
7+
8+
class Test1kgBed:
9+
data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf"
10+
bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz"
11+
csi_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi"
12+
13+
@pytest.fixture(scope="module")
14+
def icf(self, tmp_path_factory):
15+
out = tmp_path_factory.mktemp("data") / "1kg_2020.exploded"
16+
vcf2zarr.explode(out, [self.data_path])
17+
return out
18+
19+
@pytest.fixture(scope="module")
20+
def zarr(self, icf, tmp_path_factory):
21+
out = tmp_path_factory.mktemp("data") / "1kg_2020.zarr"
22+
vcf2zarr.encode(icf, out)
23+
return out
24+
25+
def test_add_mask_chr20(self, zarr):
26+
bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True)
27+
28+
29+
class TestSampleBed:
30+
data_path = "tests/data/vcf/sample.bcf"
31+
bed_path = "tests/data/bed/sample_mask.bed.gz"
32+
csi_path = "tests/data/bed/sample_mask.bed.gz.csi"
33+
34+
@pytest.fixture(scope="module")
35+
def icf(self, tmp_path_factory):
36+
out = tmp_path_factory.mktemp("data") / "sample.exploded"
37+
vcf2zarr.explode(out, [self.data_path])
38+
return out
39+
40+
@pytest.fixture(scope="module")
41+
def zarr(self, icf, tmp_path_factory):
42+
out = tmp_path_factory.mktemp("data") / "sample.zarr"
43+
vcf2zarr.encode(icf, out)
44+
return out
45+
46+
def test_add_mask_sample(self, zarr):
47+
with pytest.raises(ValueError):
48+
bed2zarr.bed2zarr(
49+
bed_path=self.bed_path, zarr_path=zarr, show_progress=True
50+
)

0 commit comments

Comments
 (0)