Skip to content

Commit 5abbfad

Browse files
Merge pull request #24 from jeromekelleher/basic-plink
Basic plink tests
2 parents d9bf347 + 9d956bd commit 5abbfad

16 files changed

+259
-137
lines changed

bio2zarr/cli.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import coloredlogs
44

55
from . import vcf
6+
from . import plink
67

78
# Common arguments/options
89
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
@@ -112,14 +113,14 @@ def vcf2zarr():
112113

113114

114115
@click.command(name="convert")
115-
@click.argument("plink", type=click.Path())
116+
@click.argument("in_path", type=click.Path())
116117
@click.argument("out_path", type=click.Path())
117118
@worker_processes
118119
@click.option("--chunk-width", type=int, default=None)
119120
@click.option("--chunk-length", type=int, default=None)
120-
def convert_plink(plink, out_path, worker_processes, chunk_width, chunk_length):
121-
vcf.convert_plink(
122-
plink,
121+
def convert_plink(in_path, out_path, worker_processes, chunk_width, chunk_length):
122+
plink.convert(
123+
in_path,
123124
out_path,
124125
show_progress=True,
125126
worker_processes=worker_processes,

bio2zarr/core.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,19 @@
1010
import zarr
1111
import numpy as np
1212
import tqdm
13+
import numcodecs
1314

1415

1516
logger = logging.getLogger(__name__)
1617

18+
numcodecs.blosc.use_threads = False
19+
20+
# TODO this should probably go in another module where we abstract
21+
# out the zarr defaults
22+
default_compressor = numcodecs.Blosc(
23+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
24+
)
25+
1726

1827
class SynchronousExecutor(cf.Executor):
1928
def submit(self, fn, /, *args, **kwargs):

bio2zarr/vcf.py

Lines changed: 6 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
import tqdm
2121
import zarr
2222

23-
import bed_reader
24-
2523
from . import core
2624

2725
logger = logging.getLogger(__name__)
@@ -38,12 +36,6 @@
3836
[0x7F800001, 0x7F800002], dtype=np.int32
3937
)
4038

41-
numcodecs.blosc.use_threads = False
42-
43-
default_compressor = numcodecs.Blosc(
44-
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
45-
)
46-
4739

4840
def assert_all_missing_float(a):
4941
v = np.array(a, dtype=np.float32).view(np.int32)
@@ -437,6 +429,8 @@ def __init__(self, vcf_field, base_path):
437429
else:
438430
self.path = base_path / vcf_field.category / vcf_field.name
439431

432+
# TODO Check if other compressors would give reasonable compression
433+
# with significantly faster times
440434
self.compressor = numcodecs.Blosc(cname="zstd", clevel=7)
441435
# TODO have a clearer way of defining this state between
442436
# read and write mode.
@@ -905,7 +899,7 @@ def generate(pcvcf, chunk_length=None, chunk_width=None):
905899
if chunk_length is None:
906900
chunk_length = 10_000
907901

908-
compressor = default_compressor.get_config()
902+
compressor = core.default_compressor.get_config()
909903

910904
def fixed_field_spec(
911905
name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1136,7 +1130,7 @@ def encode_samples(self, pcvcf, sample_id, chunk_width):
11361130
"sample_id",
11371131
sample_id,
11381132
dtype="str",
1139-
compressor=default_compressor,
1133+
compressor=core.default_compressor,
11401134
chunks=(chunk_width,),
11411135
)
11421136
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -1147,7 +1141,7 @@ def encode_contig(self, pcvcf, contig_names, contig_lengths):
11471141
"contig_id",
11481142
contig_names,
11491143
dtype="str",
1150-
compressor=default_compressor,
1144+
compressor=core.default_compressor,
11511145
)
11521146
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
11531147

@@ -1181,7 +1175,7 @@ def encode_filters(self, pcvcf, filter_names):
11811175
"filter_id",
11821176
filter_names,
11831177
dtype="str",
1184-
compressor=default_compressor,
1178+
compressor=core.default_compressor,
11851179
)
11861180
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
11871181

@@ -1339,41 +1333,6 @@ def convert_vcf(
13391333
)
13401334

13411335

1342-
def encode_bed_partition_genotypes(
1343-
bed_path, zarr_path, start_variant, end_variant, encoder_threads=8
1344-
):
1345-
bed = bed_reader.open_bed(bed_path, num_threads=1)
1346-
1347-
store = zarr.DirectoryStore(zarr_path)
1348-
root = zarr.group(store=store)
1349-
gt = core.BufferedArray(root["call_genotype"])
1350-
gt_mask = core.BufferedArray(root["call_genotype_mask"])
1351-
gt_phased = core.BufferedArray(root["call_genotype_phased"])
1352-
chunk_length = gt.array.chunks[0]
1353-
assert start_variant % chunk_length == 0
1354-
1355-
buffered_arrays = [gt, gt_phased, gt_mask]
1356-
1357-
with core.ThreadedZarrEncoder(buffered_arrays, encoder_threads) as te:
1358-
start = start_variant
1359-
while start < end_variant:
1360-
stop = min(start + chunk_length, end_variant)
1361-
bed_chunk = bed.read(index=slice(start, stop), dtype="int8").T
1362-
# Note could do this without iterating over rows, but it's a bit
1363-
# simpler and the bottleneck is in the encoding step anyway. It's
1364-
# also nice to have updates on the progress monitor.
1365-
for values in bed_chunk:
1366-
j = te.next_buffer_row()
1367-
dest = gt.buff[j]
1368-
dest[values == -127] = -1
1369-
dest[values == 2] = 1
1370-
dest[values == 1, 0] = 1
1371-
gt_phased.buff[j] = False
1372-
gt_mask.buff[j] = dest == -1
1373-
core.update_progress(1)
1374-
start = stop
1375-
1376-
13771336
def validate(vcf_path, zarr_path, show_progress=False):
13781337
store = zarr.DirectoryStore(zarr_path)
13791338

@@ -1508,89 +1467,3 @@ def validate(vcf_path, zarr_path, show_progress=False):
15081467
print(vcf_val)
15091468
print(zarr_val)
15101469
assert False
1511-
1512-
1513-
def convert_plink(
1514-
bed_path,
1515-
zarr_path,
1516-
*,
1517-
show_progress,
1518-
worker_processes=1,
1519-
chunk_length=None,
1520-
chunk_width=None,
1521-
):
1522-
bed = bed_reader.open_bed(bed_path, num_threads=1)
1523-
n = bed.iid_count
1524-
m = bed.sid_count
1525-
del bed
1526-
1527-
# FIXME
1528-
if chunk_width is None:
1529-
chunk_width = 1000
1530-
if chunk_length is None:
1531-
chunk_length = 10_000
1532-
1533-
store = zarr.DirectoryStore(zarr_path)
1534-
root = zarr.group(store=store, overwrite=True)
1535-
1536-
ploidy = 2
1537-
shape = [m, n]
1538-
chunks = [chunk_length, chunk_width]
1539-
dimensions = ["variants", "samples"]
1540-
1541-
a = root.empty(
1542-
"call_genotype_phased",
1543-
dtype="bool",
1544-
shape=list(shape),
1545-
chunks=list(chunks),
1546-
compressor=default_compressor,
1547-
)
1548-
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
1549-
1550-
shape += [ploidy]
1551-
dimensions += ["ploidy"]
1552-
a = root.empty(
1553-
"call_genotype",
1554-
dtype="i8",
1555-
shape=list(shape),
1556-
chunks=list(chunks),
1557-
compressor=default_compressor,
1558-
)
1559-
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
1560-
1561-
a = root.empty(
1562-
"call_genotype_mask",
1563-
dtype="bool",
1564-
shape=list(shape),
1565-
chunks=list(chunks),
1566-
compressor=default_compressor,
1567-
)
1568-
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
1569-
1570-
num_chunks = max(1, m // chunk_length)
1571-
worker_processes = min(worker_processes, num_chunks)
1572-
if num_chunks == 1 or worker_processes == 1:
1573-
partitions = [(0, m)]
1574-
else:
1575-
# Generate num_workers partitions
1576-
# TODO finer grained might be better.
1577-
partitions = []
1578-
chunk_boundaries = [
1579-
p[0] for p in np.array_split(np.arange(num_chunks), worker_processes)
1580-
]
1581-
for j in range(len(chunk_boundaries) - 1):
1582-
start = chunk_boundaries[j] * chunk_length
1583-
end = chunk_boundaries[j + 1] * chunk_length
1584-
end = min(end, m)
1585-
partitions.append((start, end))
1586-
last_stop = partitions[-1][-1]
1587-
if last_stop != m:
1588-
partitions.append((last_stop, m))
1589-
# print(partitions)
1590-
1591-
progress_config = core.ProgressConfig(
1592-
total=m, title="Convert", units="vars", show=show_progress
1593-
)
1594-
with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
1595-
for start, end in partitions:
1596-
pwm.submit(encode_bed_partition_genotypes, bed_path, zarr_path, start, end)

tests/data/plink/example.bed

9 Bytes
Binary file not shown.

tests/data/plink/example.bim

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1 1_10 0 10 A G
2+
1 1_20 0 20 T C

tests/data/plink/example.fam

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ind0 ind0 0 0 0 -9
2+
ind1 ind1 0 0 0 -9
3+
ind2 ind2 0 0 0 -9
4+
ind3 ind3 0 0 0 -9
5+
ind4 ind4 0 0 0 -9
6+
ind5 ind5 0 0 0 -9
7+
ind6 ind6 0 0 0 -9
8+
ind7 ind7 0 0 0 -9
9+
ind8 ind8 0 0 0 -9
10+
ind9 ind9 0 0 0 -9

tests/data/plink/example.map

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1 1_10 0 10
2+
1 1_20 0 20

tests/data/plink/example.nosex

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ind0 ind0
2+
ind1 ind1
3+
ind2 ind2
4+
ind3 ind3
5+
ind4 ind4
6+
ind5 ind5
7+
ind6 ind6
8+
ind7 ind7
9+
ind8 ind8
10+
ind9 ind9

tests/data/plink/example.ped

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ind0 ind0 0 0 0 0 A A T T
2+
ind1 ind1 0 0 0 0 A A T T
3+
ind2 ind2 0 0 0 0 A A T T
4+
ind3 ind3 0 0 0 0 G G T T
5+
ind4 ind4 0 0 0 0 G G C C
6+
ind5 ind5 0 0 0 0 G G C C
7+
ind6 ind6 0 0 0 0 G G C C
8+
ind7 ind7 0 0 0 0 G G C C
9+
ind8 ind8 0 0 0 0 G G C C
10+
ind9 ind9 0 0 0 0 G G C C
9 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)