Skip to content

Commit 9af381d

Browse files
Will-Tylerjeromekelleher
authored andcommitted
Only compute LAA if not present
Only add LAA field if not present Infer observed alleles from PL field
1 parent 0415869 commit 9af381d

File tree

4 files changed

+87
-11
lines changed

4 files changed

+87
-11
lines changed

bio2zarr/vcf2zarr/icf.py

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ def scan_vcf(path, target_num_partitions, *, local_alleles):
238238

239239
# Indicates whether vcf2zarr can introduce local alleles
240240
can_localize = False
241+
should_add_laa_field = True
241242
fields = fixed_vcf_field_definitions()
242243
for h in vcf.header_iter():
243244
if h["HeaderType"] in ["INFO", "FORMAT"]:
@@ -248,8 +249,10 @@ def scan_vcf(path, target_num_partitions, *, local_alleles):
248249
fields.append(field)
249250
if field.category == "FORMAT" and field.name in {"GT", "AD"}:
250251
can_localize = True
252+
if (field.category, field.name) == ("FORMAT", "LAA"):
253+
should_add_laa_field = False
251254

252-
if local_alleles and can_localize:
255+
if local_alleles and can_localize and should_add_laa_field:
253256
laa_field = VcfField(
254257
category="FORMAT",
255258
name="LAA",
@@ -509,8 +512,36 @@ def positive_pad(arr, *, length):
509512
)
510513
alleles = np.concatenate((alleles, depths), axis=1)
511514
if "PL" in variant.FORMAT:
512-
# TODO
513-
pass
515+
516+
def infer_and_pad(arr: np.ndarray, *, ploidy: int, length: int):
517+
assert ploidy in {1, 2}
518+
indices = arr.nonzero()[0]
519+
520+
if ploidy == 2:
521+
b = np.ceil(np.sqrt(2 * indices + 9 / 4) - 3 / 2)
522+
a = indices - b * (b + 1) / 2
523+
pad_length = length - len(a) - len(b)
524+
else:
525+
a = indices
526+
b = np.empty(0)
527+
pad_length = length - len(a)
528+
return np.pad(
529+
np.concatenate((a, b)),
530+
(0, pad_length),
531+
mode="constant",
532+
constant_values=0,
533+
)
534+
535+
likelihoods = variant.format("PL")
536+
ploidy = variant.ploidy
537+
likelihoods = np.apply_along_axis(
538+
infer_and_pad,
539+
axis=1,
540+
arr=likelihoods,
541+
length=ploidy * likelihoods.shape[1],
542+
ploidy=ploidy,
543+
)
544+
alleles = np.concatenate((alleles, likelihoods), axis=1)
514545

515546
max_unique_size = 1
516547

@@ -529,7 +560,7 @@ def unique_pad(arr: np.ndarray, *, length: int):
529560
)
530561

531562
alleles = np.apply_along_axis(
532-
unique_pad, axis=1, arr=alleles, length=alleles.shape[0]
563+
unique_pad, axis=1, arr=alleles, length=alleles.shape[1]
533564
)
534565
alleles = alleles[:, :max_unique_size]
535566

@@ -1165,12 +1196,14 @@ def process_partition(self, partition_index):
11651196
val = variant.genotype.array()
11661197
tcw.append("FORMAT/GT", val)
11671198
for field in format_fields:
1168-
if field.full_name == "FORMAT/LAA":
1169-
laa_val = compute_laa_field(variant)
1170-
tcw.append("FORMAT/LAA", laa_val)
1199+
if (
1200+
field.full_name == "FORMAT/LAA"
1201+
and "LAA" not in variant.FORMAT
1202+
):
1203+
val = compute_laa_field(variant)
11711204
else:
11721205
val = variant.format(field.name)
1173-
tcw.append(field.full_name, val)
1206+
tcw.append(field.full_name, val)
11741207

11751208
# Note: an issue with updating the progress per variant here like
11761209
# this is that we get a significant pause at the end of the counter
845 Bytes
Binary file not shown.
118 Bytes
Binary file not shown.

tests/test_vcf_examples.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,27 @@ def test_vcf_field_description(self, ds, field, description):
440440
assert ds[field].attrs["description"] == description
441441

442442

443+
class TestLocalAllelesExample:
444+
data_path = "tests/data/vcf/local_alleles.vcf.gz"
445+
446+
@pytest.fixture(scope="class")
447+
def ds(self, tmp_path_factory):
448+
out = tmp_path_factory.mktemp("data") / "local_alleles.vcf.zarr"
449+
vcf2zarr.convert([self.data_path], out, worker_processes=0)
450+
return sg.load_dataset(out)
451+
452+
def test_call_LAA(self, ds):
453+
call_LAA = [
454+
[[1, -2, -2], [1, -2, -2]],
455+
[[1, -2, -2], [1, -2, -2]],
456+
[[1, 2, -2], [1, 2, -2]],
457+
[[1, 2, 3], [2, 3, -2]],
458+
[[1, -2, -2], [1, -2, -2]],
459+
[[2, -2, -2], [1, -2, -2]],
460+
]
461+
nt.assert_array_equal(ds.call_LAA.values, call_LAA)
462+
463+
443464
class Test1000G2020Example:
444465
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"
445466

@@ -546,9 +567,31 @@ def test_call_AD(self, ds):
546567
nt.assert_array_equal(ds.call_AD.values, call_AD)
547568

548569
def test_call_LAA(self, ds):
549-
# The shape is (23, 3, 1).
550-
# None of the entries use any alternate alleles.
551-
call_LAA = np.full((23, 3, 1), -2, dtype=int)
570+
call_LAA = [
571+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
572+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
573+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
574+
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
575+
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
576+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
577+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
578+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
579+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
580+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
581+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
582+
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
583+
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
584+
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
585+
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
586+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
587+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
588+
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
589+
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
590+
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
591+
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
592+
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
593+
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
594+
]
552595
nt.assert_array_equal(ds.call_LAA.values, call_LAA)
553596

554597
def test_call_PID(self, ds):

0 commit comments

Comments
 (0)