Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 2 additions & 43 deletions bio2zarr/vcf2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def compute_laa_field(variant) -> np.ndarray:
The LAA field is a list of one-based indices into the ALT alleles
that indicates which alternate alleles are observed in the sample.

This method infers which alleles are observed from the GT, AD, and PL fields.
This method infers which alleles are observed from the GT field.
"""
sample_count = variant.num_called + variant.num_unknown
alt_allele_count = len(variant.ALT)
Expand All @@ -528,48 +528,6 @@ def compute_laa_field(variant) -> np.ndarray:
np.bincount, axis=1, arr=genotypes, minlength=allele_count
)
allele_counts += genotype_allele_counts
if "AD" in variant.FORMAT:
depths = variant.format("AD")
depths.clip(0, None, out=depths)

def bincount_nonzero(arr, *, minlength):
# nonzero returns the indices of the nonzero elements for each axis
return np.bincount(arr.nonzero()[0], minlength=minlength)

depths_allele_counts = np.apply_along_axis(
bincount_nonzero, axis=1, arr=depths, minlength=allele_count
)
allele_counts += depths_allele_counts
if "PL" in variant.FORMAT:
likelihoods = variant.format("PL")
likelihoods.clip(0, None, out=likelihoods)
# n is the indices of the nonzero likelihoods
n = np.tile(np.arange(likelihoods.shape[1]), (likelihoods.shape[0], 1))
assert n.shape == likelihoods.shape
n[likelihoods <= 0] = 0
ploidy = variant.ploidy

if ploidy == 1:
a = n
b = np.zeros_like(a)
elif ploidy == 2:
# We have n = b(b+1) / 2 + a
# We need to compute a and b
b = np.ceil(np.sqrt(2 * n + 9 / 4) - 3 / 2).astype(int)
a = (n - b * (b + 1) / 2).astype(int)
else:
# TODO: Handle all possible ploidy
raise ValueError(f"Cannot handle ploidy = {ploidy}")

a_counts = np.apply_along_axis(
np.bincount, axis=1, arr=a, minlength=allele_count
)
b_counts = np.apply_along_axis(
np.bincount, axis=1, arr=b, minlength=allele_count
)
assert a_counts.shape == b_counts.shape == allele_counts.shape
allele_counts += a_counts
allele_counts += b_counts

allele_counts[:, 0] = 0 # We don't count the reference allele
max_row_length = 1
Expand Down Expand Up @@ -640,6 +598,7 @@ def compute_lpl_field(variant, laa_val: np.ndarray) -> np.ndarray:
pl_val = np.broadcast_to(pl_val, n.shape)
row_index = np.arange(pl_val.shape[0]).reshape(-1, 1)
lpl_val = pl_val[row_index, n]
lpl_val[b == constants.INT_FILL] = constants.INT_FILL

return lpl_val

Expand Down
135 changes: 20 additions & 115 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,41 +446,41 @@ def ds(self, tmp_path_factory):

def test_call_LAA(self, ds):
call_LAA = [
[[1, -2, -2], [1, -2, -2]],
[[1, -2, -2], [1, -2, -2]],
[[1, 2, -2], [1, 2, -2]],
[[1, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [1, -2, -2]],
[[1, -2, -2], [2, -2, -2]],
[[1, 2, 3], [2, 3, -2]],
[[1, -2, -2], [1, -2, -2]],
[[2, -2, -2], [1, -2, -2]],
[[-2, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [1, -2, -2]],
[[1, -2, -2], [-1, -2, -2]],
[[2, -2, -2], [2, -2, -2]],
[[1, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [-2, -2, -2]],
[[1, -2, -2], [1, -2, -2]],
[[-2, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [1, -2, -2]],
]
nt.assert_array_equal(ds.call_LAA.values, call_LAA)

def test_call_LPL(self, ds):
call_LPL = [
[
[100, 0, 105, -2, -2, -2, -2, -2, -2, -2],
[0, 100, 200, -2, -2, -2, -2, -2, -2, -2],
[0, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[0, 100, 200, -2, -2, -2, -2, -2, -2, -2],
[0, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[154, 22, 0, -2, -2, -2, -2, -2, -2, -2],
],
[
[1002, 55, 1002, 0, 55, 1002, -2, -2, -2, -2],
[154, 154, 0, 154, 102, 102, -2, -2, -2, -2],
[1002, 55, 1002, -2, -2, -2, -2, -2, -2, -2],
[154, 154, 102, -2, -2, -2, -2, -2, -2, -2],
],
[
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
],
[
[30, 30, 30, -2, -2, -2, -2, -2, -2, -2],
[30, 60, 0, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[0, 30, -2, -2, -2, -2, -2, -2, -2, -2],
Expand All @@ -495,12 +495,12 @@ def test_call_LPL(self, ds):
[10, 40, 60, -2, -2, -2, -2, -2, -2, -2],
],
[
[30, 30, 30, -2, -2, -2, -2, -2, -2, -2],
[30, -1, 0, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[-1, -1, -1, -2, -2, -2, -2, -2, -2, -2],
[-1, -1, -1, -2, -2, -2, -2, -2, -2, -2],
[-1, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[-1, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[-1, -1, -2, -2, -2, -2, -2, -2, -2, -2],
Expand Down Expand Up @@ -637,107 +637,12 @@ def test_call_AD(self, ds):
nt.assert_array_equal(ds.call_AD.values, call_AD)

def test_call_LAA(self, ds):
call_LAA = [
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
]
# All the genotypes are 0/0
call_LAA = np.full((23, 3, 1), -2)
nt.assert_array_equal(ds.call_LAA.values, call_LAA)

def test_call_LPL(self, ds):
# fmt: off
call_LPL = [
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]]
]
# fmt: on
call_LPL = np.tile([0, -2, -2], (23, 3, 1))
nt.assert_array_equal(ds.call_LPL.values, call_LPL)

def test_call_PID(self, ds):
Expand Down
Loading