Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions tests/test_variantdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1128,3 +1128,204 @@ def test_with_variant_data(self, tmp_path):
else:
allele_idx = -1
assert vdata.sites_ancestral_allele[i] == allele_idx


class TestFromArrays:
def demo_data(self):
# returns pos, data, alleles, ancestral
return [
list(data)
for data in zip(
*[
(3, [[0, 1], [0, 0], [0, 0]], ["A", "T", ""], "A"),
(10, [[0, 1], [1, 1], [0, 0]], ["C", "A", ""], "C"),
(13, [[0, 1], [1, 0], [0, 0]], ["G", "C", ""], "C"),
(19, [[0, 0], [0, 1], [1, 0]], ["A", "C", ""], "A"),
(20, [[0, 1], [2, 0], [0, 0]], ["T", "G", "C"], "T"),
]
)
]

def test_simple_from_arrays(self):
pos, G, alleles, ancestral = self.demo_data()
vdata = tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)
assert vdata.num_individuals == 3
assert vdata.num_sites == 5
inf_ts = tsinfer.infer(vdata)
assert inf_ts.num_samples == 6
assert inf_ts.num_individuals == 3
assert inf_ts.num_sites == 5
assert np.all(inf_ts.sites_position == pos)

def test_named_from_arrays(self):
# When we pass sample_id names, they should be stored in the individuals metadata
pos, G, alleles, ancestral = self.demo_data()
sample_id = ["sample1", "sample2", "sample3"]
vdata = tsinfer.VariantData.from_arrays(
G, pos, alleles, ancestral, sample_id=sample_id
)
assert vdata.num_individuals == 3
inf_ts = tsinfer.infer(vdata)
assert inf_ts.num_individuals == 3
for name, ind in zip(sample_id, inf_ts.individuals()):
assert ind.metadata["variant_data_sample_id"] == name

def test_bad_variant_matrix(self):
pos, G, alleles, ancestral = self.demo_data()
G = np.array(G)
with pytest.raises(ValueError, match="must be a 3D array"):
tsinfer.VariantData.from_arrays([G], pos, alleles, ancestral)
with pytest.raises(ValueError, match="must be a 3D array"):
tsinfer.VariantData.from_arrays(G[:, :, 0], pos, alleles, ancestral)

def test_empty(self):
# Test with ploidy=1 but no sites
pos, G, alleles, ancestral = [], np.empty((0, 0, 1)), np.empty((0, 0)), []
with pytest.raises(ValueError, match="No sites exist"):
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)

def test_zero_ploidy(self):
pos, G, alleles, ancestral = [], [[[]]], np.empty((0, 0)), []
with pytest.raises(ValueError, match="Ploidy must be greater than zero"):
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)

def test_from_arrays_ancestral_missing_warning(self):
pos, G, alleles, ancestral = self.demo_data()
ancestral[0] = "-"
with pytest.warns(UserWarning, match=r"ancestral allele.+not found[\s\S]+'-'"):
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)

def test_sequence_length(self):
pos, G, alleles, ancestral = self.demo_data()
vdata = tsinfer.VariantData.from_arrays(
G, pos, alleles, ancestral, sequence_length=50
)
assert vdata.sequence_length == 50

def test_bad_sequence_length(self):
pos, G, alleles, ancestral = self.demo_data()
with pytest.raises(ValueError, match="`sequence_length` cannot be less"):
tsinfer.VariantData.from_arrays(
G, pos, alleles, ancestral, sequence_length=10
)

@pytest.mark.parametrize("pos", [[[3, 10, 13, 19, 20]], [3, 10, 13, 19]])
def test_bad_position(self, pos):
_, G, alleles, ancestral = self.demo_data()
with pytest.raises(ValueError, match="`variant_position` must be a 1D array"):
tsinfer.VariantData.from_arrays(G, [pos], alleles, ancestral)

def test_unordered_position(self):
pos, G, alleles, ancestral = self.demo_data()
pos[-1] = 5 # out of order
with pytest.raises(ValueError, match="out-of-order values"):
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral)

def test_bad_dim_alleles(self):
pos, G, alleles, ancestral = self.demo_data()
with pytest.raises(ValueError, match="`variant_allele` must be a 2D array"):
tsinfer.VariantData.from_arrays(G, pos, [alleles], ancestral)

def test_bad_alleles(self):
pos, G, alleles, ancestral = self.demo_data()
alleles = np.array(alleles)
with pytest.raises(ValueError, match="same number of rows as variants"):
tsinfer.VariantData.from_arrays(G, pos, alleles[1:, :], ancestral)

def test_bad_num_alleles(self):
pos, G, alleles, ancestral = self.demo_data()
alleles = np.array(alleles)
with pytest.raises(ValueError, match="same number of columns"):
tsinfer.VariantData.from_arrays(G, pos, alleles[:, 1:], ancestral)

def test_bad_ancestral_state_length(self):
pos, G, alleles, ancestral = self.demo_data()
ancestral = np.array(ancestral)
with pytest.raises(ValueError, match="`ancestral_state` must be a 1D array"):
tsinfer.VariantData.from_arrays(G, pos, alleles, [ancestral])
with pytest.raises(ValueError, match="`ancestral_state` must be a 1D array"):
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral[1:])

@pytest.mark.parametrize("sid", [["A"], []])
def test_bad_sample_id(self, sid):
pos, G, alleles, ancestral = self.demo_data()
print(sid)
with pytest.raises(ValueError, match="`sample_id` must be a 1D array"):
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral, sample_id=sid)

def test_sample_mask(self):
pos, G, alleles, ancestral = self.demo_data()
G = np.array(G)
mask = np.array([False, False, True])
keep = np.logical_not(mask)
alleles = np.array(alleles)
vdata = tsinfer.VariantData.from_arrays(
G, pos, alleles, ancestral, sample_mask=mask
)
assert vdata.num_individuals == 2
inf_ts = tsinfer.infer(vdata)
assert inf_ts.num_individuals == 2
for v, p, allele_arr in zip(inf_ts.variants(), pos, alleles):
expected_idx = G[v.site.id, keep, :].flatten()
assert v.site.position == p
assert np.array_equal(v.states(), allele_arr[expected_idx])

def test_site_mask(self):
pos, G, alleles, ancestral = self.demo_data()
G = np.array(G)
mask = np.array([False, False, True, False, False])
keep = np.logical_not(mask)
pos = np.array(pos)
alleles = np.array(alleles)
ancestral = np.array(ancestral)
vdata = tsinfer.VariantData.from_arrays(
G, pos, alleles, ancestral[keep], site_mask=mask
)
assert vdata.num_individuals == 3
inf_ts = tsinfer.infer(vdata)
used_sites = np.where(keep)[0]
for v, p, allele_arr in zip(inf_ts.variants(), pos[keep], alleles[keep]):
expected_idx = G[used_sites[v.site.id], :, :].flatten()
assert v.site.position == p
assert np.array_equal(v.states(), allele_arr[expected_idx])

def test_bad_site_mask_length(self):
pos, G, alleles, ancestral = self.demo_data()
mask = np.array([False, True, False]) # wrong length
with pytest.raises(ValueError, match="length as the total number of variants"):
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral, site_mask=mask)

def test_bad_sample_mask_length(self):
pos, G, alleles, ancestral = self.demo_data()
mask = np.array([False, True, True, False, True]) # wrong length
with pytest.raises(ValueError, match="length as the total number of samples"):
tsinfer.VariantData.from_arrays(
G, pos, alleles, ancestral, sample_mask=mask
)

def test_bad_ancestral_state_masked(self):
pos, G, alleles, ancestral = self.demo_data()
mask = np.array([False, False, True, False, False])
with pytest.raises(ValueError, match="`ancestral_state` must be a 1D array"):
# Need to provide ancestral states of the same length as *unmasked* sites
tsinfer.VariantData.from_arrays(G, pos, alleles, ancestral, site_mask=mask)

def test_round_trip_ts(self):
ts = msprime.sim_ancestry(10, sequence_length=1000, random_seed=123)
ts = msprime.sim_mutations(ts, rate=1e-2, random_seed=123)
samples = ts.individuals_nodes
G = []
alleles = []
for v in ts.variants():
G.append(v.genotypes[samples])
alleles.append(v.alleles + ("",) * (4 - len(v.alleles))) # pad to 4 alleles

vdata = tsinfer.VariantData.from_arrays(
G,
ts.sites_position,
alleles,
np.array(ts.sites_ancestral_state, dtype="U1"),
)
inf_ts = tsinfer.infer(vdata)
for v1, v2 in zip(inf_ts.variants(), ts.variants()):
assert np.array_equal(v1.states(), v2.states())
155 changes: 154 additions & 1 deletion tsinfer/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2424,7 +2424,6 @@ def __init__(
individuals_flags=None,
sequence_length=None,
):
self._sequence_length = sequence_length
self._contig_index = None
self._contig_id = None
try:
Expand Down Expand Up @@ -2494,6 +2493,9 @@ def process_array(
self.individuals_select = ~sample_mask.astype(bool)
self._num_sites = np.sum(self.sites_select)

if len(self.sites_select) == 0:
raise ValueError("No sites exist")

if np.sum(self.sites_select) == 0:
raise ValueError(
"All sites have been masked out, at least one value "
Expand Down Expand Up @@ -2647,6 +2649,157 @@ def process_array(
logger.info(
f"Number of individuals after applying mask: {self.num_individuals}"
)
self._sequence_length = sequence_length
if self.sequence_length <= self.sites_position[-1]:
raise ValueError(
"`sequence_length` cannot be less than or equal to the maximum "
"unmasked variant position"
)

@classmethod
def from_arrays(
cls,
variant_matrix_phased,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not call_genotype?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could indeed use that, but adding _phased is intended to indicate to any user that it is assumed phased. But if you are refactoring, then perhaps not worth me revisiting it?

If there is a neat bio2zarr way to make in-memory VariantData versions from simple arrays for testing, then great.

variant_position,
variant_allele,
ancestral_state,
*,
sample_id=None,
site_mask=None,
sample_mask=None,
**kwargs,
):
"""
Create a basic in-memory VariantData instance directly from array data.
Mainly useful for small test datasets. Larger datasets, or ones that require
metadata to be included, should use e.g. bio2zarr
to create a zarr datastore containing the required data and call
VariantData(path_to_zarr)

.. note::
If a ``site_mask`` or ``sample_mask`` is provided, this does not
require changing the size of the ``variant_position``, ``variant_allele``
or `sample_id` arrays. Which must always match the dimensions of the
``variant_matrix_phased`` array. However, if sites are masked out, this
*does* require changing the ``ancestral_state`` array to match the number of
unmasked sites (and similarly for other arrays passed as kwargs).

:param array variant_matrix_phased: a 3D array of variants X samples x ploidy,
giving an index into the allele array for each corresponding variant. Values
must be coercable into 8-bit (np.int8) integers. Data for all samples is
assumed to be phased. This corresponds to the ``call_genotype`` array in the
VCF Zarr specification (e.g. missing data can be coded as -1).
:param array variant_position: a 1D array of variant positions, of the same
length as the first dimension of ``variant_matrix_phased``.
:param array variant_allele: a 2D string array of variants x max_num_alleles at a
site. The length of the first dimension must match the first dimension of the
``variant_matrix_phased`` array. The second dimension must be at least as
long as the maximum allele index in the `variant_matrix_phased` array (i.e.
each allele list for a variant must be the same length; this can be ensured
by padding the list with `""`).
:param array ancestral_state: A numpy array of strings specifying
the ancestral states (alleles) used in inference. For unknown ancestral
alleles, any character which is not in the allele list can be used.
This must be the same length as the number of *unmasked* variants in
``variant_matrix_phased`` (see note above).
:param array sample_id: a 1D string array of sample names, of length of the
total number of n-ploid samples in ``variant_matrix_phased`` (i.e. the
second dimension of ``variant_matrix_phased``).
If None, each individual n-ploid sample will be
allocated an ID corresponding to its sample index in the *unmasked*
``variant_matrix_phased`` array (i.e. "0", "1", "2", .. etc.)
:param array site_mask: A numpy array of booleans of length specifying which
sites to mask out (exclude) from the dataset.
:param array sample_mask: A numpy array of booleans of length specifying which
samples to mask out (exclude) from the dataset.
:param \\**kwargs: Further arguments passed to the VariantData constructor.
In particular you may wish to specify `sequence_length`. Arrays for
``sites_time``, ``individuals_time`` etc. can also be provided.
"""
call_genotype = np.array(variant_matrix_phased, dtype=np.int8)
if call_genotype.ndim != 3:
raise ValueError("`variant_matrix_phased` must be a 3D array")

num_variants, num_samples, ploidy = call_genotype.shape
if ploidy == 0:
raise ValueError("Ploidy must be greater than zero")
variant_position = np.asarray(variant_position, dtype=np.float64)
if variant_position.shape != (num_variants,):
raise ValueError(
"`variant_position` must be a 1D array of the same length as "
"the number of variants in variant_matrix_phased"
)

variant_allele = np.asarray(variant_allele, dtype="U") # make unicode for zarr
if variant_allele.ndim != 2 or variant_allele.shape[0] != num_variants:
raise ValueError(
"`variant_allele` must be a 2D array with the same number of rows as "
"variants in `variant_matrix_phased`"
)

if sample_id is None:
sample_id = np.arange(call_genotype.shape[1]).astype(str)
sample_id = np.asarray(sample_id, dtype="U")
if sample_id.shape != (num_samples,):
raise ValueError(
"`sample_id` must be a 1D array of the same length as the total "
"number of samples in `variant_matrix_phased`"
)

if site_mask is None:
site_keep = slice(None)
else:
if site_mask.shape != (num_variants,):
raise ValueError(
"`site_mask` must be a 1D array of the same length as the total "
"number of variants in `variant_matrix_phased`"
)
site_keep = np.logical_not(site_mask) # turn into an inclusion mask
num_variants = np.sum(site_keep)

if sample_mask is None:
sample_keep = slice(None)
else:
if sample_mask.shape != (num_samples,):
raise ValueError(
"`sample_mask` must be a 1D array of the same length as the "
"total number of samples in `variant_matrix_phased`"
)
sample_keep = np.logical_not(sample_mask) # turn into an inclusion mask
num_samples = np.sum(sample_keep)

# Further tests must take into account the masked num_samples & num_variants

max_alleles = np.max(call_genotype[site_keep, sample_keep, :], initial=-1) + 1
if variant_allele.shape[1] < max_alleles:
raise ValueError(
"`variant_allele` must have the same number of columns as the maximum "
"value in the unmasked variant_matrix_phased plus one"
)

ancestral_state = np.asarray(ancestral_state, dtype="U")
if ancestral_state.shape != (num_variants,):
raise ValueError(
"`ancestral_state` must be a 1D array of the same length as the "
"number of unmasked variants in `variant_matrix_phased`"
)

store = zarr.storage.MemoryStore()
root = zarr.group(store=store, overwrite=True)
root.create_dataset("variant_position", data=variant_position)
root.create_dataset("call_genotype", data=call_genotype)
root.create_dataset( # Assume all phased
"call_genotype_phased", data=np.ones(call_genotype.shape[:2], dtype=bool)
)
root.create_dataset("variant_allele", data=variant_allele)
root.create_dataset("sample_id", data=sample_id)
return cls(
root,
ancestral_state,
site_mask=site_mask,
sample_mask=sample_mask,
**kwargs,
)

@functools.cached_property
def format_name(self):
Expand Down
Loading