Skip to content

Commit 3165f58

Browse files
Fixup documentation and add some info logging
Some tests
1 parent a173af7 commit 3165f58

File tree

3 files changed

+44
-2
lines changed

3 files changed

+44
-2
lines changed

bio2zarr/tskit.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,12 @@ def __init__(
2828
else:
2929
# input 'ts' is a path.
3030
self._path = ts
31+
logger.info(f"Loading from {ts}")
3132
self.ts = tskit.load(ts)
33+
logger.info(
34+
f"Input has {self.ts.num_individuals} individuals and "
35+
f"{self.ts.num_sites} sites"
36+
)
3237

3338
self.contig_id = contig_id if contig_id is not None else "1"
3439
self.isolated_as_missing = isolated_as_missing
@@ -42,6 +47,7 @@ def __init__(
4247
sample_ids = model_mapping.individuals_name
4348

4449
self._num_samples = individuals_nodes.shape[0]
50+
logger.info(f"Converting for {self._num_samples} samples")
4551
if self._num_samples < 1:
4652
raise ValueError("individuals_nodes must have at least one sample")
4753
self.max_ploidy = individuals_nodes.shape[1]
@@ -100,6 +106,7 @@ def iter_field(self, field_name, shape, start, stop):
100106
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
101107
# All genotypes in tskit are considered phased
102108
phased = np.ones(shape[:-1], dtype=bool)
109+
logger.debug(f"Getting genotpes start={start} stop={stop}")
103110

104111
for variant in self.ts.variants(
105112
isolated_as_missing=self.isolated_as_missing,

docs/tskit2zarr/python_api.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,14 @@ This will convert the [tskit](https://tskit.dev) tree sequence stored
1212
at ``ts_path`` to VCF Zarr stored at ``vcz_path`` using 8 worker processes.
1313
The details of how we map from the
1414
tskit {ref}`tskit:sec_data_model` to VCF Zarr are taken care of by
15-
TreeSequence.map_to_vcf_model method, which is called with no
15+
{meth}`tskit.TreeSequence.map_to_vcf_model`
16+
method, which is called with no
1617
parameters by default if the ``model_mapping`` parameter to
1718
{func}`~bio2zarr.tskit.convert` is not specified.
1819

1920
For more control over the properties of the output, for example
2021
to pick a specific subset of individuals, you can use
21-
TreeSequence.map_to_vcf_model
22+
{meth}`~tskit.TreeSequence.map_to_vcf_model`
2223
to return the required mapping:
2324

2425
```python

tests/test_tskit.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,3 +518,37 @@ def test_against_tskit_vcf_output(ts, tmp_path):
518518
)
519519
)
520520
xt.assert_equal(ds1, ds2)
521+
522+
523+
def assert_ts_ds_equal(ts, ds, ploidy=2):
524+
assert ds.sizes["ploidy"] == ploidy
525+
assert ds.sizes["variants"] == ts.num_sites
526+
assert ds.sizes["samples"] == ts.num_individuals
527+
# Msprime guarantees that this will be true.
528+
nt.assert_array_equal(
529+
ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
530+
ds.call_genotype.values,
531+
)
532+
nt.assert_array_equal(
533+
ds.call_genotype_phased.values,
534+
np.ones((ts.num_sites, ts.num_individuals), dtype=bool),
535+
)
536+
# Specialised for the limited form of mutations used here
537+
nt.assert_equal(
538+
ds.variant_allele[:, 0].values, [site.ancestral_state for site in ts.sites()]
539+
)
540+
nt.assert_equal(
541+
ds.variant_allele[:, 1].values,
542+
[mutation.derived_state for mutation in ts.mutations()],
543+
)
544+
nt.assert_equal(ds.variant_position, ts.sites_position)
545+
546+
547+
@pytest.mark.parametrize("worker_processes", [0, 1, 2, 15])
548+
def test_workers(tmp_path, worker_processes):
549+
ts = msprime.sim_ancestry(10, sequence_length=1000, random_seed=42)
550+
ts = add_mutations(ts)
551+
out = tmp_path / "tskit.zarr"
552+
tsk.convert(ts, out, worker_processes=worker_processes)
553+
ds = sg.load_dataset(out)
554+
assert_ts_ds_equal(ts, ds)

0 commit comments

Comments
 (0)