diff --git a/docs/Makefile b/docs/Makefile index 69108abe..d8946b58 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -19,3 +19,5 @@ dist: clean: rm -fR $(BUILDDIR) rm -rf _static/example_data.vcz/ancestral_state + rm -rf notebook-simulation* + diff --git a/docs/usage.md b/docs/usage.md index 1226881d..1e4e528b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -320,10 +320,9 @@ import sys import os import subprocess -from Bio import bgzf -import numpy as np - +import bio2zarr.tskit as ts2z import msprime +import numpy as np import tsinfer if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook @@ -343,27 +342,16 @@ ts = msprime.sim_ancestry( random_seed=6, ) ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7) -ts.dump(name + "-source.trees") +ts.dump(f"{name}-source.trees") print( f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:", f"{ts.num_trees} trees and {ts.num_sites} sites" ) -# Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see -# https://github.com/sgkit-dev/bio2zarr/issues/232 -np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()]) -vcf_name = f"{name}.vcf.gz" -with bgzf.open(vcf_name, "wt") as f: - ts.write_vcf(f) -subprocess.run(["tabix", vcf_name]) -ret = subprocess.run( - [python, "-m", "bio2zarr", "vcf2zarr", "convert", "--force", vcf_name, f"{name}.vcz"], - stderr = subprocess.DEVNULL if name == "notebook-simulation" else None, -) +ts2z.convert(f"{name}-source.trees", f"{name}.vcz") assert os.path.exists(f"{name}.vcz") -if ret.returncode == 0: - print(f"Converted to {name}.vcz") +print(f"Converted to {name}.vcz") ``` Here we first run a simulation then we create a vcf file and convert it to .vcz format. @@ -431,8 +419,8 @@ Once we have our `.vcz` file created, running the inference is straightforward. ```{code-cell} ipython3 # Infer & save a ts from the notebook simulation. -ancestral_states = np.load(f"{name}-AA.npy") -vdata = tsinfer.VariantData(f"{name}.vcz", ancestral_states) +vcf_zarr = zarr.load(f"{name}.vcz") +vdata = tsinfer.VariantData(f"{name}.vcz", ancestral_state=vcf_zarr["variant_allele"][:, 0]) tsinfer.infer(vdata, progress_monitor=True, num_threads=4).dump(name + ".trees") ``` diff --git a/requirements/CI-docs/requirements.txt b/requirements/CI-docs/requirements.txt index 45597e48..645e7045 100644 --- a/requirements/CI-docs/requirements.txt +++ b/requirements/CI-docs/requirements.txt @@ -5,10 +5,11 @@ humanize==4.12.1 lmdb==1.6.2 tqdm==4.67.1 daiquiri==3.3.0 +tskit==0.6.4 msprime==1.3.3 sgkit[vcf]==0.9.0 ipywidgets==8.1.5 Bio==1.7.1 -bio2zarr==0.1.4 +bio2zarr==0.1.6 sphinx-book-theme #Unpinned to allow easy updating. -pyfaidx==0.8.1.3 \ No newline at end of file +pyfaidx==0.8.1.3 diff --git a/requirements/CI-tests-complete/requirements.txt b/requirements/CI-tests-complete/requirements.txt index c7aa0838..1326278b 100644 --- a/requirements/CI-tests-complete/requirements.txt +++ b/requirements/CI-tests-complete/requirements.txt @@ -10,6 +10,6 @@ pytest==8.3.5 pytest-cov==6.0.0 seaborn==0.13.2 sgkit[vcf]==0.9.0 -tskit==0.6.0 +tskit==0.6.4 tqdm==4.67.1 twine==6.1.0 diff --git a/requirements/CI-tests-conda/requirements.txt b/requirements/CI-tests-conda/requirements.txt index a062c960..90869d9b 100644 --- a/requirements/CI-tests-conda/requirements.txt +++ b/requirements/CI-tests-conda/requirements.txt @@ -7,4 +7,4 @@ daiquiri==3.0.0 # Pinned as conda package not updating matplotlib==3.9.4 seaborn==0.13.2 colorama==0.4.6 -tskit==0.6.0 \ No newline at end of file +tskit==0.6.4 diff --git a/requirements/development.txt b/requirements/development.txt index d708f8cd..c10cdbe7 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -7,7 +7,8 @@ tqdm humanize daiquiri msprime >= 1.0.0 -tskit >= 0.5.3 +tskit >= 0.6.4 +bio2zarr >= 0.1.6 lmdb pre-commit pytest