Skip to content

Commit 0638662

Browse files
committed
Add tszip.load
1 parent 76d9775 commit 0638662

File tree

4 files changed

+70
-0
lines changed

4 files changed

+70
-0
lines changed

CHANGELOG.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
--------------------
2+
[0.2.3] - 2023-XX-XX
3+
--------------------
4+
5+
- Add `tszip.load` which loads both compressed and uncompressed tree sequences
6+
(benjeffery, #75)
7+
8+
19
--------------------
210
[0.2.2] - 2022-02-22
311
--------------------

tests/test_compression.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
import msprime
3030
import numpy as np
31+
import pytest
3132
import tskit
3233
import zarr
3334

@@ -383,3 +384,32 @@ def test_save_dir(self):
383384
ts = msprime.simulate(10, random_seed=1)
384385
with self.assertRaises(OSError):
385386
tszip.compress(ts, self.path.parent)
387+
388+
389+
class TestLoad:
    """
    Tests for ``tszip.load`` covering missing paths, directories,
    files in the wrong format, and both supported file types.
    """

    def test_missing_file(self):
        # A path that does not exist is reported as FileNotFoundError.
        with pytest.raises(FileNotFoundError):
            tszip.load("/no/such/file")

    def test_load_dir(self):
        # Passing a directory instead of a file raises an OSError.
        tests_dir = pathlib.Path(__file__).parent
        with pytest.raises(OSError):
            tszip.load(tests_dir)

    def test_wrong_format(self, tmpdir):
        target = pathlib.Path(tmpdir) / "treeseq.tsz"

        # An empty file is expected to raise EOFError.
        with open(str(target), "w") as out:
            out.write("")
        with pytest.raises(EOFError):
            tszip.load(target)

        # Non-empty content that is not a tree sequence is expected to
        # raise tskit's file format error.
        for payload in ("1234", "X" * 1024):
            with open(str(target), "w") as out:
                out.write(payload)
            with pytest.raises(tskit.FileFormatError):
                tszip.load(target)

    def test_open_both(self):
        # The same tree sequence stored compressed and uncompressed must
        # load to equal objects.
        data_dir = pathlib.Path(__file__).parent / "files"
        from_compressed = tszip.load(data_dir / "1.0.0.trees.tsz")
        from_plain = tszip.load(data_dir / "1.0.0.trees")
        assert from_compressed == from_plain

tszip/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,6 @@
2121
# SOFTWARE.
2222
from .compression import compress # NOQA
2323
from .compression import decompress # NOQA
24+
from .compression import load # NOQA
2425
from .compression import print_summary # NOQA
2526
from .provenance import __version__ # NOQA

tszip/compression.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,3 +388,34 @@ def visitor(array):
388388
if verbosity > 0:
389389
for line in str(array.info).splitlines():
390390
print("\t", line)
391+
392+
393+
def load(path):
    """
    Load a tree sequence from either a tszip compressed file or a
    standard tskit file. The file is inspected to decide whether it
    needs decompressing, so callers do not need to know which of the
    two formats they have.

    :param str path: The location of the tszip compressed file or
        standard tskit file to load. The value is converted with
        ``str()`` before use.
    :rtype: tskit.TreeSequence
    :return: A :class:`tskit.TreeSequence` instance corresponding to
        the specified file.
    """
    filename = str(path)

    # Inspect the file contents to decide whether it is a zip archive;
    # this is more robust than trusting the file extension or relying on
    # exceptions. `is_zipfile` checks the header and also the EOCD record
    # at the end of the file, so the file ends up being read twice. Tree
    # sequences are usually less than a few GB, so this should not be a
    # problem.
    with open(filename, "rb") as handle:
        looks_like_zip = zipfile.is_zipfile(handle)

    if not looks_like_zip:
        # Everything else is handed to tskit. A kastore header check
        # would be possible here, but delegating means callers get all
        # the normal tskit exceptions on error.
        return tskit.load(filename)
    return decompress(filename)

0 commit comments

Comments
 (0)