File tree Expand file tree Collapse file tree 4 files changed +70
-0
lines changed
Expand file tree Collapse file tree 4 files changed +70
-0
lines changed Original file line number Diff line number Diff line change 1+ --------------------
2+ [0.2.3] - 2023-XX-XX
3+ --------------------
4+
5+ - Add `tszip.load` which loads both compressed and uncompressed tree sequences
6+ (benjeffery, #75)
7+
8+
19--------------------
210[0.2.2] - 2022-02-22
311--------------------
Original file line number Diff line number Diff line change 2828
2929import msprime
3030import numpy as np
31+ import pytest
3132import tskit
3233import zarr
3334
@@ -383,3 +384,32 @@ def test_save_dir(self):
383384 ts = msprime .simulate (10 , random_seed = 1 )
384385 with self .assertRaises (OSError ):
385386 tszip .compress (ts , self .path .parent )
387+
388+
class TestLoad:
    """
    Tests for ``tszip.load``, covering failure modes and loading both
    compressed and uncompressed files.
    """

    def test_missing_file(self):
        # A path that does not exist must surface as FileNotFoundError.
        with pytest.raises(FileNotFoundError):
            tszip.load("/no/such/file")

    def test_load_dir(self):
        # Passing a directory instead of a file must raise an OSError.
        directory = pathlib.Path(__file__).parent
        with pytest.raises(OSError):
            tszip.load(directory)

    def test_wrong_format(self, tmpdir):
        path = pathlib.Path(tmpdir) / "treeseq.tsz"

        # An empty file is reported as an unexpected end of file.
        path.write_text("")
        with pytest.raises(EOFError):
            tszip.load(path)

        # Non-empty junk is reported as a tskit file format error.
        for contents in ("1234", "X" * 1024):
            path.write_text(contents)
            with pytest.raises(tskit.FileFormatError):
                tszip.load(path)

    def test_open_both(self):
        # The compressed and uncompressed versions of the same file must
        # load to equal tree sequences.
        files = pathlib.Path(__file__).parent / "files"
        compressed = tszip.load(files / "1.0.0.trees.tsz")
        uncompressed = tszip.load(files / "1.0.0.trees")
        assert compressed == uncompressed
Original file line number Diff line number Diff line change 2121# SOFTWARE.
2222from .compression import compress # NOQA
2323from .compression import decompress # NOQA
24+ from .compression import load # NOQA
2425from .compression import print_summary # NOQA
2526from .provenance import __version__ # NOQA
Original file line number Diff line number Diff line change @@ -388,3 +388,34 @@ def visitor(array):
388388 if verbosity > 0 :
389389 for line in str (array .info ).splitlines ():
390390 print ("\t " , line )
391+
392+
def load(path):
    """
    Load a tree sequence from either a tszip compressed file or a
    standard tskit file. This convenience function works out whether
    the file needs decompressing and returns the tree sequence
    instance either way.

    :param str path: The location of the tszip compressed file or
        standard tskit file to load.
    :rtype: tskit.TreeSequence
    :return: A :class:`tskit.TreeSequence` instance corresponding to
        the specified file.
    """
    path = str(path)

    # Decide how to open the file by sniffing for the zip format, which
    # seems more robust than trusting the file extension or relying on
    # exceptions. Note that `is_zipfile` checks not only the header but
    # also the EOCD record at the end of the file, so the file ends up
    # being read twice. As tree sequences are usually less than a few GB
    # this should not be a problem.
    with open(path, "rb") as f:
        if zipfile.is_zipfile(f):
            loader = decompress
        else:
            # Anything that is not a zip goes to tskit. We could check
            # for a kastore header here, but this way we get all the
            # normal tskit exceptions on error.
            loader = tskit.load

    # The probe handle is closed before the chosen loader reopens the path.
    return loader(path)
You can’t perform that action at this time.
0 commit comments