Skip to content

Commit 54588a3

Browse files
authored
CU-8699jpw1u Save and load CDB and Vocab as zip (#14)
* CU-8699jpw1u: Add utility method to save and load serialisables as .zip files. And to do so automatically. * CU-8699jpw1u: Allow a CDB to be saved as a zip automatically. * CU-8699jpw1u: A few simple tests for CDB automatic zip-serialisation * CU-8699jpw1u: Add test time zipped CDB * CU-8699jpw1u: Add option for vocab to be (automatically) saved as zip * CU-8699jpw1u: A few simple tests for vocab zip-serialisation * CU-8699jpw1u: Add test time zipped Vocab
1 parent cf529a2 commit 54588a3

File tree

7 files changed

+137
-6
lines changed

7 files changed

+137
-6
lines changed

medcat-v2/medcat/cdb/cdb.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
from typing import Iterable, Any, Collection, Union
1+
from typing import Iterable, Any, Collection, Union, Literal
22

33
from medcat.storage.serialisables import AbstractSerialisable
44
from medcat.cdb.concepts import CUIInfo, NameInfo, TypeInfo
55
from medcat.cdb.concepts import get_new_cui_info, get_new_name_info
66
from medcat.cdb.concepts import reset_cui_training
77
from medcat.storage.serialisers import (
88
deserialise, AvailableSerialisers, serialise)
9+
from medcat.storage.zip_utils import (
10+
should_serialise_as_zip, serialise_as_zip, deserialise_from_zip)
911
from medcat.utils.defaults import default_weighted_average, StatusTypes as ST
1012
from medcat.utils.hasher import Hasher
1113
from medcat.preprocessors.cleaners import NameDescriptor
@@ -485,6 +487,7 @@ def save(self, save_path: str,
485487
serialiser: Union[
486488
str, AvailableSerialisers] = AvailableSerialisers.dill,
487489
overwrite: bool = False,
490+
as_zip: Union[bool, Literal['auto']] = 'auto',
488491
) -> None:
489492
"""Save CDB at path.
490493
@@ -495,12 +498,20 @@ def save(self, save_path: str,
495498
The serialiser. Defaults to AvailableSerialisers.dill.
496499
overwrite (bool, optional):
497500
Whether to allow overwriting existing files. Defaults to False.
501+
as_zip (Union[bool, Literal['auto']]):
502+
Whether to serialise the CDB as a zip.
498503
"""
504+
if should_serialise_as_zip(save_path, as_zip):
505+
serialise_as_zip(self, save_path, serialiser, overwrite=overwrite)
506+
return
499507
serialise(serialiser, self, save_path, overwrite=overwrite)
500508

501509
@classmethod
def load(cls, path: str) -> 'CDB':
    """Load a CDB from the specified path.

    Paths ending in ``.zip`` are read with `deserialise_from_zip`;
    every other path is handed to the regular `deserialise`.

    Args:
        path (str): The path of the saved CDB (a folder or a .zip file).

    Raises:
        ValueError: If the deserialised object is not a CDB.

    Returns:
        CDB: The loaded CDB.
    """
    # 'auto' mode decides purely on the ".zip" suffix of the path
    if should_serialise_as_zip(path, 'auto'):
        loader = deserialise_from_zip
    else:
        loader = deserialise
    loaded = loader(path)
    if isinstance(loaded, CDB):
        return loaded
    raise ValueError(f"The path '{path}' is not a CDB!")
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import os
2+
import shutil
3+
import tempfile
4+
from typing import Union, Literal
5+
6+
from medcat.storage.serialisables import Serialisable
7+
from medcat.storage.serialisers import (
8+
serialise, deserialise, AvailableSerialisers)
9+
10+
11+
def should_serialise_as_zip(path: str,
                            as_zip: Union[bool, Literal['auto']]
                            ) -> bool:
    """Decide whether the given path should be handled as a zip file.

    Args:
        path (str): The path being saved to or loaded from.
        as_zip (Union[bool, Literal['auto']]): An explicit choice, or
            'auto' to decide based on whether the path ends with ".zip".

    Returns:
        bool: Whether zip-based (de)serialisation should be used.
    """
    # An explicit choice always wins over the file extension
    if as_zip != 'auto':
        return as_zip
    return path.endswith(".zip")
17+
18+
19+
def serialise_as_zip(
        obj: Serialisable, path: str,
        ser_type: Union[AvailableSerialisers, str] = AvailableSerialisers.dill,
        overwrite: bool = False,
        ) -> None:
    """Serialise the object to a .zip file at the specified path.

    The process uses the regular `serialise` method to serialise the object
    as a folder into a temporary directory and subsequently zips it up to
    the path requested.

    Args:
        obj (Serialisable): The object to serialise.
        path (str): The path to serialise the file to. Should end with .zip.
        ser_type (Union[AvailableSerialisers, str], optional): The serialiser
            to use. Defaults to AvailableSerialisers.dill.
        overwrite (bool, optional):
            Whether to allow overwriting existing files. Defaults to False.

    Raises:
        ValueError: If the path does not end with .zip, or if the target
            file already exists and overwriting is not allowed.
    """
    if not path.endswith('.zip'):
        raise ValueError(f"Path must end with .zip, got {path}")
    # Fail fast: refuse to overwrite *before* doing the (potentially
    # expensive) serialisation into the temporary directory.
    if not overwrite and os.path.exists(path):
        raise ValueError(f"Cannot overwrite existing file: {path}")
    # make_archive adds the format's extension to base_name itself
    base_name = path[:-len('.zip')]
    with tempfile.TemporaryDirectory() as tmpdir:
        serialise(ser_type, obj, tmpdir)
        shutil.make_archive(
            base_name=base_name,
            format='zip',
            root_dir=tmpdir,
        )
50+
51+
52+
def deserialise_from_zip(path: str) -> Serialisable:
    """Deserialise an object from a zip file.

    The archive is unpacked into a temporary directory and the regular
    `deserialise` method is then used to load the object from that
    directory.

    Args:
        path (str): The path to deserialise from. Should end with .zip.

    Raises:
        ValueError: If the path does not end with .zip.

    Returns:
        Serialisable: The deserialised object
    """
    if not path.endswith('.zip'):
        raise ValueError(f"Path must end with .zip, got {path}")
    with tempfile.TemporaryDirectory() as extract_dir:
        shutil.unpack_archive(path, extract_dir, format='zip')
        # Load while the extracted files still exist; the temporary
        # directory is removed as soon as the with-block exits.
        loaded = deserialise(extract_dir)
    return loaded

medcat-v2/medcat/vocab.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Optional, Any, cast, Union
1+
from typing import Optional, Any, cast, Union, Literal
22
from typing_extensions import TypedDict
33

44
# import dill
@@ -7,6 +7,8 @@
77
from medcat.storage.serialisables import AbstractSerialisable
88
from medcat.storage.serialisers import (
99
deserialise, AvailableSerialisers, serialise)
10+
from medcat.storage.zip_utils import (
11+
should_serialise_as_zip, serialise_as_zip, deserialise_from_zip)
1012

1113

1214
WordDescriptor = TypedDict('WordDescriptor',
@@ -298,6 +300,7 @@ def save(self, save_path: str,
298300
serialiser: Union[
299301
str, AvailableSerialisers] = AvailableSerialisers.dill,
300302
overwrite: bool = False,
303+
as_zip: Union[bool, Literal['auto']] = 'auto',
301304
) -> None:
302305
"""Save Vocab at path.
303306
@@ -308,12 +311,20 @@ def save(self, save_path: str,
308311
The serialiser. Defaults to AvailableSerialisers.dill.
309312
overwrite (bool, optional):
310313
Whether to allow overwriting existing files. Defaults to False.
314+
as_zip (Union[bool, Literal['auto']]):
315+
Whether to serialise the Vocab as a zip.
311316
"""
317+
if should_serialise_as_zip(save_path, as_zip):
318+
serialise_as_zip(self, save_path, serialiser, overwrite=overwrite)
319+
return
312320
serialise(serialiser, self, save_path, overwrite=overwrite)
313321

314322
@classmethod
def load(cls, path: str) -> 'Vocab':
    """Load a Vocab from the specified path.

    Paths ending in ``.zip`` are read with `deserialise_from_zip`;
    every other path is handed to the regular `deserialise`.

    Args:
        path (str): The path of the saved Vocab (a folder or a .zip file).

    Raises:
        ValueError: If the deserialised object is not a Vocab.

    Returns:
        Vocab: The loaded Vocab.
    """
    # 'auto' mode decides purely on the ".zip" suffix of the path
    if should_serialise_as_zip(path, 'auto'):
        loader = deserialise_from_zip
    else:
        loader = deserialise
    loaded = loader(path)
    if isinstance(loaded, Vocab):
        return loaded
    raise ValueError(f"The path '{path}' is not a Vocab!")

medcat-v2/tests/cdb/test_cdb.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99
from unittest import TestCase
1010
import tempfile
1111

12-
from .. import UNPACKED_EXAMPLE_MODEL_PACK_PATH
12+
from .. import UNPACKED_EXAMPLE_MODEL_PACK_PATH, RESOURCES_PATH
13+
14+
15+
ZIPPED_CDB_PATH = os.path.join(RESOURCES_PATH, "mct2_cdb.zip")
1316

1417

1518
class CDBTests(TestCase):
@@ -26,9 +29,28 @@ def test_convenience_method_save(self):
2629
with tempfile.TemporaryDirectory() as dir:
2730
self.cdb.save(dir)
2831
self.assertTrue(os.path.exists(dir))
32+
# should have a non-empty directory
33+
self.assertTrue(os.listdir(dir))
2934
obj = deserialise(dir)
3035
self.assertIsInstance(obj, cdb.CDB)
3136

37+
def test_can_load_from_zip(self):
    # Loading the zipped test resource should give back a CDB
    from_zip = cdb.CDB.load(ZIPPED_CDB_PATH)
    self.assertIsInstance(from_zip, cdb.CDB)
    # the resource must be a single file on disk, not an unpacked folder
    is_single_file = os.path.isfile(ZIPPED_CDB_PATH)
    self.assertTrue(is_single_file)
42+
43+
def test_can_save_to_zip(self):
44+
with tempfile.TemporaryDirectory() as temp_dir:
45+
file_name = os.path.join(temp_dir, "cdb.zip")
46+
# NOTE: auto detection should write as zip
47+
self.cdb.save(file_name)
48+
self.assertTrue(os.path.exists(file_name))
49+
self.assertTrue(os.path.isfile(file_name))
50+
# and can load from saved zip
51+
loaded = cdb.CDB.load(file_name)
52+
self.assertIsInstance(loaded, cdb.CDB)
53+
3254
def test_convenience_method_load(self):
3355
ccdb = cdb.CDB.load(self.CDB_PATH)
3456
self.assertIsInstance(ccdb, cdb.CDB)
14.2 KB
Binary file not shown.
1.21 KB
Binary file not shown.

medcat-v2/tests/test_vocab.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
import unittest
99
import tempfile
1010

11-
from . import UNPACKED_EXAMPLE_MODEL_PACK_PATH
11+
from . import UNPACKED_EXAMPLE_MODEL_PACK_PATH, RESOURCES_PATH
12+
13+
14+
ZIPPED_VOCAB_PATH = os.path.join(RESOURCES_PATH, "mct2_vocab.zip")
1215

1316

1417
class VocabCreationTests(unittest.TestCase):
@@ -191,3 +194,18 @@ def test_convenience_save(self):
191194
def test_convenience_load(self):
    # Vocab.load on the regular (non-zip) path should produce a Vocab
    self.assertIsInstance(Vocab.load(self.VOCAB_PATH), Vocab)
197+
198+
def test_can_load_from_zip(self):
    # Loading the zipped test resource should give back a Vocab
    self.assertIsInstance(Vocab.load(ZIPPED_VOCAB_PATH), Vocab)
201+
202+
def test_can_save_to_zip(self):
203+
with tempfile.TemporaryDirectory() as temp_dir:
204+
file_name = os.path.join(temp_dir, 'vocab.zip')
205+
# NOTE: auto detection will write as zip
206+
self.vocab.save(file_name)
207+
self.assertTrue(os.path.exists(file_name))
208+
self.assertTrue(os.path.isfile(file_name))
209+
# and can load from saved zip
210+
loaded = Vocab.load(file_name)
211+
self.assertIsInstance(loaded, Vocab)

0 commit comments

Comments
 (0)