Skip to content

Commit 11875a1

Browse files
committed
CU-8699np02n improve legacy conversion (4) (#31)

* CU-8699np02n: Update CDB legacy conversion so that it works with CDBs with no name_isupper attribute
* CU-8699np02n: Add method to legacy converter to convert any config
* CU-8699np02n: Fix config legacy conversion
* CU-8699np02n: Add a few simple tests for Config legacy conversion
* CU-8699np02n: Add a little more sophistication to general config conversion tests
* CU-8699np02n: Simplify MetaCAT deserialisation from legacy data
* CU-8699np02n: Fix MetaCAT legacy conversion
1 parent c9cd2a1 commit 11875a1

File tree

1 file changed

+34
-18
lines changed

1 file changed

+34
-18
lines changed

medcat-v2/medcat/components/addons/meta_cat/meta_cat.py

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
from medcat.cdb import CDB
2828
from medcat.vocab import Vocab
2929
from medcat.utils.defaults import COMPONENTS_FOLDER
30+
from medcat.utils.defaults import (
31+
avoid_legacy_conversion, doing_legacy_conversion_message,
32+
LegacyConversionDisabledError)
3033
from peft import get_peft_model, LoraConfig, TaskType
3134

3235
# It should be safe to do this always, as all other multiprocessing
@@ -173,9 +176,39 @@ def serialise_to(self, folder_path: str) -> None:
173176
os.mkdir(folder_path)
174177
self.save(folder_path)
175178

179+
@classmethod
def _create_throwaway_tokenizer(cls) -> BaseTokenizer:
    """Create a default (throwaway) base tokenizer.

    Used when no base tokenizer was provided while loading a MetaCAT.
    The tokenizer is only needed to register the data paths MetaCAT
    requires, so a mostly-default config is sufficient; the instance is
    expected to be garbage collected shortly after use.

    Returns:
        BaseTokenizer: A tokenizer of type ``cls.DEFAULT_TOKENIZER``
            built from a default ``Config`` with a 'spacy' NLP provider.
    """
    from medcat.tokenizing.tokenizers import create_tokenizer
    from medcat.config import Config
    # NOTE: fixed typo in the original message ("this it not" -> "this is not")
    logger.warning(
        "A base tokenizer was not provided during the loading of a "
        "MetaCAT. The tokenizer is used to register the required data "
        "paths for MetaCAT to function. Using the default of '%s'. If "
        "this is not the tokenizer you will end up using, MetaCAT may "
        "be unable to recover unless a) the paths are registered "
        "explicitly, or b) there are other MetaCATs created with the "
        "correct tokenizer. Do note that this will also create "
        "another instance of the tokenizer, though it should be "
        "garbage collected soon.", cls.DEFAULT_TOKENIZER
    )
    # NOTE: the use of a (mostly) default config here probably won't
    # affect anything since the tokenizer itself won't be used
    gcnf = Config()
    gcnf.general.nlp.provider = 'spacy'
    return create_tokenizer(cls.DEFAULT_TOKENIZER, gcnf)
199+
176200
@classmethod
177201
def deserialise_from(cls, folder_path: str, **init_kwargs
178202
) -> 'MetaCATAddon':
203+
if "model.dat" in os.listdir(folder_path):
204+
if not avoid_legacy_conversion():
205+
doing_legacy_conversion_message(
206+
logger, cls.__name__, folder_path)
207+
from medcat.utils.legacy.convert_meta_cat import (
208+
get_meta_cat_from_old)
209+
return get_meta_cat_from_old(
210+
folder_path, cls._create_throwaway_tokenizer())
211+
raise LegacyConversionDisabledError(cls.__name__,)
179212
if 'cnf' in init_kwargs:
180213
cnf = init_kwargs['cnf']
181214
else:
@@ -191,24 +224,7 @@ def deserialise_from(cls, folder_path: str, **init_kwargs
191224
if 'tokenizer' in init_kwargs:
192225
tokenizer = init_kwargs['tokenizer']
193226
else:
194-
from medcat.tokenizing.tokenizers import create_tokenizer
195-
from medcat.config import Config
196-
logger.warning(
197-
"A base tokenizer was not provided during the loading of a "
198-
"MetaCAT. The tokenizer is used to register the required data "
199-
"paths for MetaCAT to function. Using the default of '%s'. If "
200-
"this it not the tokenizer you will end up using, MetaCAT may "
201-
"be unable to recover unless a) the paths are registered "
202-
"explicitly, or b) there are other MetaCATs created with the "
203-
"correct tokenizer. Do note that this will also create "
204-
"another instance of the tokenizer, though it should be "
205-
"garbage collected soon.", cls.DEFAULT_TOKENIZER
206-
)
207-
# NOTE: the use of a (mostly) default config here probably won't
208-
# affect anything since the tokenizer itself won't be used
209-
gcnf = Config()
210-
gcnf.general.nlp.provider = 'spacy'
211-
tokenizer = create_tokenizer(cls.DEFAULT_TOKENIZER, gcnf)
227+
tokenizer = cls._create_throwaway_tokenizer()
212228
return cls.load_existing(
213229
load_path=folder_path,
214230
cnf=cnf,

0 commit comments

Comments
 (0)