2727from medcat .cdb import CDB
2828from medcat .vocab import Vocab
2929from medcat .utils .defaults import COMPONENTS_FOLDER
30+ from medcat .utils .defaults import (
31+ avoid_legacy_conversion , doing_legacy_conversion_message ,
32+ LegacyConversionDisabledError )
3033from peft import get_peft_model , LoraConfig , TaskType
3134
3235# It should be safe to do this always, as all other multiprocessing
@@ -173,9 +176,39 @@ def serialise_to(self, folder_path: str) -> None:
173176 os .mkdir (folder_path )
174177 self .save (folder_path )
175178
@classmethod
def _create_throwaway_tokenizer(cls) -> BaseTokenizer:
    """Create a stand-in tokenizer for when the caller did not supply one.

    A warning naming ``cls.DEFAULT_TOKENIZER`` is logged first. The
    tokenizer is then built from a fresh ``Config`` whose NLP provider
    is set to ``'spacy'``.

    Returns:
        BaseTokenizer: The tokenizer produced by ``create_tokenizer``
            for ``cls.DEFAULT_TOKENIZER``.
    """
    # Imported at call time rather than at module level.
    from medcat.tokenizing.tokenizers import create_tokenizer
    from medcat.config import Config

    default_name = cls.DEFAULT_TOKENIZER
    warning_template = (
        "A base tokenizer was not provided during the loading of a "
        "MetaCAT. The tokenizer is used to register the required data "
        "paths for MetaCAT to function. Using the default of '%s'. If "
        "this it not the tokenizer you will end up using, MetaCAT may "
        "be unable to recover unless a) the paths are registered "
        "explicitly, or b) there are other MetaCATs created with the "
        "correct tokenizer. Do note that this will also create "
        "another instance of the tokenizer, though it should be "
        "garbage collected soon."
    )
    logger.warning(warning_template, default_name)

    # NOTE: a (mostly) default config is presumably harmless here, since
    #       the resulting tokenizer is not expected to be used directly.
    fallback_config = Config()
    fallback_config.general.nlp.provider = 'spacy'
    throwaway = create_tokenizer(default_name, fallback_config)
    return throwaway
199+
176200 @classmethod
177201 def deserialise_from (cls , folder_path : str , ** init_kwargs
178202 ) -> 'MetaCATAddon' :
203+ if "model.dat" in os .listdir (folder_path ):
204+ if not avoid_legacy_conversion ():
205+ doing_legacy_conversion_message (
206+ logger , cls .__name__ , folder_path )
207+ from medcat .utils .legacy .convert_meta_cat import (
208+ get_meta_cat_from_old )
209+ return get_meta_cat_from_old (
210+ folder_path , cls ._create_throwaway_tokenizer ())
211+ raise LegacyConversionDisabledError (cls .__name__ ,)
179212 if 'cnf' in init_kwargs :
180213 cnf = init_kwargs ['cnf' ]
181214 else :
@@ -191,24 +224,7 @@ def deserialise_from(cls, folder_path: str, **init_kwargs
191224 if 'tokenizer' in init_kwargs :
192225 tokenizer = init_kwargs ['tokenizer' ]
193226 else :
194- from medcat .tokenizing .tokenizers import create_tokenizer
195- from medcat .config import Config
196- logger .warning (
197- "A base tokenizer was not provided during the loading of a "
198- "MetaCAT. The tokenizer is used to register the required data "
199- "paths for MetaCAT to function. Using the default of '%s'. If "
200- "this it not the tokenizer you will end up using, MetaCAT may "
201- "be unable to recover unless a) the paths are registered "
202- "explicitly, or b) there are other MetaCATs created with the "
203- "correct tokenizer. Do note that this will also create "
204- "another instance of the tokenizer, though it should be "
205- "garbage collected soon." , cls .DEFAULT_TOKENIZER
206- )
207- # NOTE: the use of a (mostly) default config here probably won't
208- # affect anything since the tokenizer itself won't be used
209- gcnf = Config ()
210- gcnf .general .nlp .provider = 'spacy'
211- tokenizer = create_tokenizer (cls .DEFAULT_TOKENIZER , gcnf )
227+ tokenizer = cls ._create_throwaway_tokenizer ()
212228 return cls .load_existing (
213229 load_path = folder_path ,
214230 cnf = cnf ,
0 commit comments