Skip to content

Commit b7711c8

Browse files
authored
bug(medcat): CU-869b2hpam Fix issue loading tokenizers off disk (#213)
* CU-869b2hpam: Make tokenizer init use model load path to amend saved tokenizer path
* CU-869b2hpam: Fix issue with lang string from tokenizer folder name when setting stopwords
* CU-869b2hpam: Make tokenizer saving return subfolder name only (not path to it)
* CU-869b2hpam: Minor whitespace changes
* CU-869b2hpam: Add a logged warning when fixing the spacy model internals path
* CU-869b2hpam: Fix tests regarding spacy model save
* CU-869b2hpam: Fix tests regarding config merge
1 parent a0ed550 commit b7711c8

File tree

3 files changed

+27
-10
lines changed

3 files changed

+27
-10
lines changed

medcat-v2/medcat/pipeline/pipeline.py

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,7 @@ def __init__(self, cdb: CDB, vocab: Optional[Vocab],
7979
# but it should be non-None otherwise
8080
self.vocab: Vocab = vocab # type: ignore
8181
self.config = self.cdb.config
82-
self._tokenizer = self._init_tokenizer()
82+
self._tokenizer = self._init_tokenizer(model_load_path)
8383
self._components: list[CoreComponent] = []
8484
self._addons: list[AddonComponent] = []
8585
self._init_components(model_load_path, old_pipe, addon_config_dict)
@@ -95,8 +95,20 @@ def tokenizer_with_tag(self) -> BaseTokenizer:
9595
tag_comp = self.get_component(CoreComponentType.tagging)
9696
return DelegatingTokenizer(self.tokenizer, [tag_comp])
9797

98-
def _init_tokenizer(self) -> BaseTokenizer:
98+
def _init_tokenizer(self, model_load_path: Optional[str]) -> BaseTokenizer:
9999
nlp_cnf = self.config.general.nlp
100+
if model_load_path:
101+
orig_modelname = nlp_cnf.modelname
102+
model_basename = os.path.basename(orig_modelname)
103+
# NOTE: this should update the load path to the correct one
104+
nlp_cnf.modelname = os.path.join(
105+
model_load_path, model_basename)
106+
if orig_modelname != model_basename:
107+
logger.warning(
108+
"Loading a model with incorrectly saved tokenizer "
109+
"internals path. Was saved as '%s' whereas should have "
110+
"had just '%s'. This is an automated fix - no further "
111+
"action is needed", orig_modelname, model_basename)
100112
try:
101113
return create_tokenizer(nlp_cnf.provider, self.config)
102114
except TypeError as type_error:

medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,8 @@ def __init__(self, spacy_model_name: str,
5858
ensure_spacy_model(self._spacy_model_name)
5959
spacy_model_name = self._spacy_model_name
6060
if stopwords is not None:
61-
lang_str = os.path.basename(spacy_model_name).split('_', 1)[0]
61+
lang_str = os.path.basename(spacy_model_name).removeprefix(
62+
TOKENIZER_PREFIX).split('_', 1)[0]
6263
cls = spacy.util.get_lang_class(lang_str)
6364
cls.Defaults.stop_words = set(stopwords)
6465
self._nlp = spacy.load(spacy_model_name,
@@ -87,7 +88,8 @@ def __call__(self, text: str) -> MutableDocument:
8788
@classmethod
8889
def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
8990
nlp_cnf = config.general.nlp
90-
return cls(nlp_cnf.modelname,
91+
return cls(
92+
nlp_cnf.modelname,
9193
nlp_cnf.disabled_components,
9294
config.general.diacritics,
9395
config.preprocessing.max_document_length,
@@ -102,14 +104,14 @@ def get_entity_class(self) -> Type[MutableEntity]:
102104
# saveable tokenizer
103105

104106
def save_internals_to(self, folder_path: str) -> str:
105-
subfolder = os.path.join(
106-
folder_path, f"{TOKENIZER_PREFIX}{self._spacy_model_name}")
107+
subfolder_only = f"{TOKENIZER_PREFIX}{self._spacy_model_name}"
108+
subfolder = os.path.join(folder_path, subfolder_only)
107109
if os.path.exists(subfolder):
108110
# NOTE: always overwrite
109111
shutil.rmtree(subfolder)
110112
logger.debug("Saving spacy model to '%s'", subfolder)
111113
self._nlp.to_disk(subfolder)
112-
return subfolder
114+
return subfolder_only
113115

114116
def load_internals_from(self, folder_path: str) -> bool:
115117
return os.path.exists(folder_path)

medcat-v2/tests/test_cat.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -129,8 +129,9 @@ class ConfigMergeTests(unittest.TestCase):
129129
def test_can_merge_config(self):
130130
model = cat.CAT.load_model_pack(
131131
EXAMPLE_MODEL_PACK_ZIP, config_dict=self.model_dict)
132-
self.assertEqual(
133-
model.config.general.nlp.modelname, self.spacy_model_name)
132+
# NOTE: this is converted to a (non-existent) path
133+
self.assertIn(
134+
self.spacy_model_name, model.config.general.nlp.modelname)
134135

135136

136137
class OntologiesMapTests(TrainedModelTests):
@@ -919,7 +920,9 @@ def setUpClass(cls):
919920
cls.saved_model_path = cls.cat.save_model_pack(
920921
cls._save_folder.name, make_archive=False)
921922
# NOTE: that has changed config
922-
cls.saved_spacy_path = cls.cat.config.general.nlp.modelname
923+
cls.saved_spacy_path = os.path.join(
924+
cls.saved_model_path,
925+
cls.cat.config.general.nlp.modelname)
923926

924927
@classmethod
925928
def tearDownClass(cls):

0 commit comments

Comments
 (0)