Skip to content

Commit 8960255

Browse files
authored
bug(medcat): CU-869avau57 Fix cui to original names (#177)
* CU-869avau57: Move original names from addl info to cuiinfo upon conversion * CU-869avau57: Add small test for original names upon CDB conversion * CU-869avau57: Add a fix method for fixing cui2original_names * CU-869avau57: Add a few simple tests for cui2orig names fix * CU-869avau57: Clear duplicate original names info after fix * CU-869avau57: Add a simple test for not fixing legacy issues twice * CU-869avau57: Fix cui2original names upon CDB load * CU-869avau57: Adapt tests to on the fly legacy conversion fixes * CU-869avau57: Add test for automatic cui2orig names fix
1 parent 260d63c commit 8960255

File tree

5 files changed

+148
-1
lines changed

5 files changed

+148
-1
lines changed

medcat-v2/medcat/cdb/cdb.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,25 @@ def save(self, save_path: str,
521521
serialise(serialiser, self, save_path, overwrite=overwrite)
522522

523523
@classmethod
524-
def load(cls, path: str) -> 'CDB':
524+
def load(cls, path: str, perform_fixes: bool = True) -> 'CDB':
525+
"""Load the CDB off disk.
526+
527+
This can load a legacy (v1) CDB (.dat) or a v2 CDB either in its folder
528+
format or the .zip format. The distinction is made automatically.
529+
530+
Args:
531+
path (str): The path to the CDB.
532+
perform_fixes (bool): Whether to perform fixes such as
533+
original names issue. Defaults to True.
534+
535+
Raises:
536+
LegacyConversionDisabledError:
537+
If when a legacy model is found and conversion is not allowed.
538+
ValueError: If the loaded object isn't a CDB.
539+
540+
Returns:
541+
CDB: The loaded CDB.
542+
"""
525543
if should_serialise_as_zip(path, 'auto'):
526544
cdb = deserialise_from_zip(path)
527545
elif os.path.isfile(path) and path.endswith('.dat'):
@@ -535,4 +553,9 @@ def load(cls, path: str) -> 'CDB':
535553
cdb = deserialise(path)
536554
if not isinstance(cdb, CDB):
537555
raise ValueError(f"The path '{path}' is not a CDB!")
556+
if perform_fixes:
557+
# perform fix(es)
558+
from medcat.utils.legacy.fixes import (
559+
fix_cui2original_names_if_needed)
560+
fix_cui2original_names_if_needed(cdb)
538561
return cdb

medcat-v2/medcat/utils/legacy/convert_cdb.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ def _add_cui_info(cdb: CDB, data: dict) -> CDB:
121121
cui2tags, cui2type_ids = data['cui2tags'], data['cui2type_ids']
122122
cui2prefname = data['cui2preferred_name']
123123
cui2av_conf = data['cui2average_confidence']
124+
cui2orig_names = data["addl_info"].get("cui2original_names", {})
124125
for cui in all_cuis:
125126
names = cui2names.get(cui, set())
126127
snames = cui2snames.get(cui, set())
@@ -134,8 +135,14 @@ def _add_cui_info(cdb: CDB, data: dict) -> CDB:
134135
cui=cui, preferred_name=prefname, names=names, subnames=snames,
135136
type_ids=type_ids, tags=tags, count_train=count_train,
136137
context_vectors=vecs, average_confidence=av_conf,
138+
original_names=cui2orig_names.get(cui, None),
137139
)
138140
cdb.cui2info[cui] = info
141+
# remove cui2original_names from addl_info - we've already used it
142+
if "cui2original_names" in data["addl_info"]:
143+
logger.info("Deleting 'cui2original_names' in addl_info - "
144+
"it was used in CUIInfo already")
145+
del data["addl_info"]["cui2original_names"]
139146
all_cui_tuis = set((ci['cui'], tui) for ci in cdb.cui2info.values()
140147
for tui in ci['type_ids'])
141148
all_tuis = set(tui for _, tui in all_cui_tuis)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import logging
2+
3+
from medcat.cdb import CDB
4+
5+
6+
logger = logging.getLogger(__name__)
7+
8+
9+
def _fix_cui2original_names(cdb: CDB) -> None:
10+
cui2on = cdb.addl_info["cui2original_names"]
11+
num_cuis = len(cui2on)
12+
used_cuis = 0
13+
for ci in cdb.cui2info.values():
14+
orig_names: set[str] = cui2on.get(ci["cui"], None)
15+
if orig_names is not None:
16+
if ci["original_names"] is None:
17+
ci["original_names"] = orig_names
18+
else:
19+
ci["original_names"].update(orig_names)
20+
used_cuis += 1
21+
logger.info(
22+
"Used %d out of %d CUIs in the 'cui2original_names' map",
23+
used_cuis, num_cuis)
24+
# delete existing data in cui2original_names
25+
del cdb.addl_info["cui2original_names"]
26+
27+
28+
def fix_cui2original_names_if_needed(cdb: CDB) -> bool:
29+
"""Fix the cui2original names in CDB if needed.
30+
31+
This was an issue caused by faulty legacy conversion
32+
where the data wasn't moved correctly from addl_info.
33+
34+
Args:
35+
cdb (CDB): The CDB in question.
36+
37+
Returns:
38+
bool: Whether the fix / change was made.
39+
"""
40+
if "cui2original_names" in cdb.addl_info:
41+
logger.info(
42+
"CDB addl_info contains legacy data: "
43+
"'cui2original_names' . Moving it to cui2info")
44+
_fix_cui2original_names(cdb)
45+
return True
46+
else:
47+
logger.debug("CDB does not contain legacy 'cui2original_names")
48+
return False

medcat-v2/tests/utils/legacy/test_convert_cdb.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ def test_all_cui_names_in_names(self):
3232
with self.subTest(f"{cui}: {name}"):
3333
self.assertIn(name, self.cdb.name2info)
3434

35+
def test_all_cuis_have_original_names(self):
36+
for cui, ci in self.cdb.cui2info.items():
37+
with self.subTest(cui):
38+
print(cui, ":", ci["original_names"])
39+
self.assertTrue(ci["original_names"])
40+
3541
def test_all_name_cuis_in_per_cui_status(self):
3642
for name, nameinfo in self.cdb.name2info.items():
3743
for cui in nameinfo['per_cui_status']:
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import os
2+
3+
from medcat.cdb import CDB
4+
from medcat.utils.legacy import fixes
5+
from medcat.utils.cdb_state import captured_state_cdb
6+
7+
import unittest
8+
9+
from ... import UNPACKED_EXAMPLE_MODEL_PACK_PATH
10+
11+
12+
CONVERTED_CDB_PATH = os.path.join(
13+
UNPACKED_EXAMPLE_MODEL_PACK_PATH, "cdb")
14+
15+
16+
class TestCUI2OriginalNamesFix(unittest.TestCase):
17+
18+
@classmethod
19+
def setUpClass(cls):
20+
cls.converted_cdb = CDB.load(CONVERTED_CDB_PATH,
21+
perform_fixes=False)
22+
23+
def test_converted_model_does_not_have_orig_names(self):
24+
for ci in self.converted_cdb.cui2info.values():
25+
with self.subTest(ci["cui"]):
26+
self.assertFalse(ci["original_names"])
27+
28+
def test_model_has_orig_names_after_fix(self):
29+
# to make sure this is agnostic to the order
30+
with captured_state_cdb(self.converted_cdb):
31+
changed = fixes.fix_cui2original_names_if_needed(
32+
self.converted_cdb)
33+
self.assertTrue(changed)
34+
# has not cui2original_names
35+
self.assertNotIn("cui2original_names",
36+
self.converted_cdb.addl_info)
37+
for ci in self.converted_cdb.cui2info.values():
38+
with self.subTest(ci["cui"]):
39+
self.assertTrue(ci["original_names"])
40+
41+
def test_will_not_fix_twice(self):
42+
with captured_state_cdb(self.converted_cdb):
43+
fixes.fix_cui2original_names_if_needed(
44+
self.converted_cdb)
45+
changed_twice = fixes.fix_cui2original_names_if_needed(
46+
self.converted_cdb)
47+
self.assertFalse(changed_twice)
48+
49+
50+
class TestCUI2OriginalNamesFixAuto(unittest.TestCase):
51+
52+
@classmethod
53+
def setUpClass(cls):
54+
cls.converted_cdb = CDB.load(CONVERTED_CDB_PATH,
55+
perform_fixes=True)
56+
57+
def test_cui2orig_names_fixed_automatically(self):
58+
for ci in self.converted_cdb.cui2info.values():
59+
with self.subTest(ci["cui"]):
60+
self.assertTrue(ci["original_names"])
61+
62+
def test_addl_info_has_no_cui2original_names(self):
63+
self.assertNotIn("cui2original_names", self.converted_cdb.addl_info)

0 commit comments

Comments
 (0)