From 1edcdfbe0aea4b3b056d6da1b28d5ecca5b1f452 Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Thu, 12 Jun 2025 23:29:17 +0100 Subject: [PATCH 01/33] interim changes for medcat-v2 --- notebook_docs/API_Examples.ipynb | 32 ++++----- notebook_docs/Train_MedCAT_Models.ipynb | 8 +-- webapp/api/api/admin/actions.py | 12 ++-- webapp/api/api/metrics.py | 26 +++---- webapp/api/api/model_cache.py | 18 ++++- webapp/api/api/models.py | 17 ++--- webapp/api/api/utils.py | 90 ++++++++++++++----------- webapp/api/api/views.py | 19 +++--- 8 files changed, 120 insertions(+), 102 deletions(-) diff --git a/notebook_docs/API_Examples.ipynb b/notebook_docs/API_Examples.ipynb index feacc4d4..649a3651 100644 --- a/notebook_docs/API_Examples.ipynb +++ b/notebook_docs/API_Examples.ipynb @@ -200,11 +200,11 @@ "for name, d_s in datasets:\n", " payload = {\n", " 'dataset_name': name, # Name that appears in each\n", - " 'dataset': d_s.loc[:, ['name', 'text']].to_dict(), # Dictionary representation of only \n", + " 'dataset': d_s.loc[:, ['name', 'text']].to_dict(), # Dictionary representation of only\n", " 'description': f'{name} first 20 notes from each category' # Description that appears in the trainer\n", " }\n", " resp = requests.post(f'{URL}/api/create-dataset/', json=payload, headers=headers)\n", - " dataset_ids.append(json.loads(resp.text)['dataset_id']) \n", + " dataset_ids.append(json.loads(resp.text)['dataset_id'])\n", "# New datasets created in the trainer have the following IDs\n", "dataset_ids" ] @@ -268,7 +268,7 @@ }, "outputs": [], "source": [ - "from medcat.cdb import CDB" + "from medcat.storage.serialisers import deserialise" ] }, { @@ -290,7 +290,7 @@ } ], "source": [ - "CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')" + "cdb = deserialise('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')" ] }, { @@ -301,8 +301,8 @@ }, "outputs": [], "source": [ - "txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n", - " data={'name': 
'example_cdb', 'use_for_training': True}, \n", + "txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n", + " data={'name': 'example_cdb', 'use_for_training': True},\n", " files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)" ] }, @@ -342,8 +342,8 @@ }, "outputs": [], "source": [ - "txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers, \n", - " data={'name': 'example_cdb-EDITED', 'use_for_training': True}, \n", + "txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers,\n", + " data={'name': 'example_cdb-EDITED', 'use_for_training': True},\n", " files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)" ] }, @@ -379,8 +379,8 @@ } ], "source": [ - "requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n", - " data={'name': 'example_cdb', 'use_for_training': True}, \n", + "requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n", + " data={'name': 'example_cdb', 'use_for_training': True},\n", " files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)" ] }, @@ -404,7 +404,7 @@ "metadata": {}, "outputs": [], "source": [ - "txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers, \n", + "txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers,\n", " files={'cdb_file': open('<>', 'rb')}).text)" ] }, @@ -465,7 +465,7 @@ "all_cdbs = json.loads(requests.get(f'{URL}/api/concept-dbs/', headers=headers).text)['results']\n", "# the CDB ID we'll use for this example\n", "cdb_to_use = all_cdbs[0]['id']\n", - "# you might have many CDBs here. First 2 cdbs: \n", + "# you might have many CDBs here. 
First 2 cdbs:\n", "all_cdbs[0:2]" ] }, @@ -521,12 +521,12 @@ "for d_id, p_name in zip(dataset_ids, project_names):\n", " payload = {\n", " 'name': f'{p_name} Annotation Project',\n", - " 'description': 'Example projects', \n", - " 'cuis': '', \n", + " 'description': 'Example projects',\n", + " 'cuis': '',\n", " 'tuis': '',\n", " 'dataset': d_id,\n", - " 'concept_db': cdb_to_use, \n", - " 'vocab': vocab_to_use, \n", + " 'concept_db': cdb_to_use,\n", + " 'vocab': vocab_to_use,\n", " 'members': users_ids\n", " }\n", " project_ids.append(json.loads(requests.post(f'{URL}/api/project-annotate-entities/', json=payload, headers=headers).text))" diff --git a/notebook_docs/Train_MedCAT_Models.ipynb b/notebook_docs/Train_MedCAT_Models.ipynb index c89e352a..3360be55 100644 --- a/notebook_docs/Train_MedCAT_Models.ipynb +++ b/notebook_docs/Train_MedCAT_Models.ipynb @@ -382,7 +382,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\r", + "\r\n", "Epoch: 0, Prec: 0.36538461538461536, Rec: 0.8444444444444444, F1: 0.6049145299145299\n", "\n", "Docs with false positives: Psych Text 1; Psych Text 2\n", @@ -1383,11 +1383,11 @@ } ], "source": [ - "cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\", \n", + "cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\",\n", " nepochs=1,\n", " lr=0.1,\n", " anneal=False, # Unless we are reseting the CDB or cui_count this is False\n", - " print_stats=True, \n", + " print_stats=True,\n", " use_filters=True)" ] }, @@ -1443,7 +1443,7 @@ }, "outputs": [], "source": [ - "metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings, \n", + "metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings,\n", " pad_id=len(embeddings) -1, save_dir='mc_status', device='cpu')" ] }, diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py index 57b5a16b..7e9da84a 100644 --- a/webapp/api/api/admin/actions.py +++ 
b/webapp/api/api/admin/actions.py @@ -13,6 +13,8 @@ from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB from api.solr_utils import drop_collection, import_all_concepts +from medcat.storage.serialisers import deserialise + logger = logging.getLogger(__name__) _dt_fmt = '%Y-%m-%d %H:%M:%S.%f' @@ -356,20 +358,16 @@ def dataset_document_counts(dataset): @background(schedule=5) def _reset_cdb_filters(id): - from medcat.cdb import CDB concept_db = ConceptDB.objects.get(id=id) - cdb = CDB.load(concept_db.cdb_file.path) - cdb.config.linking['filters'] = {'cuis': set()} + cdb = deserialise(concept_db.cdb_file.path) + cdb.config.components.linking.filters = {'cuis': set()} cdb.save(concept_db.cdb_file.path) @background(schedule=5) def import_concepts_from_cdb(cdb_model_id: int): - from medcat.cdb import CDB - cdb_model = ConceptDB.objects.get(id=cdb_model_id) - cdb = CDB.load(cdb_model.cdb_file.path) - + cdb = deserialise(cdb_model.cdb_file.path) import_all_concepts(cdb, cdb_model) diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py index 2a87a658..b67a7ba8 100644 --- a/webapp/api/api/metrics.py +++ b/webapp/api/api/metrics.py @@ -14,12 +14,12 @@ from django.contrib.auth.models import User from django.db.models import QuerySet from medcat.cat import CAT -from medcat.cdb import CDB -from medcat.config_meta_cat import ConfigMetaCAT -from medcat.meta_cat import MetaCAT -from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase -from medcat.utils.meta_cat.data_utils import prepare_from_json, encode_category_values -from medcat.utils.meta_cat.ml_utils import create_batch_piped_data +from medcat.storage.serialisers import deserialise +from medcat.config.config_meta_cat import ConfigMetaCAT +from medcat.components.addons.meta_cat.meta_cat import MetaCAT +from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase +from medcat.components.addons.meta_cat.data_utils import 
prepare_from_json, encode_category_values +from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data from medcat.vocab import Vocab from torch import nn @@ -48,8 +48,8 @@ def calculate_metrics(project_ids: List[int], report_name: str): cat = CAT.load_model_pack(projects[0].model_pack.model_pack.path) else: # assume the cdb / vocab is set in these projects - cdb = CDB.load(projects[0].concept_db.cdb_file.path) - vocab = Vocab.load(projects[0].vocab.vocab_file.path) + cdb = deserialise(projects[0].concept_db.cdb_file.path) + vocab = deserialise(projects[0].vocab.vocab_file.path) cat = CAT(cdb, vocab, config=cdb.config) project_data = retrieve_project_data(projects) metrics = ProjectMetrics(project_data, cat) @@ -76,7 +76,7 @@ def __init__(self, mct_export_data: dict, cat: CAT): """ self.mct_export = mct_export_data self.cat = cat - self.projects2names = {} + self.projects2names = {} self.projects2doc_ids = {} self.docs2names = {} self.docs2texts = {} @@ -166,21 +166,21 @@ def enrich_medcat_metrics(self, examples): """ for tp in [i for e_i in examples['tp'].values() for i in e_i]: try: - ann = AnnotatedEntity.objects.get(project_id=tp['project id'], document_id=tp['document id'], + ann = AnnotatedEntity.objects.get(project_id=tp['project id'], document_id=tp['document id'], start_ind=tp['start'], end_ind=tp['end']) tp['user'] = ann.user.username except: tp['user'] = None for fp in (i for e_i in examples['fp'].values() for i in e_i): try: - ann = AnnotatedEntity.objects.get(project_id=fp['project id'], document_id=fp['document id'], + ann = AnnotatedEntity.objects.get(project_id=fp['project id'], document_id=fp['document id'], start_ind=fp['start'], end_ind=fp['end']) fp['user'] = ann.user.username except: fp['user'] = None for fn in (i for e_i in examples['fn'].values() for i in e_i): try: - ann = AnnotatedEntity.objects.get(project_id=fn['project id'], document_id=fn['document id'], + ann = AnnotatedEntity.objects.get(project_id=fn['project id'], 
document_id=fn['document id'], start_ind=fn['start'], end_ind=fn['end']) fn['user'] = ann.user.username except: @@ -400,7 +400,7 @@ def generate_report(self, meta_ann=False): return {'user_stats': self.user_stats().to_dict('records'), 'concept_summary': self.concept_summary(), 'annotation_summary': anno_df.to_dict('records'), - 'meta_anno_summary': meta_anns_summary, + 'meta_anno_summary': meta_anns_summary, 'projects2doc_ids': self.projects2doc_ids, 'docs2text': self.docs2texts, 'projects2name': self.projects2names, diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index b79a4ce0..ebb12cc3 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -6,6 +6,8 @@ from medcat.cat import CAT from medcat.cdb import CDB from medcat.vocab import Vocab +from medcat.storage.serialisers import deserialise +from medcat.utils.legacy.convert_cdb import get_cdb_from_old from api.models import ConceptDB @@ -52,7 +54,17 @@ def get_medcat_from_cdb_vocab(project, else: cdb_path = project.concept_db.cdb_file.path try: - cdb = CDB.load(cdb_path) + cdb = deserialise(cdb_path) + except NotADirectoryError as e: + logger.warning("Legacy CDB found, converting to new format") + # this should live in medcat code directly + cdb = get_cdb_from_old(cdb_path) + serialise(cdb, cdb_path) + cdb_map[cdb_id] = cdb + cdb_path = project.concept_db.cdb_file.path + cdb = deserialise(cdb_path) + cdb_map[cdb_id] = cdb + except KeyError as ke: mc_v = pkg_resources.get_distribution('medcat').version if int(mc_v.split('.')[0]) > 0: @@ -73,7 +85,7 @@ def get_medcat_from_cdb_vocab(project, vocab = vocab_map[vocab_id] else: vocab_path = project.vocab.vocab_file.path - vocab = Vocab.load(vocab_path) + vocab = deserialise(vocab_path) vocab_map[vocab_id] = vocab cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) cat_map[cat_id] = cat @@ -132,7 +144,7 @@ def clear_cached_medcat(project, cat_map: Dict[str, CAT]=CAT_MAP): def get_cached_cdb(cdb_id: str, cdb_map: 
Dict[str, CDB]=CDB_MAP) -> CDB: if cdb_id not in cdb_map: cdb_obj = ConceptDB.objects.get(id=cdb_id) - cdb = CDB.load(cdb_obj.cdb_file.path) + cdb = deserialise(cdb_obj.cdb_file.path) cdb_map[cdb_id] = cdb return cdb_map[cdb_id] diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py index f29bc8b7..625dad0c 100644 --- a/webapp/api/api/models.py +++ b/webapp/api/api/models.py @@ -12,8 +12,8 @@ from django.forms import forms, ModelForm from medcat.cat import CAT from medcat.cdb import CDB -from medcat.vocab import Vocab -from medcat.meta_cat import MetaCAT +from medcat.storage.serialisers import deserialise +from medcat.components.addons.meta_cat.meta_cat import MetaCAT from polymorphic.models import PolymorphicModel from core.settings import MEDIA_ROOT @@ -42,14 +42,14 @@ class ModelPack(models.Model): meta_cats = models.ManyToManyField('MetaCATModel', blank=True, default=None) create_time = models.DateTimeField(auto_now_add=True) last_modified = models.DateTimeField(auto_now=True) - last_modified_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=True) + last_modified_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=True) @transaction.atomic def save(self, *args, **kwargs): is_new = self._state.adding if is_new: super().save(*args, **kwargs) - + # Process the model pack logger.info('Loading model pack: %s', self.model_pack) model_pack_name = str(self.model_pack).replace(".zip", "") @@ -72,9 +72,10 @@ def save(self, *args, **kwargs): raise FileNotFoundError(f'Error loading the CDB from this model pack: {self.model_pack.path}') from exc # Load Vocab + vocab_path = os.path.join(unpacked_model_pack_path, "vocab.dat") if os.path.exists(vocab_path): - Vocab.load(vocab_path) + deserialise(vocab_path) vocab = Vocabulary() vocab.vocab_file.name = vocab_path.replace(f'{MEDIA_ROOT}/', '') vocab.save(skip_load=True) @@ -98,7 +99,7 @@ def save(self, *args, **kwargs): 
self.meta_cats.set(metaCATmodels) # Use set() instead of add() for atomic operation except Exception as exc: raise MedCATLoadException(f'Failure loading MetaCAT models - {unpacked_model_pack_path}') from exc - + # Only save if this is an update (not a new instance) if not is_new: super().save(*args, **kwargs) @@ -133,7 +134,7 @@ def save(self, *args, skip_load=False, **kwargs): # load the CDB, and raise if this fails - must be saved first so storage handler can rename path if name clashes if not skip_load: try: - CDB.load(self.cdb_file.path) + deserialise(self.cdb_file.path) except Exception as exc: raise MedCATLoadException(f'Failed to load Concept DB from {self.cdb_file}, ' f'check if this CDB file successfully loads elsewhere') from exc @@ -156,7 +157,7 @@ def save(self, *args, skip_load=False, **kwargs): # load the Vocab, and raise if this fails if not skip_load: try: - Vocab.load(self.vocab_file.path) + deserialise(self.vocab_file.path) except Exception as exc: raise MedCATLoadException(f'Failed to load Vocab from {self.vocab_file}, ' f'check if this Vocab file successfully loads elsewhere') from exc diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 8d4d64ad..81a855e8 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -9,9 +9,7 @@ from django.db.models.signals import post_save from django.dispatch import receiver from medcat.cat import CAT -from medcat.utils.filters import check_filters -from medcat.utils.helpers import tkns_from_doc -from medcat.utils.ner.deid import DeIdModel +from medcat.components.ner.trf.deid import DeIdModel from .model_cache import get_medcat from .models import Entity, AnnotatedEntity, ProjectAnnotateEntities, \ @@ -37,7 +35,7 @@ def remove_annotations(document, project, partial=False): def add_annotations(spacy_doc, user, project, document, existing_annotations, cat): - spacy_doc._.ents.sort(key=lambda x: len(x.text), reverse=True) + spacy_doc.final_ents.sort(key=lambda x: len(x.text), 
reverse=True) tkns_in = [] ents = [] @@ -46,9 +44,9 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca # that can be produced are expected to have available models try: metatask2obj = {task_name: MetaTask.objects.get(name=task_name) - for task_name in spacy_doc._.ents[0]._.meta_anns.keys()} + for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()} metataskvals2obj = {task_name: {v.name: v for v in MetaTask.objects.get(name=task_name).values.all()} - for task_name in spacy_doc._.ents[0]._.meta_anns.keys()} + for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()} except (AttributeError, IndexError): # IndexError: ignore if there are no annotations in this doc # AttributeError: ignore meta_anns that are not present - i.e. non model pack preds @@ -61,8 +59,14 @@ def check_ents(ent): return any((ea[0] < ent.start_char < ea[1]) or (ea[0] < ent.end_char < ea[1]) for ea in existing_annos_intervals) - for ent in spacy_doc._.ents: - if not check_ents(ent) and check_filters(ent._.cui, cat.config.linking['filters']): + def check_filters(cui, filters): + if cui in filters.cuis or not filters.cuis: + return cui not in filters.cuis_exclude + else: + return False + + for ent in spacy_doc.final_ents: + if not check_ents(ent) and check_filters(ent.cui, cat.config.components.linking.filters): to_add = True for tkn in ent: if tkn in tkns_in: @@ -75,7 +79,7 @@ def check_ents(ent): logger.debug('Found %s annotations to store', len(ents)) for ent in ents: logger.debug('Processing annotation ent %s of %s', ents.index(ent), len(ents)) - label = ent._.cui + label = ent.cui if not Entity.objects.filter(label=label).exists(): # Create the entity @@ -87,8 +91,8 @@ def check_ents(ent): ann_ent = AnnotatedEntity.objects.filter(project=project, document=document, - start_ind=ent.start_char, - end_ind=ent.end_char).first() + start_ind=ent.start_char_index, + end_ind=ent.end_char_index).first() if ann_ent 
is None: # If this entity doesn't exist already ann_ent = AnnotatedEntity() @@ -97,29 +101,31 @@ def check_ents(ent): ann_ent.document = document ann_ent.entity = entity ann_ent.value = ent.text - ann_ent.start_ind = ent.start_char - ann_ent.end_ind = ent.end_char - ann_ent.acc = ent._.context_similarity + ann_ent.start_ind = ent.start_char_index + ann_ent.end_ind = ent.end_char_index + ann_ent.acc = ent.context_similarity - MIN_ACC = cat.config.linking.get('similarity_threshold_trainer', 0.2) - if ent._.context_similarity < MIN_ACC: + MIN_ACC = cat.config.components.linking.similarity_threshold + if ent.context_similarity < MIN_ACC: ann_ent.deleted = True ann_ent.validated = True ann_ent.save() # check the ent._.meta_anns if it exists - if hasattr(ent._, 'meta_anns') and len(metatask2obj) > 0 and len(metataskvals2obj) > 0: - logger.debug('Found %s meta annos on ent', len(ent._.meta_anns.items())) - for meta_ann_task, pred in ent._.meta_anns.items(): - meta_anno_obj = MetaAnnotation() - meta_anno_obj.predicted_meta_task_value = metataskvals2obj[meta_ann_task][pred['value']] - meta_anno_obj.meta_task = metatask2obj[meta_ann_task] - meta_anno_obj.annotated_entity = ann_ent - meta_anno_obj.meta_task_value = metataskvals2obj[meta_ann_task][pred['value']] - meta_anno_obj.acc = pred['confidence'] - meta_anno_obj.save() - logger.debug('Successfully saved %s', meta_anno_obj) + # if hasattr(ent, 'get_addon_data') and \ + # len(metatask2obj) > 0 and + # len(metataskvals2obj) > 0: + # logger.debug('Found %s meta annos on ent', len(ent._.meta_anns.items())) + # for meta_ann_task, pred in ent._.meta_anns.items(): + # meta_anno_obj = MetaAnnotation() + # meta_anno_obj.predicted_meta_task_value = metataskvals2obj[meta_ann_task][pred['value']] + # meta_anno_obj.meta_task = metatask2obj[meta_ann_task] + # meta_anno_obj.annotated_entity = ann_ent + # meta_anno_obj.meta_task_value = metataskvals2obj[meta_ann_task][pred['value']] + # meta_anno_obj.acc = pred['confidence'] + # 
meta_anno_obj.save() + # logger.debug('Successfully saved %s', meta_anno_obj) @@ -206,35 +212,37 @@ def train_medcat(cat, project, document): for ann in anns: cui = ann.entity.label # Indices for this annotation - spacy_entity = tkns_from_doc(spacy_doc=spacy_doc, start=ann.start_ind, end=ann.end_ind) + spacy_entity = [tkn for tkn in spacy_doc if tkn.char_index == ann.start_ind] # This will add the concept if it doesn't exist and if it - #does just link the new name to the concept, if the namee is - #already linked then it will just train. + # does just link the new name to the concept, if the namee is + # already linked then it will just train. manually_created = False if ann.manually_created or ann.alternative: manually_created = True - cat.add_and_train_concept(cui=cui, - name=ann.value, - spacy_doc=spacy_doc, - spacy_entity=spacy_entity, - negative=ann.deleted, - devalue_others=manually_created) + cat.trainer.add_and_train_concept( + cui=cui, + name=ann.value, + mut_doc=spacy_doc, + mut_entity=spacy_entity, + negative=ann.deleted, + devalue_others=manually_created + ) # Completely remove concept names that the user killed killed_anns = AnnotatedEntity.objects.filter(project=project, document=document, killed=True) for ann in killed_anns: cui = ann.entity.label name = ann.value - cat.unlink_concept_name(cui=cui, name=name) + cat.trainer.unlink_concept_name(cui=cui, name=name) # Add irrelevant cuis to cui_exclude irrelevant_anns = AnnotatedEntity.objects.filter(project=project, document=document, irrelevant=True) for ann in irrelevant_anns: cui = ann.entity.label - if 'cuis_exclude' not in cat.config.linking['filters']: - cat.config.linking['filters']['cuis_exclude'] = set() - cat.config.linking['filters'].get('cuis_exclude').update([cui]) + if 'cuis_exclude' not in cat.config.components.linking.filters: + cat.config.components.linking.filters['cuis_exclude'] = set() + cat.config.components.linking.filters.get('cuis_exclude').update([cui]) @background(schedule=1, 
queue='doc_prep') @@ -247,7 +255,7 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int): cat = get_medcat(project=project) # Set CAT filters - cat.config.linking['filters']['cuis'] = project.cuis + cat.config.components.linking.filters.cuis = project.cuis for doc in docs: logger.info(f'Running MedCAT model for project {project.id}:{project.name} over doc: {doc.id}') diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py index 55982b84..729d4b2f 100644 --- a/webapp/api/api/views.py +++ b/webapp/api/api/views.py @@ -10,11 +10,10 @@ from django.shortcuts import render from django.utils import timezone from django_filters import rest_framework as drf -from medcat.utils.helpers import tkns_from_doc from rest_framework import viewsets from rest_framework.decorators import api_view from rest_framework.response import Response -from medcat.utils.ner.deid import DeIdModel +from medcat.components.ner.trf.deid import DeIdModel from .admin import download_projects_with_text, download_projects_without_text, \ import_concepts_from_cdb @@ -282,8 +281,8 @@ def prepare_documents(request): logger.info('loaded medcat model for project: %s', project.id) # Set CAT filters - cat.config.linking['filters']['cuis'] = cuis - + cat.config.components.linking.filters.cuis = cuis + if not project.deid_model_annotation: spacy_doc = cat(document.text) else: @@ -424,9 +423,9 @@ def add_concept(request): if source_val in spacy_doc.text: start = spacy_doc.text.index(source_val) end = start + len(source_val) - spacy_entity = tkns_from_doc(spacy_doc=spacy_doc, start=start, end=end) + spacy_entity = [tkn for tkn in spacy_doc if tkn.idx >= start and tkn.idx <= end] - cat.add_and_train_concept(cui=cui, name=name, name_status='P', spacy_doc=spacy_doc, spacy_entity=spacy_entity) + cat.trainer.add_and_train_concept(cui=cui, name=name, name_status='P', mut_doc=spacy_doc, mut_entity=spacy_entity) id = create_annotation(source_val=source_val, 
selection_occurrence_index=sel_occur_idx, @@ -614,12 +613,12 @@ def annotate_text(request): project = ProjectAnnotateEntities.objects.get(id=p_id) cat = get_medcat(project=project) - cat.config.linking['filters']['cuis'] = set(cuis) + cat.config.components.linking.filters.cuis = set(cuis) spacy_doc = cat(message) ents = [] anno_tkns = [] - for ent in spacy_doc._.ents: + for ent in spacy_doc.final_ents: cnt = Entity.objects.filter(label=ent._.cui).count() inc_ent = all(tkn not in anno_tkns for tkn in ent) if inc_ent and cnt != 0: @@ -713,7 +712,7 @@ def cache_model(request, project_id): return Response(f'Project with id:{project_id} does not exist', 404) except Exception as e: return Response({'message': f'{str(e)}'}, 500) - + @api_view(http_method_names=['GET']) @@ -926,7 +925,7 @@ def cuis_to_concepts(request): def project_progress(request): if request.GET.get('projects') is None: return HttpResponseBadRequest('Cannot get progress for empty projects') - + projects = [int(p) for p in request.GET.get('projects', []).split(',')] projects2datasets = {p.id: (p, p.dataset) for p in [ProjectAnnotateEntities.objects.filter(id=p_id).first() From 63f51a4458346249ebb34753d1a7551f2dc85c33 Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Thu, 19 Jun 2025 11:08:41 +0100 Subject: [PATCH 02/33] updated TODOs --- webapp/api/api/model_cache.py | 2 +- webapp/api/api/utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index ebb12cc3..f8d61bd5 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -57,7 +57,7 @@ def get_medcat_from_cdb_vocab(project, cdb = deserialise(cdb_path) except NotADirectoryError as e: logger.warning("Legacy CDB found, converting to new format") - # this should live in medcat code directly + # TODO: deserialise and write back to the model path? 
cdb = get_cdb_from_old(cdb_path) serialise(cdb, cdb_path) cdb_map[cdb_id] = cdb diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 81a855e8..f4500647 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -112,7 +112,8 @@ def check_filters(cui, filters): ann_ent.save() - # check the ent._.meta_anns if it exists + # TODO: Fix before v2 release. + # check the ent.get_addon_data('meta_cat_meta_anns') if it exists # if hasattr(ent, 'get_addon_data') and \ # len(metatask2obj) > 0 and # len(metataskvals2obj) > 0: From e10d594dbee35460514a6f3e7a44aa3902ed9238 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 23 Jun 2025 10:41:22 +0100 Subject: [PATCH 03/33] Update dependency to medcat v2 --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index 4097dec7..b621d0b9 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat==1.15.* \ No newline at end of file +medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.8.3 \ No newline at end of file From eb676e7d180f7dfe3dda5b9a29600007f2eef485 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 23 Jun 2025 10:48:22 +0100 Subject: [PATCH 04/33] Update CDB/Vocab load to use the load classmethod again --- notebook_docs/API_Examples.ipynb | 8 ++++---- webapp/api/api/admin/actions.py | 6 +++--- webapp/api/api/metrics.py | 6 +++--- webapp/api/api/model_cache.py | 10 ++++------ webapp/api/api/models.py | 8 ++++---- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/notebook_docs/API_Examples.ipynb b/notebook_docs/API_Examples.ipynb index 649a3651..7e09a659 100644 --- a/notebook_docs/API_Examples.ipynb +++ b/notebook_docs/API_Examples.ipynb @@ -262,18 +262,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, 
"metadata": { "tags": [] }, "outputs": [], "source": [ - "from medcat.storage.serialisers import deserialise" + "from medcat.cdb import CDB" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "tags": [] }, @@ -290,7 +290,7 @@ } ], "source": [ - "cdb = deserialise('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')" + "cdb = CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')" ] }, { diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py index 7e9da84a..6123e9ef 100644 --- a/webapp/api/api/admin/actions.py +++ b/webapp/api/api/admin/actions.py @@ -13,7 +13,7 @@ from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB from api.solr_utils import drop_collection, import_all_concepts -from medcat.storage.serialisers import deserialise +from medcat.cdb import CDB logger = logging.getLogger(__name__) @@ -359,7 +359,7 @@ def dataset_document_counts(dataset): @background(schedule=5) def _reset_cdb_filters(id): concept_db = ConceptDB.objects.get(id=id) - cdb = deserialise(concept_db.cdb_file.path) + cdb = CDB.load(concept_db.cdb_file.path) cdb.config.components.linking.filters = {'cuis': set()} cdb.save(concept_db.cdb_file.path) @@ -367,7 +367,7 @@ def _reset_cdb_filters(id): @background(schedule=5) def import_concepts_from_cdb(cdb_model_id: int): cdb_model = ConceptDB.objects.get(id=cdb_model_id) - cdb = deserialise(cdb_model.cdb_file.path) + cdb = CDB.load(cdb_model.cdb_file.path) import_all_concepts(cdb, cdb_model) diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py index b67a7ba8..b67dbe4c 100644 --- a/webapp/api/api/metrics.py +++ b/webapp/api/api/metrics.py @@ -14,7 +14,7 @@ from django.contrib.auth.models import User from django.db.models import QuerySet from medcat.cat import CAT -from medcat.storage.serialisers import deserialise +from medcat.cdb import CDB from medcat.config.config_meta_cat import ConfigMetaCAT from 
medcat.components.addons.meta_cat.meta_cat import MetaCAT from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase @@ -48,8 +48,8 @@ def calculate_metrics(project_ids: List[int], report_name: str): cat = CAT.load_model_pack(projects[0].model_pack.model_pack.path) else: # assume the cdb / vocab is set in these projects - cdb = deserialise(projects[0].concept_db.cdb_file.path) - vocab = deserialise(projects[0].vocab.vocab_file.path) + cdb = CDB.load(projects[0].concept_db.cdb_file.path) + vocab = Vocab.load(projects[0].vocab.vocab_file.path) cat = CAT(cdb, vocab, config=cdb.config) project_data = retrieve_project_data(projects) metrics = ProjectMetrics(project_data, cat) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index f8d61bd5..fa1d8dcd 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -6,7 +6,6 @@ from medcat.cat import CAT from medcat.cdb import CDB from medcat.vocab import Vocab -from medcat.storage.serialisers import deserialise from medcat.utils.legacy.convert_cdb import get_cdb_from_old from api.models import ConceptDB @@ -54,15 +53,14 @@ def get_medcat_from_cdb_vocab(project, else: cdb_path = project.concept_db.cdb_file.path try: - cdb = deserialise(cdb_path) + cdb = CDB.load(cdb_path) except NotADirectoryError as e: logger.warning("Legacy CDB found, converting to new format") # TODO: deserialise and write back to the model path? 
cdb = get_cdb_from_old(cdb_path) - serialise(cdb, cdb_path) + cdb.save(cdb_path) cdb_map[cdb_id] = cdb cdb_path = project.concept_db.cdb_file.path - cdb = deserialise(cdb_path) cdb_map[cdb_id] = cdb except KeyError as ke: @@ -85,7 +83,7 @@ def get_medcat_from_cdb_vocab(project, vocab = vocab_map[vocab_id] else: vocab_path = project.vocab.vocab_file.path - vocab = deserialise(vocab_path) + vocab = Vocab.load(vocab_path) vocab_map[vocab_id] = vocab cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) cat_map[cat_id] = cat @@ -144,7 +142,7 @@ def clear_cached_medcat(project, cat_map: Dict[str, CAT]=CAT_MAP): def get_cached_cdb(cdb_id: str, cdb_map: Dict[str, CDB]=CDB_MAP) -> CDB: if cdb_id not in cdb_map: cdb_obj = ConceptDB.objects.get(id=cdb_id) - cdb = deserialise(cdb_obj.cdb_file.path) + cdb = CDB.load(cdb_obj.cdb_file.path) cdb_map[cdb_id] = cdb return cdb_map[cdb_id] diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py index 625dad0c..ae85ce91 100644 --- a/webapp/api/api/models.py +++ b/webapp/api/api/models.py @@ -12,7 +12,7 @@ from django.forms import forms, ModelForm from medcat.cat import CAT from medcat.cdb import CDB -from medcat.storage.serialisers import deserialise +from medcat.vocab import Vocab from medcat.components.addons.meta_cat.meta_cat import MetaCAT from polymorphic.models import PolymorphicModel @@ -75,7 +75,7 @@ def save(self, *args, **kwargs): vocab_path = os.path.join(unpacked_model_pack_path, "vocab.dat") if os.path.exists(vocab_path): - deserialise(vocab_path) + Vocab.load(vocab_path) vocab = Vocabulary() vocab.vocab_file.name = vocab_path.replace(f'{MEDIA_ROOT}/', '') vocab.save(skip_load=True) @@ -134,7 +134,7 @@ def save(self, *args, skip_load=False, **kwargs): # load the CDB, and raise if this fails - must be saved first so storage handler can rename path if name clashes if not skip_load: try: - deserialise(self.cdb_file.path) + CDB.load(self.cdb_file.path) except Exception as exc: raise MedCATLoadException(f'Failed to load 
Concept DB from {self.cdb_file}, ' f'check if this CDB file successfully loads elsewhere') from exc @@ -157,7 +157,7 @@ def save(self, *args, skip_load=False, **kwargs): # load the Vocab, and raise if this fails if not skip_load: try: - deserialise(self.vocab_file.path) + Vocab.load(self.vocab_file.path) except Exception as exc: raise MedCATLoadException(f'Failed to load Vocab from {self.vocab_file}, ' f'check if this Vocab file successfully loads elsewhere') from exc From 524a1cd2b41140dfc1adbc71b1308a09a8798bf9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 25 Jun 2025 15:16:19 +0100 Subject: [PATCH 05/33] Update requirements - use latest (0.9.0) v2 beta --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index b621d0b9..6d438735 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.8.3 \ No newline at end of file +medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.9.0 From ad796cf1be500835c0f7a2f223095b9b96e1dae1 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 25 Jun 2025 15:16:50 +0100 Subject: [PATCH 06/33] Move away from pkg_resources (deprecated) --- webapp/api/api/model_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index fa1d8dcd..efac353d 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -2,7 +2,7 @@ import os from typing import Dict -import pkg_resources +from medcat import __version__ as mct_version from medcat.cat import CAT from medcat.cdb import CDB from medcat.vocab import Vocab @@ -64,7 +64,7 @@ def get_medcat_from_cdb_vocab(project, cdb_map[cdb_id] = cdb except KeyError as ke: - mc_v = 
pkg_resources.get_distribution('medcat').version + mc_v = mct_version if int(mc_v.split('.')[0]) > 0: logger.error('Attempted to load MedCAT v0.x model with MCTrainer v1.x') raise Exception('Attempted to load MedCAT v0.x model with MCTrainer v1.x', From e6097fecd73c1d0d40445db521b163268cb4aa18 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 26 Jun 2025 12:35:16 +0100 Subject: [PATCH 07/33] Update install target to new version and monorepo install --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index 6d438735..94131d56 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.9.0 +medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.10.0#subdirectory=medcat-v2 From 27f08b603e80b59abea8ce3a6bdab84dfbf35776 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 26 Jun 2025 14:58:05 +0100 Subject: [PATCH 08/33] Use v2 based API for loading addons (MetaCATs) --- webapp/api/api/models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py index ae85ce91..fb4e537d 100644 --- a/webapp/api/api/models.py +++ b/webapp/api/api/models.py @@ -13,7 +13,7 @@ from medcat.cat import CAT from medcat.cdb import CDB from medcat.vocab import Vocab -from medcat.components.addons.meta_cat.meta_cat import MetaCAT +from medcat.components.addons.meta_cat.meta_cat import MetaCAT, MetaCATAddon from polymorphic.models import PolymorphicModel from core.settings import MEDIA_ROOT @@ -89,7 +89,9 @@ def save(self, *args, **kwargs): try: metaCATmodels = [] # should raise an error if there already is a MetaCAT model with this definition - for meta_cat_dir, meta_cat in 
CAT.load_meta_cats(unpacked_model_pack_path): + addons = CAT.load_addons(unpacked_model_pack_path) + meta_cats = [addon for addon in addons if isinstance(addon, MetaCATAddon)] + for meta_cat_dir, meta_cat in meta_cats: mc_model = MetaCATModel() mc_model.meta_cat_dir = meta_cat_dir.replace(f'{MEDIA_ROOT}/', '') mc_model.name = f'{meta_cat.config.general.category_name} - {meta_cat.config.model.model_name}' From cdef1f28c80fcdf7059e6de28a6a4bb089b80859 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 26 Jun 2025 15:00:47 +0100 Subject: [PATCH 09/33] Update MetaCAT loading --- webapp/api/api/models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py index fb4e537d..d2f1a210 100644 --- a/webapp/api/api/models.py +++ b/webapp/api/api/models.py @@ -90,8 +90,11 @@ def save(self, *args, **kwargs): metaCATmodels = [] # should raise an error if there already is a MetaCAT model with this definition addons = CAT.load_addons(unpacked_model_pack_path) - meta_cats = [addon for addon in addons if isinstance(addon, MetaCATAddon)] - for meta_cat_dir, meta_cat in meta_cats: + meta_cat_addons = [ + (addon_path, addon) for addon_path, addon in addons + if isinstance(addon, MetaCATAddon)] + for meta_cat_dir, meta_cat_addon in meta_cat_addons: + meta_cat = meta_cat_addon.mc mc_model = MetaCATModel() mc_model.meta_cat_dir = meta_cat_dir.replace(f'{MEDIA_ROOT}/', '') mc_model.name = f'{meta_cat.config.general.category_name} - {meta_cat.config.model.model_name}' From 4c368bae853a66aa5ced452eed6e76ac93af82e5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 27 Jun 2025 09:30:05 +0100 Subject: [PATCH 10/33] Update requirements to 0.11.0 of medcat v2 beta --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index 94131d56..184c5f97 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ 
django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.10.0#subdirectory=medcat-v2 +medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.0#subdirectory=medcat-v2 From 232ec726b5f9d6c7ea46dc842defe356995045e1 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 27 Jun 2025 13:26:15 +0100 Subject: [PATCH 11/33] Update metrics to v2 format --- webapp/api/api/metrics.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py index b67dbe4c..c84ac6e8 100644 --- a/webapp/api/api/metrics.py +++ b/webapp/api/api/metrics.py @@ -13,10 +13,11 @@ from background_task.models import Task from django.contrib.auth.models import User from django.db.models import QuerySet +from medcat.stats.stats import get_stats from medcat.cat import CAT from medcat.cdb import CDB from medcat.config.config_meta_cat import ConfigMetaCAT -from medcat.components.addons.meta_cat.meta_cat import MetaCAT +from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase from medcat.components.addons.meta_cat.data_utils import prepare_from_json, encode_category_values from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data @@ -113,7 +114,7 @@ def annotation_df(self): """ annotation_df = pd.DataFrame(self.annotations) if self.cat: - annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.cui2preferred_name)) + annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.get_name)) annotation_df['last_modified'] = pd.to_datetime(annotation_df['last_modified']).dt.tz_localize(None) return annotation_df @@ -136,9 +137,10 @@ def concept_summary(self, 
extra_cui_filter=None): concept_count_df['count_variations_ratio'] = round(concept_count_df['concept_count'] / concept_count_df['variations'], 3) if self.cat: - fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = self.cat._print_stats(data=self.mct_export, - use_project_filters=True, - extra_cui_filter=extra_cui_filter) + fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = get_stats(self.cat, + data=self.mct_export, + use_project_filters=True, + extra_cui_filter=extra_cui_filter) # remap tps, fns, fps to specific user annotations examples = self.enrich_medcat_metrics(examples) concept_count_df['fps'] = concept_count_df['cui'].map(fps) @@ -236,10 +238,10 @@ def rename_meta_anns(self, meta_anns2rename=dict(), meta_ann_values2rename=dict( def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict: device = torch.device(config.general['device']) # Create a torch device - batch_size_eval = config.general['batch_size_eval'] - pad_id = config.model['padding_idx'] - ignore_cpos = config.model['ignore_cpos'] - class_weights = config.train['class_weights'] + batch_size_eval = config.general.batch_size_eval + pad_id = config.model.padding_idx + ignore_cpos = config.model.ignore_cpos + class_weights = config.train.class_weights if class_weights is not None: class_weights = torch.FloatTensor(class_weights).to(device) @@ -319,7 +321,7 @@ def full_annotation_df(self) -> pd.DataFrame: for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']: meta_model = meta_model_card['Category Name'] logger.info(f'Checking metacat model: {meta_model}') - _meta_model = MetaCAT.load(self.model_pack_path + '/meta_' + meta_model) + _meta_model = MetaCATAddon.load(self.model_pack_path + '/meta_' + meta_model) meta_results = self._eval(_meta_model, self.mct_export) _meta_values = {v: k for k, v in meta_results['meta_values'].items()} pred_meta_values = [] @@ -381,7 +383,7 @@ def 
meta_anns_concept_summary(self) -> pd.DataFrame: meta_anns_df['total_anns'] = meta_anns_df[col_lst].sum(axis=1) meta_anns_df = meta_anns_df.sort_values(by='total_anns', ascending=False) meta_anns_df = meta_anns_df.rename_axis('cui').reset_index(drop=False) - meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(self.cat.cdb.cui2preferred_name)) + meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(self.cat.cdb.get_name)) return meta_anns_df def generate_report(self, meta_ann=False): From c854ea72feb1221003aab4eb86597a4c0f661fe3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 27 Jun 2025 15:22:58 +0100 Subject: [PATCH 12/33] Do config parsing locally --- webapp/api/api/model_cache.py | 67 ++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index efac353d..e7d6ddb6 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -1,9 +1,12 @@ import logging import os -from typing import Dict +from typing import Dict, Optional, Any + +from pydantic import ValidationError from medcat import __version__ as mct_version from medcat.cat import CAT +from medcat.config.config import Config, SerialisableBaseModel from medcat.cdb import CDB from medcat.vocab import Vocab from medcat.utils.legacy.convert_cdb import get_cdb_from_old @@ -74,7 +77,7 @@ def get_medcat_from_cdb_vocab(project, custom_config = os.getenv("MEDCAT_CONFIG_FILE") if custom_config is not None and os.path.exists(custom_config): - cdb.config.parse_config_file(path=custom_config) + _parse_config_file(cdb.config, custom_config) else: logger.info("No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB") cdb_map[cdb_id] = cdb @@ -91,6 +94,62 @@ def get_medcat_from_cdb_vocab(project, return cat +def _parse_config_file(config: Config, + custom_config_path: str): + # NOTE: the v2 mappings are a little different + mappings = { + "linking": 
"components.linking", + "ner": "components.ner", + } + mappings_key = { + "spacy_model": "nlp.modelname" + } + with open(custom_config_path) as f: + for line in f: + if not line.strip().startswith("cat"): + continue + line = line[4:] + left, right = line.split("=") + variable, key = left.split(".") + variable = variable.strip() + # map to v2 + variable = mappings.get(variable, variable) + key = key.strip() + # key can also differ + key = mappings_key.get(key, key) + value = eval(right) + alt_value = set() if right.strip() in ({}, "{}") else None + + # get (potentially nested in case of v2 mapping) attribute + cnf = config + while "." in variable: + current, variable = variable.split(".", 1) + cnf = getattr(cnf, current) + attr = getattr(cnf, variable) + while "." in key: + cur_key, key = key.split(".", 1) + attr = getattr(attr, cur_key) + if isinstance(attr, SerialisableBaseModel): + _set_value_or_alt(attr, key, value, alt_value) + elif isinstance(attr, dict): + attr[key] = value + else: + raise ValueError(f'Unknown attribute {attr} for "{line}"') + + +def _set_value_or_alt(conf: SerialisableBaseModel, key: str, value: Any, + alt_value: Any, err: Optional[ValidationError] = None) -> None: + try: + setattr(conf, key, value) # hoping for correct type + except ValidationError as ve: + if alt_value is not None: + _set_value_or_alt(conf, key, alt_value, None, err=ve) + elif err is not None: + raise err + else: + raise ve + + def get_medcat_from_model_pack(project, cat_map: Dict[str, CAT]=CAT_MAP) -> CAT: model_pack_obj = project.model_pack cat_id = 'mp' + str(model_pack_obj.id) @@ -111,8 +170,8 @@ def get_medcat(project, else: cat = get_medcat_from_model_pack(project, cat_map) return cat - except AttributeError: - raise Exception('Failure loading Project ConceptDB, Vocab or Model Pack. Are these set correctly?') + except AttributeError as err: + raise Exception('Failure loading Project ConceptDB, Vocab or Model Pack. 
Are these set correctly?') from err def get_cached_medcat(project, cat_map: Dict[str, CAT]=CAT_MAP): From 5ec59fc49a7c2eef9c2236ecc3501526c99ccfee Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 27 Jun 2025 15:36:51 +0100 Subject: [PATCH 13/33] Bump version to latest --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index 184c5f97..e230a260 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.0#subdirectory=medcat-v2 +medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.1#subdirectory=medcat-v2 From 3e6ea19bf277c323aeb202f9f0e7b4f9c39d9f8c Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 27 Jun 2025 16:19:36 +0100 Subject: [PATCH 14/33] Bump version to latest --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index e230a260..311d021e 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.1#subdirectory=medcat-v2 +medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.2#subdirectory=medcat-v2 From 84422fe60b6a4a7e5770c67847c62efb27df6b3b Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 27 Jun 2025 16:33:59 +0100 Subject: [PATCH 15/33] Update to correct attribute name --- webapp/api/api/utils.py | 8 ++++---- webapp/api/api/views.py | 2 +- 2 files changed, 5 insertions(+), 5 
deletions(-) diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index f4500647..eab61afb 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -35,7 +35,7 @@ def remove_annotations(document, project, partial=False): def add_annotations(spacy_doc, user, project, document, existing_annotations, cat): - spacy_doc.final_ents.sort(key=lambda x: len(x.text), reverse=True) + spacy_doc.linked_ents.sort(key=lambda x: len(x.text), reverse=True) tkns_in = [] ents = [] @@ -44,9 +44,9 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca # that can be produced are expected to have available models try: metatask2obj = {task_name: MetaTask.objects.get(name=task_name) - for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()} + for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()} metataskvals2obj = {task_name: {v.name: v for v in MetaTask.objects.get(name=task_name).values.all()} - for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()} + for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()} except (AttributeError, IndexError): # IndexError: ignore if there are no annotations in this doc # AttributeError: ignore meta_anns that are not present - i.e. 
non model pack preds @@ -65,7 +65,7 @@ def check_filters(cui, filters): else: return False - for ent in spacy_doc.final_ents: + for ent in spacy_doc.linked_ents: if not check_ents(ent) and check_filters(ent.cui, cat.config.components.linking.filters): to_add = True for tkn in ent: diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py index 729d4b2f..d53a6465 100644 --- a/webapp/api/api/views.py +++ b/webapp/api/api/views.py @@ -618,7 +618,7 @@ def annotate_text(request): ents = [] anno_tkns = [] - for ent in spacy_doc.final_ents: + for ent in spacy_doc.linked_ents: cnt = Entity.objects.filter(label=ent._.cui).count() inc_ent = all(tkn not in anno_tkns for tkn in ent) if inc_ent and cnt != 0: From 3d94e55281c9b2fcc81ba4a7117f5535848e712f Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 30 Jun 2025 09:07:14 +0100 Subject: [PATCH 16/33] Update solr utils to v2 --- webapp/api/api/solr_utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/webapp/api/api/solr_utils.py b/webapp/api/api/solr_utils.py index 65c5183d..d7d19a77 100644 --- a/webapp/api/api/solr_utils.py +++ b/webapp/api/api/solr_utils.py @@ -6,6 +6,7 @@ import requests from django.http import HttpResponseServerError from medcat.cdb import CDB +from medcat.cdb.concepts import CUIInfo from rest_framework.response import Response from api.models import ConceptDB @@ -128,14 +129,14 @@ def import_all_concepts(cdb: CDB, cdb_model: ConceptDB): if resp.status_code != 200: _solr_error_response(resp, 'Failure creating collection') - cui2name_iter = iter(cdb.cui2names.items()) + cui2info_iter = iter(cdb.cui2info.items()) payload = [] try: while True: for i in range(5000): - cui, name = next(cui2name_iter) - concept_dct = _concept_dct(cui, cdb) + cui, info = next(cui2info_iter) + concept_dct = _concept_dct(cui, cdb, info) payload.append(concept_dct) _upload_payload(f'{base_url}/{collection_name}/update', payload, collection_name) payload = [] @@ -175,7 +176,7 @@ def 
ensure_concept_searchable(cui, cdb: CDB, cdb_model: ConceptDB): resp = requests.get(url) if resp.status_code == 200: collections = json.loads(resp.text)['collections'] - data = [_concept_dct(cui, cdb)] + data = [_concept_dct(cui, cdb, cdb.cui2info[cui])] if collection in collections: _upload_payload(f'{base_url}/{collection}/update', data, collection, commit=True) @@ -190,14 +191,14 @@ def _upload_payload(update_url, data, collection, commit=False): _solr_error_response(resp, f'error updating {collection}') -def _concept_dct(cui: str, cdb: CDB): - synonyms = list(cdb.addl_info.get('cui2original_names', {}).get(cui, set())) +def _concept_dct(cui: str, cdb: CDB, info: CUIInfo): + synonyms = list(info['original_names'] or []) concept_dct = { 'cui': str(cui), 'pretty_name': cdb.get_name(cui), 'name': re.sub(r'\([\w+\s]+\)', '', cdb.get_name(cui)).strip(), - 'type_ids': list(cdb.cui2type_ids[cui]), - 'desc': cdb.addl_info.get('cui2description', {}).get(cui, ''), + 'type_ids': list(info['type_ids']), + 'desc': info['description'], 'synonyms': synonyms if len(synonyms) > 0 else [cdb.get_name(cui)] } return concept_dct From bec4945870a12c7e57dff2ef2a175d40d320eb13 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 30 Jun 2025 09:09:01 +0100 Subject: [PATCH 17/33] Fix config access for v2 --- webapp/api/api/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py index c84ac6e8..37a3c4f4 100644 --- a/webapp/api/api/metrics.py +++ b/webapp/api/api/metrics.py @@ -237,7 +237,7 @@ def rename_meta_anns(self, meta_anns2rename=dict(), meta_ann_values2rename=dict( return def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict: - device = torch.device(config.general['device']) # Create a torch device + device = torch.device(config.general.device) # Create a torch device batch_size_eval = config.general.batch_size_eval pad_id = config.model.padding_idx 
ignore_cpos = config.model.ignore_cpos From 239806b1c2ab9756deb033bbc5c94bf4c2b07ad2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 11:22:10 +0100 Subject: [PATCH 18/33] Remove addons from CDB config upon load --- webapp/api/api/model_cache.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index e7d6ddb6..b3da8aa4 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -65,6 +65,10 @@ def get_medcat_from_cdb_vocab(project, cdb_map[cdb_id] = cdb cdb_path = project.concept_db.cdb_file.path cdb_map[cdb_id] = cdb + # NOTE: when loading a CDB separately, we don't necessarily want to + # load / create addons like MetaCAT as well + logger.info('Clearing addons for CDB upon load: %s', cdb_id) + cdb.config.components.addons.clear() except KeyError as ke: mc_v = mct_version From c40e17b4981889371a4457b913f25bbba4eb0d83 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 11:24:40 +0100 Subject: [PATCH 19/33] Fix syntax error --- webapp/api/api/model_cache.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index b3da8aa4..b52fa3de 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -65,10 +65,6 @@ def get_medcat_from_cdb_vocab(project, cdb_map[cdb_id] = cdb cdb_path = project.concept_db.cdb_file.path cdb_map[cdb_id] = cdb - # NOTE: when loading a CDB separately, we don't necessarily want to - # load / create addons like MetaCAT as well - logger.info('Clearing addons for CDB upon load: %s', cdb_id) - cdb.config.components.addons.clear() except KeyError as ke: mc_v = mct_version @@ -78,6 +74,10 @@ def get_medcat_from_cdb_vocab(project, 'Please re-configure this project to use a MedCAT v1.x CDB or consult the ' 'MedCATTrainer Dev team if you believe this should work') from ke raise + # NOTE: when loading a CDB separately, we don't necessarily want to 
+ # load / create addons like MetaCAT as well + logger.info('Clearing addons for CDB upon load: %s', cdb_id) + cdb.config.components.addons.clear() custom_config = os.getenv("MEDCAT_CONFIG_FILE") if custom_config is not None and os.path.exists(custom_config): From 98efed8f9c6ef2c82c9f510b7cb16adb8d010323 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 11:35:44 +0100 Subject: [PATCH 20/33] Update Meta Annotation getting so as to avoid error if none set --- webapp/api/api/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index eab61afb..30c41ac2 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -10,6 +10,7 @@ from django.dispatch import receiver from medcat.cat import CAT from medcat.components.ner.trf.deid import DeIdModel +from medcat.tokenizing.tokens import UnregisteredDataPathException from .model_cache import get_medcat from .models import Entity, AnnotatedEntity, ProjectAnnotateEntities, \ @@ -47,7 +48,7 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()} metataskvals2obj = {task_name: {v.name: v for v in MetaTask.objects.get(name=task_name).values.all()} for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()} - except (AttributeError, IndexError): + except (AttributeError, IndexError, UnregisteredDataPathException): # IndexError: ignore if there are no annotations in this doc # AttributeError: ignore meta_anns that are not present - i.e. 
non model pack preds # or model pack preds with no meta_anns From 6e2c1c05c598245d0a0933f1575ab816647783c4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 11:50:36 +0100 Subject: [PATCH 21/33] Fix entity CUI / start/end char access --- webapp/api/api/views.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py index d53a6465..26d94ce7 100644 --- a/webapp/api/api/views.py +++ b/webapp/api/api/views.py @@ -619,17 +619,17 @@ def annotate_text(request): ents = [] anno_tkns = [] for ent in spacy_doc.linked_ents: - cnt = Entity.objects.filter(label=ent._.cui).count() + cnt = Entity.objects.filter(label=ent.cui).count() inc_ent = all(tkn not in anno_tkns for tkn in ent) if inc_ent and cnt != 0: anno_tkns.extend([tkn for tkn in ent]) - entity = Entity.objects.get(label=ent._.cui) + entity = Entity.objects.get(label=ent.cui) ents.append({ 'entity': entity.id, - 'value': ent.text, - 'start_ind': ent.start_char, - 'end_ind': ent.end_char, - 'acc': ent._.context_similarity + 'value': ent.base.text, + 'start_ind': ent.base.start_char_index, + 'end_ind': ent.base.end_char_index, + 'acc': ent.context_similarity }) ents.sort(key=lambda e: e['start_ind']) From bac52940f240a373a3d07d68f62938dc4dc631c2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 11:52:18 +0100 Subject: [PATCH 22/33] Fix some more entity detail access --- webapp/api/api/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 30c41ac2..5319be2f 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -57,8 +57,8 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca pass def check_ents(ent): - return any((ea[0] < ent.start_char < ea[1]) or - (ea[0] < ent.end_char < ea[1]) for ea in existing_annos_intervals) + return any((ea[0] < ent.start_char_index < ea[1]) or + (ea[0] < ent.end_char_index < 
ea[1]) for ea in existing_annos_intervals) def check_filters(cui, filters): if cui in filters.cuis or not filters.cuis: From 3ef9805222bb0531069f9e81ed750612902e6a56 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 16:42:09 +0100 Subject: [PATCH 23/33] Remove unigram table error (irrelevant / redundant) --- webapp/api/api/views.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py index 26d94ce7..effccb70 100644 --- a/webapp/api/api/views.py +++ b/webapp/api/api/views.py @@ -460,16 +460,8 @@ def import_cdb_concepts(request): def _submit_document(project: ProjectAnnotateEntities, document: Document): if project.train_model_on_submit: - try: - cat = get_medcat(project=project) - train_medcat(cat, project, document) - except Exception as e: - if project.vocab.id: - if len(VOCAB_MAP[project.vocab.id].unigram_table) == 0: - return Exception('Vocab is missing the unigram table. On the vocab instance ' - 'use vocab.make_unigram_table() to build') - else: - raise e + cat = get_medcat(project=project) + train_medcat(cat, project, document) # Add cuis to filter if they did not exist cuis = [] From c60692dd05dcd883221f34f3c61d287b64e01cee Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 16:46:57 +0100 Subject: [PATCH 24/33] Log more info regarding failure upon document preparation --- webapp/api/api/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py index effccb70..4b61bbcb 100644 --- a/webapp/api/api/views.py +++ b/webapp/api/api/views.py @@ -303,6 +303,7 @@ def prepare_documents(request): project.save() except Exception as e: + logger.warning('Error preparing documents for project %s', p_id, exc_info=e) stack = traceback.format_exc() return Response({'message': e.args[0] if len(e.args) > 0 else 'Internal Server Error', 'description': e.args[1] if len(e.args) > 1 else '', From b9b3c3e25b875ea16dec7173c4db37c58c17d4e7 
Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 1 Jul 2025 20:57:57 +0100 Subject: [PATCH 25/33] Bump dependency version to latest (0.12.0) --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index 311d021e..bf538fba 100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.2#subdirectory=medcat-v2 +medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.12.0#subdirectory=medcat-v2 From d394fb8133e3d533ed8c363320a6cc6c629614ae Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 2 Jul 2025 10:50:49 +0100 Subject: [PATCH 26/33] Centralising clearing CDB addons after explicit load --- webapp/api/api/admin/actions.py | 2 ++ .../api/migrations/0074_auto_20231211_1526.py | 18 ++++++++++++++++++ .../api/migrations/0090_merge_20250623_1330.py | 14 ++++++++++++++ webapp/api/api/model_cache.py | 6 ++---- webapp/api/api/utils.py | 7 +++++++ 5 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 webapp/api/api/migrations/0074_auto_20231211_1526.py create mode 100644 webapp/api/api/migrations/0090_merge_20250623_1330.py diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py index 6123e9ef..e5f7afde 100644 --- a/webapp/api/api/admin/actions.py +++ b/webapp/api/api/admin/actions.py @@ -360,6 +360,7 @@ def dataset_document_counts(dataset): def _reset_cdb_filters(id): concept_db = ConceptDB.objects.get(id=id) cdb = CDB.load(concept_db.cdb_file.path) + # TODO: clear addons cdb.config.components.linking.filters = {'cuis': set()} cdb.save(concept_db.cdb_file.path) @@ -368,6 +369,7 @@ def _reset_cdb_filters(id): def import_concepts_from_cdb(cdb_model_id: int): cdb_model
= ConceptDB.objects.get(id=cdb_model_id) cdb = CDB.load(cdb_model.cdb_file.path) + # TODO: clear addons import_all_concepts(cdb, cdb_model) diff --git a/webapp/api/api/migrations/0074_auto_20231211_1526.py b/webapp/api/api/migrations/0074_auto_20231211_1526.py new file mode 100644 index 00000000..e910b17b --- /dev/null +++ b/webapp/api/api/migrations/0074_auto_20231211_1526.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.28 on 2023-12-11 15:26 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0073_auto_20231022_0028'), + ] + + operations = [ + migrations.AlterField( + model_name='projectmetrics', + name='projects', + field=models.ManyToManyField(blank=True, to='api.ProjectAnnotateEntities'), + ), + ] diff --git a/webapp/api/api/migrations/0090_merge_20250623_1330.py b/webapp/api/api/migrations/0090_merge_20250623_1330.py new file mode 100644 index 00000000..8c502e4a --- /dev/null +++ b/webapp/api/api/migrations/0090_merge_20250623_1330.py @@ -0,0 +1,14 @@ +# Generated by Django 5.1.11 on 2025-06-23 13:30 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0074_auto_20231211_1526'), + ('api', '0089_projectannotateentities_deid_model_annotation_and_more'), + ] + + operations = [ + ] diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index b52fa3de..e2526a87 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -12,6 +12,7 @@ from medcat.utils.legacy.convert_cdb import get_cdb_from_old from api.models import ConceptDB +from .utils import clear_cdb_cnf_addons """ Module level caches for CDBs, Vocabs and CAT instances. 
@@ -74,10 +75,7 @@ def get_medcat_from_cdb_vocab(project, 'Please re-configure this project to use a MedCAT v1.x CDB or consult the ' 'MedCATTrainer Dev team if you believe this should work') from ke raise - # NOTE: when loading a CDB separately, we don't necessarily want to - # load / create addons like MetaCAT as well - logger.info('Clearing addons for CDB upon load: %s', cdb_id) - cdb.config.components.addons.clear() + clear_cdb_cnf_addons(cdb, cdb_id) custom_config = os.getenv("MEDCAT_CONFIG_FILE") if custom_config is not None and os.path.exists(custom_config): diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 5319be2f..6f619c67 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -9,6 +9,7 @@ from django.db.models.signals import post_save from django.dispatch import receiver from medcat.cat import CAT +from medcat.cdb import CDB from medcat.components.ner.trf.deid import DeIdModel from medcat.tokenizing.tokens import UnregisteredDataPathException @@ -130,6 +131,12 @@ def check_filters(cui, filters): # logger.debug('Successfully saved %s', meta_anno_obj) +def clear_cdb_cnf_addons(cdb: CDB, cdb_id: str): + # NOTE: when loading a CDB separately, we don't necessarily want to + # load / create addons like MetaCAT as well + logger.info('Clearing addons for CDB upon load: %s', cdb_id) + cdb.config.components.addons.clear() + def get_create_cdb_infos(cdb, concept, cui, cui_info_prop, code_prop, desc_prop, model_clazz): codes = [c[code_prop] for c in cdb.cui2info.get(cui, {}).get(cui_info_prop, []) if code_prop in c] From 503b7c2637b8f083f89b20f8f6bd0da3b16228ba Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 2 Jul 2025 10:51:14 +0100 Subject: [PATCH 27/33] More specific import --- webapp/api/api/model_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index e2526a87..5fdf37d7 100644 --- a/webapp/api/api/model_cache.py +++ 
b/webapp/api/api/model_cache.py @@ -12,7 +12,7 @@ from medcat.utils.legacy.convert_cdb import get_cdb_from_old from api.models import ConceptDB -from .utils import clear_cdb_cnf_addons +from api.utils import clear_cdb_cnf_addons """ Module level caches for CDBs, Vocabs and CAT instances. From 34854466a90a822256b1e26858e117d6f2bc5f50 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 2 Jul 2025 10:57:32 +0100 Subject: [PATCH 28/33] Clear CDB config addons everywhere if/when applicable --- webapp/api/api/admin/actions.py | 5 +++-- webapp/api/api/metrics.py | 2 ++ webapp/api/api/model_cache.py | 1 + webapp/api/api/utils.py | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py index e5f7afde..1ec12d16 100644 --- a/webapp/api/api/admin/actions.py +++ b/webapp/api/api/admin/actions.py @@ -12,6 +12,7 @@ from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB from api.solr_utils import drop_collection, import_all_concepts +from api.utils import clear_cdb_cnf_addons from medcat.cdb import CDB @@ -360,7 +361,7 @@ def dataset_document_counts(dataset): def _reset_cdb_filters(id): concept_db = ConceptDB.objects.get(id=id) cdb = CDB.load(concept_db.cdb_file.path) - # TODO: clear addons + clear_cdb_cnf_addons(cdb, id) cdb.config.components.linking.filters = {'cuis': set()} cdb.save(concept_db.cdb_file.path) @@ -369,7 +370,7 @@ def _reset_cdb_filters(id): def import_concepts_from_cdb(cdb_model_id: int): cdb_model = ConceptDB.objects.get(id=cdb_model_id) cdb = CDB.load(cdb_model.cdb_file.path) - # TODO: clear addons + clear_cdb_cnf_addons(cdb, cdb_model_id) import_all_concepts(cdb, cdb_model) diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py index 37a3c4f4..bcb8116f 100644 --- a/webapp/api/api/metrics.py +++ b/webapp/api/api/metrics.py @@ -26,6 +26,7 @@ from api.admin import retrieve_project_data from api.models import AnnotatedEntity, 
ProjectAnnotateEntities, ProjectMetrics as AppProjectMetrics +from api.utils import clear_cdb_cnf_addons from core.settings import MEDIA_ROOT _dt_fmt = '%Y-%m-%d %H:%M:%S.%f' @@ -50,6 +51,7 @@ def calculate_metrics(project_ids: List[int], report_name: str): else: # assume the cdb / vocab is set in these projects cdb = CDB.load(projects[0].concept_db.cdb_file.path) + clear_cdb_cnf_addons(cdb, projects[0].concept_db.name) vocab = Vocab.load(projects[0].vocab.vocab_file.path) cat = CAT(cdb, vocab, config=cdb.config) project_data = retrieve_project_data(projects) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index 5fdf37d7..ad010641 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -204,6 +204,7 @@ def get_cached_cdb(cdb_id: str, cdb_map: Dict[str, CDB]=CDB_MAP) -> CDB: if cdb_id not in cdb_map: cdb_obj = ConceptDB.objects.get(id=cdb_id) cdb = CDB.load(cdb_obj.cdb_file.path) + clear_cdb_cnf_addons(cdb, cdb_id) cdb_map[cdb_id] = cdb return cdb_map[cdb_id] diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 6f619c67..483370d4 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -131,7 +131,7 @@ def check_filters(cui, filters): # logger.debug('Successfully saved %s', meta_anno_obj) -def clear_cdb_cnf_addons(cdb: CDB, cdb_id: str): +def clear_cdb_cnf_addons(cdb: CDB, cdb_id: str | int): # NOTE: when loading a CDB separately, we don't necessarily want to # load / create addons like MetaCAT as well logger.info('Clearing addons for CDB upon load: %s', cdb_id) From 0ddfb61f5cf64804bf16ac8f3692e8d83ed1cf1a Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 2 Jul 2025 11:05:10 +0100 Subject: [PATCH 29/33] Avoid circular imports by importing dynamically --- webapp/api/api/model_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index ad010641..e2104851 100644 --- a/webapp/api/api/model_cache.py +++ 
b/webapp/api/api/model_cache.py @@ -12,7 +12,6 @@ from medcat.utils.legacy.convert_cdb import get_cdb_from_old from api.models import ConceptDB -from api.utils import clear_cdb_cnf_addons """ Module level caches for CDBs, Vocabs and CAT instances. @@ -75,6 +74,8 @@ def get_medcat_from_cdb_vocab(project, 'Please re-configure this project to use a MedCAT v1.x CDB or consult the ' 'MedCATTrainer Dev team if you believe this should work') from ke raise + # NOTE: dynamic import to avoid circular imports + from api.utils import clear_cdb_cnf_addons clear_cdb_cnf_addons(cdb, cdb_id) custom_config = os.getenv("MEDCAT_CONFIG_FILE") From be70c2cabd2ab15baaa0bab61786481b06bde8b5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 2 Jul 2025 11:31:55 +0100 Subject: [PATCH 30/33] Correctly set CDB path within v2 model packs --- webapp/api/api/models.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py index d2f1a210..82705af1 100644 --- a/webapp/api/api/models.py +++ b/webapp/api/api/models.py @@ -64,7 +64,12 @@ def save(self, *args, **kwargs): CAT.load_cdb(unpacked_model_pack_path) concept_db = ConceptDB() unpacked_file_name = self.model_pack.file.name.replace('.zip', '') - concept_db.cdb_file.name = os.path.join(unpacked_file_name, 'cdb.dat') + # cdb path for v2 + cdb_path = os.path.join(unpacked_file_name, 'cdb') + if not os.path.exists(cdb_path): + # cdb path for v1 + cdb_path = os.path.join(unpacked_file_name, 'cdb.dat') + concept_db.cdb_file.name = cdb_path concept_db.name = f'{self.name}_CDB' concept_db.save(skip_load=True) self.concept_db = concept_db From 2ba64917ee0b56739c596804f8311faab565d017 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 5 Aug 2025 16:58:14 +0100 Subject: [PATCH 31/33] Update dependency to PyPI-based version --- webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/requirements.txt b/webapp/requirements.txt index bf538fba..837a42d9
100644 --- a/webapp/requirements.txt +++ b/webapp/requirements.txt @@ -6,4 +6,4 @@ django-polymorphic==3.0.* djangorestframework==3.15.* django-background-tasks-updated==1.2.* openpyxl==3.1.2 -medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.12.0#subdirectory=medcat-v2 +medcat[meta-cat,spacy,rel-cat,deid]~=2.0.0b4 From b82a1e36db502cf38048ee4c26d066c6c3859e39 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 6 Aug 2025 13:49:54 +0100 Subject: [PATCH 32/33] Update (very old) notebook to v2 --- notebook_docs/Train_MedCAT_Models.ipynb | 36 +++++++++++++------------ 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/notebook_docs/Train_MedCAT_Models.ipynb b/notebook_docs/Train_MedCAT_Models.ipynb index 3360be55..5a263bf6 100644 --- a/notebook_docs/Train_MedCAT_Models.ipynb +++ b/notebook_docs/Train_MedCAT_Models.ipynb @@ -186,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-09-08T11:27:34.270631Z", @@ -195,9 +195,11 @@ }, "outputs": [], "source": [ + "import json\n", + "\n", "from medcat.cat import CAT\n", "from medcat.cdb import CDB\n", - "from medcat.utils.vocab import Vocab" + "from medcat.vocab import Vocab" ] }, { @@ -310,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-09-08T11:27:59.782731Z", @@ -319,16 +321,14 @@ }, "outputs": [], "source": [ - "cdb = CDB()\n", - "cdb.load_dict(cdb_path)\n", - "vocab = Vocab()\n", - "vocab.load_dict(vocab_path)\n", + "cdb = CDB.load(cdb_path)\n", + "vocab = Vocab.load(vocab_path)\n", "cat = CAT(cdb, vocab)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-09-08T11:37:38.546552Z", @@ -1383,12 +1383,13 @@ } ], "source": [ - 
"cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\",\n", - " nepochs=1,\n", - " lr=0.1,\n", - " anneal=False, # Unless we are reseting the CDB or cui_count this is False\n", - " print_stats=True,\n", - " use_filters=True)" + "with open(\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\") as f:\n", + " data = json.load(f)\n", + "cat.trainer.train_supervised_raw(\n", + " data=data,\n", + " nepochs=1,\n", + " print_stats=True,\n", + " use_filters=True)" ] }, { @@ -1402,7 +1403,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-09-08T15:04:02.394607Z", @@ -1411,14 +1412,14 @@ }, "outputs": [], "source": [ - "from medcat.meta_cat import MetaCAT\n", + "from medcat.components.addons.meta_cat import MetaCAT\n", "from tokenizers import ByteLevelBPETokenizer\n", "from itertools import chain" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2020-09-08T14:46:39.070589Z", @@ -1427,6 +1428,7 @@ }, "outputs": [], "source": [ + "import numpy as np\n", "# Tokenizer instantiation\n", "tokenizer = ByteLevelBPETokenizer(vocab_file='data/medmen-vocab.json', merges_file='data/medmen-merges.txt')\n", "embeddings = np.load(open('data/embeddings.npy', 'rb'))" From 1a4a18dc41030cfbfa22384b70e99513e7c71d8d Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 6 Aug 2025 13:50:58 +0100 Subject: [PATCH 33/33] Update (very old) notebook for v2 installation --- notebook_docs/Train_MedCAT_Models.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebook_docs/Train_MedCAT_Models.ipynb b/notebook_docs/Train_MedCAT_Models.ipynb index 5a263bf6..3b3e74cb 100644 --- a/notebook_docs/Train_MedCAT_Models.ipynb +++ b/notebook_docs/Train_MedCAT_Models.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, 
"outputs": [ { @@ -177,7 +177,7 @@ ], "source": [ "# install medcat\n", - "!pip install medcat\n", + "!pip install \"medcat[spacy,meta-cat,rel-cat,deid]>=2.0.0\"\n", "# scispacy medium models\n", "!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz\n", "# ipywidgets\n",