From 1edcdfbe0aea4b3b056d6da1b28d5ecca5b1f452 Mon Sep 17 00:00:00 2001
From: Tom Searle <tom@cogstack.org>
Date: Thu, 12 Jun 2025 23:29:17 +0100
Subject: [PATCH 01/33] Interim changes for the MedCAT v2 migration

---
 notebook_docs/API_Examples.ipynb        | 32 ++++-----
 notebook_docs/Train_MedCAT_Models.ipynb |  8 +--
 webapp/api/api/admin/actions.py         | 12 ++--
 webapp/api/api/metrics.py               | 26 +++----
 webapp/api/api/model_cache.py           | 18 ++++-
 webapp/api/api/models.py                | 17 ++---
 webapp/api/api/utils.py                 | 90 ++++++++++++++-----------
 webapp/api/api/views.py                 | 19 +++---
 8 files changed, 120 insertions(+), 102 deletions(-)

diff --git a/notebook_docs/API_Examples.ipynb b/notebook_docs/API_Examples.ipynb
index feacc4d4..649a3651 100644
--- a/notebook_docs/API_Examples.ipynb
+++ b/notebook_docs/API_Examples.ipynb
@@ -200,11 +200,11 @@
     "for name, d_s in datasets:\n",
     "    payload = {\n",
     "        'dataset_name': name,   # Name that appears in each\n",
-    "        'dataset': d_s.loc[:, ['name', 'text']].to_dict(),  #  Dictionary representation of only  \n",
+    "        'dataset': d_s.loc[:, ['name', 'text']].to_dict(),  #  Dictionary representation of only\n",
     "        'description': f'{name} first 20 notes from each category' # Description that appears in the trainer\n",
     "    }\n",
     "    resp = requests.post(f'{URL}/api/create-dataset/', json=payload, headers=headers)\n",
-    "    dataset_ids.append(json.loads(resp.text)['dataset_id']) \n",
+    "    dataset_ids.append(json.loads(resp.text)['dataset_id'])\n",
     "# New datasets created in the trainer have the following IDs\n",
     "dataset_ids"
    ]
@@ -268,7 +268,7 @@
    },
    "outputs": [],
    "source": [
-    "from medcat.cdb import CDB"
+    "from medcat.storage.serialisers import deserialise"
    ]
   },
   {
@@ -290,7 +290,7 @@
     }
    ],
    "source": [
-    "CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
+    "cdb = deserialise('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
    ]
   },
   {
@@ -301,8 +301,8 @@
    },
    "outputs": [],
    "source": [
-    "txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
-    "                               data={'name': 'example_cdb', 'use_for_training': True}, \n",
+    "txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
+    "                               data={'name': 'example_cdb', 'use_for_training': True},\n",
     "                               files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
    ]
   },
@@ -342,8 +342,8 @@
    },
    "outputs": [],
    "source": [
-    "txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers, \n",
-    "                               data={'name': 'example_cdb-EDITED', 'use_for_training': True}, \n",
+    "txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers,\n",
+    "                               data={'name': 'example_cdb-EDITED', 'use_for_training': True},\n",
     "                               files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
    ]
   },
@@ -379,8 +379,8 @@
     }
    ],
    "source": [
-    "requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
-    "                               data={'name': 'example_cdb', 'use_for_training': True}, \n",
+    "requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
+    "                               data={'name': 'example_cdb', 'use_for_training': True},\n",
     "                               files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
    ]
   },
@@ -404,7 +404,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers, \n",
+    "txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers,\n",
     "                               files={'cdb_file': open('<<LOCATION OF vocab>>', 'rb')}).text)"
    ]
   },
@@ -465,7 +465,7 @@
     "all_cdbs = json.loads(requests.get(f'{URL}/api/concept-dbs/', headers=headers).text)['results']\n",
     "# the CDB ID we'll use for this example\n",
     "cdb_to_use = all_cdbs[0]['id']\n",
-    "# you might have many CDBs here. First 2 cdbs: \n",
+    "# you might have many CDBs here. First 2 cdbs:\n",
     "all_cdbs[0:2]"
    ]
   },
@@ -521,12 +521,12 @@
     "for d_id, p_name in zip(dataset_ids, project_names):\n",
     "    payload = {\n",
     "        'name': f'{p_name} Annotation Project',\n",
-    "        'description': 'Example projects', \n",
-    "        'cuis': '', \n",
+    "        'description': 'Example projects',\n",
+    "        'cuis': '',\n",
     "        'tuis': '',\n",
     "        'dataset': d_id,\n",
-    "        'concept_db': cdb_to_use, \n",
-    "        'vocab': vocab_to_use, \n",
+    "        'concept_db': cdb_to_use,\n",
+    "        'vocab': vocab_to_use,\n",
     "        'members': users_ids\n",
     "    }\n",
     "    project_ids.append(json.loads(requests.post(f'{URL}/api/project-annotate-entities/', json=payload, headers=headers).text))"
diff --git a/notebook_docs/Train_MedCAT_Models.ipynb b/notebook_docs/Train_MedCAT_Models.ipynb
index c89e352a..3360be55 100644
--- a/notebook_docs/Train_MedCAT_Models.ipynb
+++ b/notebook_docs/Train_MedCAT_Models.ipynb
@@ -382,7 +382,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\r",
+      "\r\n",
       "Epoch: 0, Prec: 0.36538461538461536, Rec: 0.8444444444444444, F1: 0.6049145299145299\n",
       "\n",
       "Docs with false positives: Psych Text 1; Psych Text 2\n",
@@ -1383,11 +1383,11 @@
     }
    ],
    "source": [
-    "cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\", \n",
+    "cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\",\n",
     "                     nepochs=1,\n",
     "                     lr=0.1,\n",
     "                     anneal=False, # Unless we are reseting the CDB or cui_count this is False\n",
-    "                     print_stats=True, \n",
+    "                     print_stats=True,\n",
     "                     use_filters=True)"
    ]
   },
@@ -1443,7 +1443,7 @@
    },
    "outputs": [],
    "source": [
-    "metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings, \n",
+    "metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings,\n",
     "             pad_id=len(embeddings) -1, save_dir='mc_status', device='cpu')"
    ]
   },
diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py
index 57b5a16b..7e9da84a 100644
--- a/webapp/api/api/admin/actions.py
+++ b/webapp/api/api/admin/actions.py
@@ -13,6 +13,8 @@
 from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB
 from api.solr_utils import drop_collection, import_all_concepts
 
+from medcat.storage.serialisers import deserialise
+
 logger = logging.getLogger(__name__)
 
 _dt_fmt = '%Y-%m-%d %H:%M:%S.%f'
@@ -356,20 +358,16 @@ def dataset_document_counts(dataset):
 
 @background(schedule=5)
 def _reset_cdb_filters(id):
-    from medcat.cdb import CDB
     concept_db = ConceptDB.objects.get(id=id)
-    cdb = CDB.load(concept_db.cdb_file.path)
-    cdb.config.linking['filters'] = {'cuis': set()}
+    cdb = deserialise(concept_db.cdb_file.path)
+    cdb.config.components.linking.filters = {'cuis': set()}
     cdb.save(concept_db.cdb_file.path)
 
 
 @background(schedule=5)
 def import_concepts_from_cdb(cdb_model_id: int):
-    from medcat.cdb import CDB
-
     cdb_model = ConceptDB.objects.get(id=cdb_model_id)
-    cdb = CDB.load(cdb_model.cdb_file.path)
-
+    cdb = deserialise(cdb_model.cdb_file.path)
     import_all_concepts(cdb, cdb_model)
 
 
diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py
index 2a87a658..b67a7ba8 100644
--- a/webapp/api/api/metrics.py
+++ b/webapp/api/api/metrics.py
@@ -14,12 +14,12 @@
 from django.contrib.auth.models import User
 from django.db.models import QuerySet
 from medcat.cat import CAT
-from medcat.cdb import CDB
-from medcat.config_meta_cat import ConfigMetaCAT
-from medcat.meta_cat import MetaCAT
-from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase
-from medcat.utils.meta_cat.data_utils import prepare_from_json, encode_category_values
-from medcat.utils.meta_cat.ml_utils import create_batch_piped_data
+from medcat.storage.serialisers import deserialise
+from medcat.config.config_meta_cat import ConfigMetaCAT
+from medcat.components.addons.meta_cat.meta_cat import MetaCAT
+from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase
+from medcat.components.addons.meta_cat.data_utils import prepare_from_json, encode_category_values
+from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data
 from medcat.vocab import Vocab
 from torch import nn
 
@@ -48,8 +48,8 @@ def calculate_metrics(project_ids: List[int], report_name: str):
         cat = CAT.load_model_pack(projects[0].model_pack.model_pack.path)
     else:
         # assume the cdb / vocab is set in these projects
-        cdb = CDB.load(projects[0].concept_db.cdb_file.path)
-        vocab = Vocab.load(projects[0].vocab.vocab_file.path)
+        cdb = deserialise(projects[0].concept_db.cdb_file.path)
+        vocab = deserialise(projects[0].vocab.vocab_file.path)
         cat = CAT(cdb, vocab, config=cdb.config)
     project_data = retrieve_project_data(projects)
     metrics = ProjectMetrics(project_data, cat)
@@ -76,7 +76,7 @@ def __init__(self, mct_export_data: dict, cat: CAT):
         """
         self.mct_export = mct_export_data
         self.cat = cat
-        self.projects2names = {}    
+        self.projects2names = {}
         self.projects2doc_ids = {}
         self.docs2names = {}
         self.docs2texts = {}
@@ -166,21 +166,21 @@ def enrich_medcat_metrics(self, examples):
         """
         for tp in [i for e_i in examples['tp'].values() for i in e_i]:
             try:
-                ann = AnnotatedEntity.objects.get(project_id=tp['project id'], document_id=tp['document id'], 
+                ann = AnnotatedEntity.objects.get(project_id=tp['project id'], document_id=tp['document id'],
                                                   start_ind=tp['start'], end_ind=tp['end'])
                 tp['user'] = ann.user.username
             except:
                 tp['user'] = None
         for fp in (i for e_i in examples['fp'].values() for i in e_i):
             try:
-                ann = AnnotatedEntity.objects.get(project_id=fp['project id'], document_id=fp['document id'], 
+                ann = AnnotatedEntity.objects.get(project_id=fp['project id'], document_id=fp['document id'],
                                                   start_ind=fp['start'], end_ind=fp['end'])
                 fp['user'] = ann.user.username
             except:
                 fp['user'] = None
         for fn in (i for e_i in examples['fn'].values() for i in e_i):
             try:
-                ann = AnnotatedEntity.objects.get(project_id=fn['project id'], document_id=fn['document id'], 
+                ann = AnnotatedEntity.objects.get(project_id=fn['project id'], document_id=fn['document id'],
                                                   start_ind=fn['start'], end_ind=fn['end'])
                 fn['user'] = ann.user.username
             except:
@@ -400,7 +400,7 @@ def generate_report(self, meta_ann=False):
         return {'user_stats': self.user_stats().to_dict('records'),
                 'concept_summary': self.concept_summary(),
                 'annotation_summary': anno_df.to_dict('records'),
-                'meta_anno_summary': meta_anns_summary, 
+                'meta_anno_summary': meta_anns_summary,
                 'projects2doc_ids': self.projects2doc_ids,
                 'docs2text': self.docs2texts,
                 'projects2name': self.projects2names,
diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index b79a4ce0..ebb12cc3 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -6,6 +6,8 @@
 from medcat.cat import CAT
 from medcat.cdb import CDB
 from medcat.vocab import Vocab
+from medcat.storage.serialisers import deserialise
+from medcat.utils.legacy.convert_cdb import get_cdb_from_old
 
 from api.models import ConceptDB
 
@@ -52,7 +54,17 @@ def get_medcat_from_cdb_vocab(project,
         else:
             cdb_path = project.concept_db.cdb_file.path
             try:
-                cdb = CDB.load(cdb_path)
+                cdb = deserialise(cdb_path)
+            except NotADirectoryError as e:
+                logger.warning("Legacy CDB found, converting to new format")
+                # this should live in medcat code directly
+                cdb = get_cdb_from_old(cdb_path)
+                serialise(cdb, cdb_path)
+                cdb_map[cdb_id] = cdb
+                cdb_path = project.concept_db.cdb_file.path
+                cdb = deserialise(cdb_path)
+                cdb_map[cdb_id] = cdb
+
             except KeyError as ke:
                 mc_v = pkg_resources.get_distribution('medcat').version
                 if int(mc_v.split('.')[0]) > 0:
@@ -73,7 +85,7 @@ def get_medcat_from_cdb_vocab(project,
             vocab = vocab_map[vocab_id]
         else:
             vocab_path = project.vocab.vocab_file.path
-            vocab = Vocab.load(vocab_path)
+            vocab = deserialise(vocab_path)
             vocab_map[vocab_id] = vocab
         cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
         cat_map[cat_id] = cat
@@ -132,7 +144,7 @@ def clear_cached_medcat(project, cat_map: Dict[str, CAT]=CAT_MAP):
 def get_cached_cdb(cdb_id: str, cdb_map: Dict[str, CDB]=CDB_MAP) -> CDB:
     if cdb_id not in cdb_map:
         cdb_obj = ConceptDB.objects.get(id=cdb_id)
-        cdb = CDB.load(cdb_obj.cdb_file.path)
+        cdb = deserialise(cdb_obj.cdb_file.path)
         cdb_map[cdb_id] = cdb
     return cdb_map[cdb_id]
 
diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py
index f29bc8b7..625dad0c 100644
--- a/webapp/api/api/models.py
+++ b/webapp/api/api/models.py
@@ -12,8 +12,8 @@
 from django.forms import forms, ModelForm
 from medcat.cat import CAT
 from medcat.cdb import CDB
-from medcat.vocab import Vocab
-from medcat.meta_cat import MetaCAT
+from medcat.storage.serialisers import deserialise
+from medcat.components.addons.meta_cat.meta_cat import MetaCAT
 from polymorphic.models import PolymorphicModel
 
 from core.settings import MEDIA_ROOT
@@ -42,14 +42,14 @@ class ModelPack(models.Model):
     meta_cats = models.ManyToManyField('MetaCATModel', blank=True, default=None)
     create_time = models.DateTimeField(auto_now_add=True)
     last_modified = models.DateTimeField(auto_now=True)
-    last_modified_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=True)   
+    last_modified_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=True)
 
     @transaction.atomic
     def save(self, *args, **kwargs):
         is_new = self._state.adding
         if is_new:
             super().save(*args, **kwargs)
-        
+
         # Process the model pack
         logger.info('Loading model pack: %s', self.model_pack)
         model_pack_name = str(self.model_pack).replace(".zip", "")
@@ -72,9 +72,10 @@ def save(self, *args, **kwargs):
             raise FileNotFoundError(f'Error loading the CDB from this model pack: {self.model_pack.path}') from exc
 
         # Load Vocab
+
         vocab_path = os.path.join(unpacked_model_pack_path, "vocab.dat")
         if os.path.exists(vocab_path):
-            Vocab.load(vocab_path)
+            deserialise(vocab_path)
             vocab = Vocabulary()
             vocab.vocab_file.name = vocab_path.replace(f'{MEDIA_ROOT}/', '')
             vocab.save(skip_load=True)
@@ -98,7 +99,7 @@ def save(self, *args, **kwargs):
             self.meta_cats.set(metaCATmodels)  # Use set() instead of add() for atomic operation
         except Exception as exc:
             raise MedCATLoadException(f'Failure loading MetaCAT models - {unpacked_model_pack_path}') from exc
-            
+
         # Only save if this is an update (not a new instance)
         if not is_new:
             super().save(*args, **kwargs)
@@ -133,7 +134,7 @@ def save(self, *args, skip_load=False, **kwargs):
         # load the CDB, and raise if this fails - must be saved first so storage handler can rename path if name clashes
         if not skip_load:
             try:
-                CDB.load(self.cdb_file.path)
+                deserialise(self.cdb_file.path)
             except Exception as exc:
                 raise MedCATLoadException(f'Failed to load Concept DB from {self.cdb_file}, '
                                           f'check if this CDB file successfully loads elsewhere') from exc
@@ -156,7 +157,7 @@ def save(self, *args, skip_load=False, **kwargs):
         # load the Vocab, and raise if this fails
         if not skip_load:
             try:
-                Vocab.load(self.vocab_file.path)
+                deserialise(self.vocab_file.path)
             except Exception as exc:
                 raise MedCATLoadException(f'Failed to load Vocab from {self.vocab_file}, '
                                           f'check if this Vocab file successfully loads elsewhere') from exc
diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py
index 8d4d64ad..81a855e8 100644
--- a/webapp/api/api/utils.py
+++ b/webapp/api/api/utils.py
@@ -9,9 +9,7 @@
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 from medcat.cat import CAT
-from medcat.utils.filters import check_filters
-from medcat.utils.helpers import tkns_from_doc
-from medcat.utils.ner.deid import DeIdModel
+from medcat.components.ner.trf.deid import DeIdModel
 
 from .model_cache import get_medcat
 from .models import Entity, AnnotatedEntity, ProjectAnnotateEntities, \
@@ -37,7 +35,7 @@ def remove_annotations(document, project, partial=False):
 
 
 def add_annotations(spacy_doc, user, project, document, existing_annotations, cat):
-    spacy_doc._.ents.sort(key=lambda x: len(x.text), reverse=True)
+    spacy_doc.final_ents.sort(key=lambda x: len(x.text), reverse=True)
 
     tkns_in = []
     ents = []
@@ -46,9 +44,9 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca
     # that can be produced are expected to have available models
     try:
         metatask2obj = {task_name: MetaTask.objects.get(name=task_name)
-                        for task_name in spacy_doc._.ents[0]._.meta_anns.keys()}
+                        for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
         metataskvals2obj = {task_name: {v.name: v for v in MetaTask.objects.get(name=task_name).values.all()}
-                            for task_name in spacy_doc._.ents[0]._.meta_anns.keys()}
+                            for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
     except (AttributeError, IndexError):
         # IndexError: ignore if there are no annotations in this doc
         # AttributeError: ignore meta_anns that are not present - i.e. non model pack preds
@@ -61,8 +59,14 @@ def check_ents(ent):
         return any((ea[0] < ent.start_char < ea[1]) or
                    (ea[0] < ent.end_char < ea[1]) for ea in existing_annos_intervals)
 
-    for ent in spacy_doc._.ents:
-        if not check_ents(ent) and check_filters(ent._.cui, cat.config.linking['filters']):
+    def check_filters(cui, filters):
+        if cui in filters.cuis or not filters.cuis:
+            return cui not in filters.cuis_exclude
+        else:
+            return False
+
+    for ent in spacy_doc.final_ents:
+        if not check_ents(ent) and check_filters(ent.cui, cat.config.components.linking.filters):
             to_add = True
             for tkn in ent:
                 if tkn in tkns_in:
@@ -75,7 +79,7 @@ def check_ents(ent):
     logger.debug('Found %s annotations to store', len(ents))
     for ent in ents:
         logger.debug('Processing annotation ent %s of %s', ents.index(ent), len(ents))
-        label = ent._.cui
+        label = ent.cui
 
         if not Entity.objects.filter(label=label).exists():
             # Create the entity
@@ -87,8 +91,8 @@ def check_ents(ent):
 
         ann_ent = AnnotatedEntity.objects.filter(project=project,
                                                   document=document,
-                                                  start_ind=ent.start_char,
-                                                  end_ind=ent.end_char).first()
+                                                  start_ind=ent.start_char_index,
+                                                  end_ind=ent.end_char_index).first()
         if ann_ent is None:
             # If this entity doesn't exist already
             ann_ent = AnnotatedEntity()
@@ -97,29 +101,31 @@ def check_ents(ent):
             ann_ent.document = document
             ann_ent.entity = entity
             ann_ent.value = ent.text
-            ann_ent.start_ind = ent.start_char
-            ann_ent.end_ind = ent.end_char
-            ann_ent.acc = ent._.context_similarity
+            ann_ent.start_ind = ent.start_char_index
+            ann_ent.end_ind = ent.end_char_index
+            ann_ent.acc = ent.context_similarity
 
-            MIN_ACC = cat.config.linking.get('similarity_threshold_trainer', 0.2)
-            if ent._.context_similarity < MIN_ACC:
+            MIN_ACC = cat.config.components.linking.similarity_threshold
+            if ent.context_similarity < MIN_ACC:
                 ann_ent.deleted = True
                 ann_ent.validated = True
 
             ann_ent.save()
 
             # check the ent._.meta_anns if it exists
-            if hasattr(ent._, 'meta_anns') and len(metatask2obj) > 0 and len(metataskvals2obj) > 0:
-                logger.debug('Found %s meta annos on ent', len(ent._.meta_anns.items()))
-                for meta_ann_task, pred in ent._.meta_anns.items():
-                    meta_anno_obj = MetaAnnotation()
-                    meta_anno_obj.predicted_meta_task_value = metataskvals2obj[meta_ann_task][pred['value']]
-                    meta_anno_obj.meta_task = metatask2obj[meta_ann_task]
-                    meta_anno_obj.annotated_entity = ann_ent
-                    meta_anno_obj.meta_task_value = metataskvals2obj[meta_ann_task][pred['value']]
-                    meta_anno_obj.acc = pred['confidence']
-                    meta_anno_obj.save()
-                    logger.debug('Successfully saved %s', meta_anno_obj)
+            # if hasattr(ent, 'get_addon_data') and \
+            #            len(metatask2obj) > 0 and
+            #            len(metataskvals2obj) > 0:
+            #     logger.debug('Found %s meta annos on ent', len(ent._.meta_anns.items()))
+            #     for meta_ann_task, pred in ent._.meta_anns.items():
+            #         meta_anno_obj = MetaAnnotation()
+            #         meta_anno_obj.predicted_meta_task_value = metataskvals2obj[meta_ann_task][pred['value']]
+            #         meta_anno_obj.meta_task = metatask2obj[meta_ann_task]
+            #         meta_anno_obj.annotated_entity = ann_ent
+            #         meta_anno_obj.meta_task_value = metataskvals2obj[meta_ann_task][pred['value']]
+            #         meta_anno_obj.acc = pred['confidence']
+            #         meta_anno_obj.save()
+            #         logger.debug('Successfully saved %s', meta_anno_obj)
 
 
 
@@ -206,35 +212,37 @@ def train_medcat(cat, project, document):
         for ann in anns:
             cui = ann.entity.label
             # Indices for this annotation
-            spacy_entity = tkns_from_doc(spacy_doc=spacy_doc, start=ann.start_ind, end=ann.end_ind)
+            spacy_entity = [tkn for tkn in spacy_doc if tkn.char_index == ann.start_ind]
             # This will add the concept if it doesn't exist and if it
-            #does just link the new name to the concept, if the namee is
-            #already linked then it will just train.
+            # does just link the new name to the concept, if the namee is
+            # already linked then it will just train.
             manually_created = False
             if ann.manually_created or ann.alternative:
                 manually_created = True
 
-            cat.add_and_train_concept(cui=cui,
-                          name=ann.value,
-                          spacy_doc=spacy_doc,
-                          spacy_entity=spacy_entity,
-                          negative=ann.deleted,
-                          devalue_others=manually_created)
+            cat.trainer.add_and_train_concept(
+                cui=cui,
+                name=ann.value,
+                mut_doc=spacy_doc,
+                mut_entity=spacy_entity,
+                negative=ann.deleted,
+                devalue_others=manually_created
+            )
 
     # Completely remove concept names that the user killed
     killed_anns = AnnotatedEntity.objects.filter(project=project, document=document, killed=True)
     for ann in killed_anns:
         cui = ann.entity.label
         name = ann.value
-        cat.unlink_concept_name(cui=cui, name=name)
+        cat.trainer.unlink_concept_name(cui=cui, name=name)
 
     # Add irrelevant cuis to cui_exclude
     irrelevant_anns = AnnotatedEntity.objects.filter(project=project, document=document, irrelevant=True)
     for ann in irrelevant_anns:
         cui = ann.entity.label
-        if 'cuis_exclude' not in cat.config.linking['filters']:
-            cat.config.linking['filters']['cuis_exclude'] = set()
-        cat.config.linking['filters'].get('cuis_exclude').update([cui])
+        if 'cuis_exclude' not in cat.config.components.linking.filters:
+            cat.config.components.linking.filters['cuis_exclude'] = set()
+        cat.config.components.linking.filters.get('cuis_exclude').update([cui])
 
 
 @background(schedule=1, queue='doc_prep')
@@ -247,7 +255,7 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int):
     cat = get_medcat(project=project)
 
     # Set CAT filters
-    cat.config.linking['filters']['cuis'] = project.cuis
+    cat.config.components.linking.filters.cuis = project.cuis
 
     for doc in docs:
         logger.info(f'Running MedCAT model for project {project.id}:{project.name} over doc: {doc.id}')
diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py
index 55982b84..729d4b2f 100644
--- a/webapp/api/api/views.py
+++ b/webapp/api/api/views.py
@@ -10,11 +10,10 @@
 from django.shortcuts import render
 from django.utils import timezone
 from django_filters import rest_framework as drf
-from medcat.utils.helpers import tkns_from_doc
 from rest_framework import viewsets
 from rest_framework.decorators import api_view
 from rest_framework.response import Response
-from medcat.utils.ner.deid import DeIdModel
+from medcat.components.ner.trf.deid import DeIdModel
 
 from .admin import download_projects_with_text, download_projects_without_text, \
     import_concepts_from_cdb
@@ -282,8 +281,8 @@ def prepare_documents(request):
                     logger.info('loaded medcat model for project: %s', project.id)
 
                     # Set CAT filters
-                    cat.config.linking['filters']['cuis'] = cuis
-                    
+                    cat.config.components.linking.filters.cuis = cuis
+
                     if not project.deid_model_annotation:
                         spacy_doc = cat(document.text)
                     else:
@@ -424,9 +423,9 @@ def add_concept(request):
     if source_val in spacy_doc.text:
         start = spacy_doc.text.index(source_val)
         end = start + len(source_val)
-        spacy_entity = tkns_from_doc(spacy_doc=spacy_doc, start=start, end=end)
+        spacy_entity = [tkn for tkn in spacy_doc if tkn.idx >= start and tkn.idx <= end]
 
-    cat.add_and_train_concept(cui=cui, name=name, name_status='P', spacy_doc=spacy_doc, spacy_entity=spacy_entity)
+    cat.trainer.add_and_train_concept(cui=cui, name=name, name_status='P', mut_doc=spacy_doc, mut_entity=spacy_entity)
 
     id = create_annotation(source_val=source_val,
                            selection_occurrence_index=sel_occur_idx,
@@ -614,12 +613,12 @@ def annotate_text(request):
     project = ProjectAnnotateEntities.objects.get(id=p_id)
 
     cat = get_medcat(project=project)
-    cat.config.linking['filters']['cuis'] = set(cuis)
+    cat.config.components.linking.filters.cuis = set(cuis)
     spacy_doc = cat(message)
 
     ents = []
     anno_tkns = []
-    for ent in spacy_doc._.ents:
+    for ent in spacy_doc.final_ents:
         cnt = Entity.objects.filter(label=ent._.cui).count()
         inc_ent = all(tkn not in anno_tkns for tkn in ent)
         if inc_ent and cnt != 0:
@@ -713,7 +712,7 @@ def cache_model(request, project_id):
         return Response(f'Project with id:{project_id} does not exist', 404)
     except Exception as e:
         return Response({'message': f'{str(e)}'}, 500)
-    
+
 
 
 @api_view(http_method_names=['GET'])
@@ -926,7 +925,7 @@ def cuis_to_concepts(request):
 def project_progress(request):
     if request.GET.get('projects') is None:
         return HttpResponseBadRequest('Cannot get progress for empty projects')
-    
+
     projects = [int(p) for p in request.GET.get('projects', []).split(',')]
 
     projects2datasets = {p.id: (p, p.dataset) for p in [ProjectAnnotateEntities.objects.filter(id=p_id).first()

From 63f51a4458346249ebb34753d1a7551f2dc85c33 Mon Sep 17 00:00:00 2001
From: Tom Searle <tom@cogstack.org>
Date: Thu, 19 Jun 2025 11:08:41 +0100
Subject: [PATCH 02/33] Update TODOs

---
 webapp/api/api/model_cache.py | 2 +-
 webapp/api/api/utils.py       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index ebb12cc3..f8d61bd5 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -57,7 +57,7 @@ def get_medcat_from_cdb_vocab(project,
                 cdb = deserialise(cdb_path)
             except NotADirectoryError as e:
                 logger.warning("Legacy CDB found, converting to new format")
-                # this should live in medcat code directly
+                # TODO: deserialise and write back to the model path?
                 cdb = get_cdb_from_old(cdb_path)
                 serialise(cdb, cdb_path)
                 cdb_map[cdb_id] = cdb
diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py
index 81a855e8..f4500647 100644
--- a/webapp/api/api/utils.py
+++ b/webapp/api/api/utils.py
@@ -112,7 +112,8 @@ def check_filters(cui, filters):
 
             ann_ent.save()
 
-            # check the ent._.meta_anns if it exists
+            # TODO: Fix before v2 release.
+            # check the ent.get_addon_data('meta_cat_meta_anns') if it exists
             # if hasattr(ent, 'get_addon_data') and \
             #            len(metatask2obj) > 0 and
             #            len(metataskvals2obj) > 0:

From e10d594dbee35460514a6f3e7a44aa3902ed9238 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 23 Jun 2025 10:41:22 +0100
Subject: [PATCH 03/33] Update dependency to medcat v2

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index 4097dec7..b621d0b9 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat==1.15.*
\ No newline at end of file
+medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.8.3
\ No newline at end of file

From eb676e7d180f7dfe3dda5b9a29600007f2eef485 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 23 Jun 2025 10:48:22 +0100
Subject: [PATCH 04/33] Update CDB/Vocab load to use the load classmethod again

---
 notebook_docs/API_Examples.ipynb |  8 ++++----
 webapp/api/api/admin/actions.py  |  6 +++---
 webapp/api/api/metrics.py        |  6 +++---
 webapp/api/api/model_cache.py    | 10 ++++------
 webapp/api/api/models.py         |  8 ++++----
 5 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/notebook_docs/API_Examples.ipynb b/notebook_docs/API_Examples.ipynb
index 649a3651..7e09a659 100644
--- a/notebook_docs/API_Examples.ipynb
+++ b/notebook_docs/API_Examples.ipynb
@@ -262,18 +262,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "from medcat.storage.serialisers import deserialise"
+    "from medcat.cdb import CDB"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {
     "tags": []
    },
@@ -290,7 +290,7 @@
     }
    ],
    "source": [
-    "cdb = deserialise('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
+    "cdb = CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
    ]
   },
   {
diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py
index 7e9da84a..6123e9ef 100644
--- a/webapp/api/api/admin/actions.py
+++ b/webapp/api/api/admin/actions.py
@@ -13,7 +13,7 @@
 from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB
 from api.solr_utils import drop_collection, import_all_concepts
 
-from medcat.storage.serialisers import deserialise
+from medcat.cdb import CDB
 
 logger = logging.getLogger(__name__)
 
@@ -359,7 +359,7 @@ def dataset_document_counts(dataset):
 @background(schedule=5)
 def _reset_cdb_filters(id):
     concept_db = ConceptDB.objects.get(id=id)
-    cdb = deserialise(concept_db.cdb_file.path)
+    cdb = CDB.load(concept_db.cdb_file.path)
     cdb.config.components.linking.filters = {'cuis': set()}
     cdb.save(concept_db.cdb_file.path)
 
@@ -367,7 +367,7 @@ def _reset_cdb_filters(id):
 @background(schedule=5)
 def import_concepts_from_cdb(cdb_model_id: int):
     cdb_model = ConceptDB.objects.get(id=cdb_model_id)
-    cdb = deserialise(cdb_model.cdb_file.path)
+    cdb = CDB.load(cdb_model.cdb_file.path)
     import_all_concepts(cdb, cdb_model)
 
 
diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py
index b67a7ba8..b67dbe4c 100644
--- a/webapp/api/api/metrics.py
+++ b/webapp/api/api/metrics.py
@@ -14,7 +14,7 @@
 from django.contrib.auth.models import User
 from django.db.models import QuerySet
 from medcat.cat import CAT
-from medcat.storage.serialisers import deserialise
+from medcat.cdb import CDB
 from medcat.config.config_meta_cat import ConfigMetaCAT
 from medcat.components.addons.meta_cat.meta_cat import MetaCAT
 from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase
@@ -48,8 +48,8 @@ def calculate_metrics(project_ids: List[int], report_name: str):
         cat = CAT.load_model_pack(projects[0].model_pack.model_pack.path)
     else:
         # assume the cdb / vocab is set in these projects
-        cdb = deserialise(projects[0].concept_db.cdb_file.path)
-        vocab = deserialise(projects[0].vocab.vocab_file.path)
+        cdb = CDB.load(projects[0].concept_db.cdb_file.path)
+        vocab = Vocab.load(projects[0].vocab.vocab_file.path)
         cat = CAT(cdb, vocab, config=cdb.config)
     project_data = retrieve_project_data(projects)
     metrics = ProjectMetrics(project_data, cat)
diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index f8d61bd5..fa1d8dcd 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -6,7 +6,6 @@
 from medcat.cat import CAT
 from medcat.cdb import CDB
 from medcat.vocab import Vocab
-from medcat.storage.serialisers import deserialise
 from medcat.utils.legacy.convert_cdb import get_cdb_from_old
 
 from api.models import ConceptDB
@@ -54,15 +53,14 @@ def get_medcat_from_cdb_vocab(project,
         else:
             cdb_path = project.concept_db.cdb_file.path
             try:
-                cdb = deserialise(cdb_path)
+                cdb = CDB.load(cdb_path)
             except NotADirectoryError as e:
                 logger.warning("Legacy CDB found, converting to new format")
                 # TODO: deserialise and write back to the model path?
                 cdb = get_cdb_from_old(cdb_path)
-                serialise(cdb, cdb_path)
+                cdb.save(cdb_path)
                 cdb_map[cdb_id] = cdb
                 cdb_path = project.concept_db.cdb_file.path
-                cdb = deserialise(cdb_path)
                 cdb_map[cdb_id] = cdb
 
             except KeyError as ke:
@@ -85,7 +83,7 @@ def get_medcat_from_cdb_vocab(project,
             vocab = vocab_map[vocab_id]
         else:
             vocab_path = project.vocab.vocab_file.path
-            vocab = deserialise(vocab_path)
+            vocab = Vocab.load(vocab_path)
             vocab_map[vocab_id] = vocab
         cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
         cat_map[cat_id] = cat
@@ -144,7 +142,7 @@ def clear_cached_medcat(project, cat_map: Dict[str, CAT]=CAT_MAP):
 def get_cached_cdb(cdb_id: str, cdb_map: Dict[str, CDB]=CDB_MAP) -> CDB:
     if cdb_id not in cdb_map:
         cdb_obj = ConceptDB.objects.get(id=cdb_id)
-        cdb = deserialise(cdb_obj.cdb_file.path)
+        cdb = CDB.load(cdb_obj.cdb_file.path)
         cdb_map[cdb_id] = cdb
     return cdb_map[cdb_id]
 
diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py
index 625dad0c..ae85ce91 100644
--- a/webapp/api/api/models.py
+++ b/webapp/api/api/models.py
@@ -12,7 +12,7 @@
 from django.forms import forms, ModelForm
 from medcat.cat import CAT
 from medcat.cdb import CDB
-from medcat.storage.serialisers import deserialise
+from medcat.vocab import Vocab
 from medcat.components.addons.meta_cat.meta_cat import MetaCAT
 from polymorphic.models import PolymorphicModel
 
@@ -75,7 +75,7 @@ def save(self, *args, **kwargs):
 
         vocab_path = os.path.join(unpacked_model_pack_path, "vocab.dat")
         if os.path.exists(vocab_path):
-            deserialise(vocab_path)
+            Vocab.load(vocab_path)
             vocab = Vocabulary()
             vocab.vocab_file.name = vocab_path.replace(f'{MEDIA_ROOT}/', '')
             vocab.save(skip_load=True)
@@ -134,7 +134,7 @@ def save(self, *args, skip_load=False, **kwargs):
         # load the CDB, and raise if this fails - must be saved first so storage handler can rename path if name clashes
         if not skip_load:
             try:
-                deserialise(self.cdb_file.path)
+                CDB.load(self.cdb_file.path)
             except Exception as exc:
                 raise MedCATLoadException(f'Failed to load Concept DB from {self.cdb_file}, '
                                           f'check if this CDB file successfully loads elsewhere') from exc
@@ -157,7 +157,7 @@ def save(self, *args, skip_load=False, **kwargs):
         # load the Vocab, and raise if this fails
         if not skip_load:
             try:
-                deserialise(self.vocab_file.path)
+                Vocab.load(self.vocab_file.path)
             except Exception as exc:
                 raise MedCATLoadException(f'Failed to load Vocab from {self.vocab_file}, '
                                           f'check if this Vocab file successfully loads elsewhere') from exc

From 524a1cd2b41140dfc1adbc71b1308a09a8798bf9 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 25 Jun 2025 15:16:19 +0100
Subject: [PATCH 05/33] Update requirements - use latest (0.9.0) v2 beta

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index b621d0b9..6d438735 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.8.3
\ No newline at end of file
+medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.9.0

From ad796cf1be500835c0f7a2f223095b9b96e1dae1 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 25 Jun 2025 15:16:50 +0100
Subject: [PATCH 06/33] Move away from pkg_resources (deprecated)

---
 webapp/api/api/model_cache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index fa1d8dcd..efac353d 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -2,7 +2,7 @@
 import os
 from typing import Dict
 
-import pkg_resources
+from medcat import __version__ as mct_version
 from medcat.cat import CAT
 from medcat.cdb import CDB
 from medcat.vocab import Vocab
@@ -64,7 +64,7 @@ def get_medcat_from_cdb_vocab(project,
                 cdb_map[cdb_id] = cdb
 
             except KeyError as ke:
-                mc_v = pkg_resources.get_distribution('medcat').version
+                mc_v = mct_version
                 if int(mc_v.split('.')[0]) > 0:
                     logger.error('Attempted to load MedCAT v0.x model with MCTrainer v1.x')
                     raise Exception('Attempted to load MedCAT v0.x model with MCTrainer v1.x',

From e6097fecd73c1d0d40445db521b163268cb4aa18 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Thu, 26 Jun 2025 12:35:16 +0100
Subject: [PATCH 07/33] Update install target to new version and monorepo
 install

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index 6d438735..94131d56 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/MedCAT2@v0.9.0
+medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.10.0#subdirectory=medcat-v2

From 27f08b603e80b59abea8ce3a6bdab84dfbf35776 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Thu, 26 Jun 2025 14:58:05 +0100
Subject: [PATCH 08/33] Use v2 based API for loading addons (MetaCATs)

---
 webapp/api/api/models.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py
index ae85ce91..fb4e537d 100644
--- a/webapp/api/api/models.py
+++ b/webapp/api/api/models.py
@@ -13,7 +13,7 @@
 from medcat.cat import CAT
 from medcat.cdb import CDB
 from medcat.vocab import Vocab
-from medcat.components.addons.meta_cat.meta_cat import MetaCAT
+from medcat.components.addons.meta_cat.meta_cat import MetaCAT, MetaCATAddon
 from polymorphic.models import PolymorphicModel
 
 from core.settings import MEDIA_ROOT
@@ -89,7 +89,9 @@ def save(self, *args, **kwargs):
         try:
             metaCATmodels = []
             # should raise an error if there already is a MetaCAT model with this definition
-            for meta_cat_dir, meta_cat in CAT.load_meta_cats(unpacked_model_pack_path):
+            addons = CAT.load_addons(unpacked_model_pack_path)
+            meta_cats = [addon for addon in addons if isinstance(addon, MetaCATAddon)]
+            for meta_cat_dir, meta_cat in meta_cats:
                 mc_model = MetaCATModel()
                 mc_model.meta_cat_dir = meta_cat_dir.replace(f'{MEDIA_ROOT}/', '')
                 mc_model.name = f'{meta_cat.config.general.category_name} - {meta_cat.config.model.model_name}'

From cdef1f28c80fcdf7059e6de28a6a4bb089b80859 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Thu, 26 Jun 2025 15:00:47 +0100
Subject: [PATCH 09/33] Update MetaCAT loading

---
 webapp/api/api/models.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py
index fb4e537d..d2f1a210 100644
--- a/webapp/api/api/models.py
+++ b/webapp/api/api/models.py
@@ -90,8 +90,11 @@ def save(self, *args, **kwargs):
             metaCATmodels = []
             # should raise an error if there already is a MetaCAT model with this definition
             addons = CAT.load_addons(unpacked_model_pack_path)
-            meta_cats = [addon for addon in addons if isinstance(addon, MetaCATAddon)]
-            for meta_cat_dir, meta_cat in meta_cats:
+            meta_cat_addons = [
+                (addon_path, addon) for addon_path, addon in addons
+                if isinstance(addon, MetaCATAddon)]
+            for meta_cat_dir, meta_cat_addon in meta_cat_addons:
+                meta_cat = meta_cat_addon.mc
                 mc_model = MetaCATModel()
                 mc_model.meta_cat_dir = meta_cat_dir.replace(f'{MEDIA_ROOT}/', '')
                 mc_model.name = f'{meta_cat.config.general.category_name} - {meta_cat.config.model.model_name}'

From 4c368bae853a66aa5ced452eed6e76ac93af82e5 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Fri, 27 Jun 2025 09:30:05 +0100
Subject: [PATCH 10/33] Update requirements to 0.11.0 of medcat v2 beta

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index 94131d56..184c5f97 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.10.0#subdirectory=medcat-v2
+medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.0#subdirectory=medcat-v2

From 232ec726b5f9d6c7ea46dc842defe356995045e1 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Fri, 27 Jun 2025 13:26:15 +0100
Subject: [PATCH 11/33] Update metrics to v2 format

---
 webapp/api/api/metrics.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py
index b67dbe4c..c84ac6e8 100644
--- a/webapp/api/api/metrics.py
+++ b/webapp/api/api/metrics.py
@@ -13,10 +13,11 @@
 from background_task.models import Task
 from django.contrib.auth.models import User
 from django.db.models import QuerySet
+from medcat.stats.stats import get_stats
 from medcat.cat import CAT
 from medcat.cdb import CDB
 from medcat.config.config_meta_cat import ConfigMetaCAT
-from medcat.components.addons.meta_cat.meta_cat import MetaCAT
+from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon
 from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase
 from medcat.components.addons.meta_cat.data_utils import prepare_from_json, encode_category_values
 from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data
@@ -113,7 +114,7 @@ def annotation_df(self):
         """
         annotation_df = pd.DataFrame(self.annotations)
         if self.cat:
-            annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.cui2preferred_name))
+            annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.get_name))
         annotation_df['last_modified'] = pd.to_datetime(annotation_df['last_modified']).dt.tz_localize(None)
         return annotation_df
 
@@ -136,9 +137,10 @@ def concept_summary(self, extra_cui_filter=None):
         concept_count_df['count_variations_ratio'] = round(concept_count_df['concept_count'] /
                                                            concept_count_df['variations'], 3)
         if self.cat:
-            fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = self.cat._print_stats(data=self.mct_export,
-                                                                                                   use_project_filters=True,
-                                                                                                   extra_cui_filter=extra_cui_filter)
+            fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = get_stats(self.cat,
+                                                                                       data=self.mct_export,
+                                                                                       use_project_filters=True,
+                                                                                       extra_cui_filter=extra_cui_filter)
             # remap tps, fns, fps to specific user annotations
             examples = self.enrich_medcat_metrics(examples)
             concept_count_df['fps'] = concept_count_df['cui'].map(fps)
@@ -236,10 +238,10 @@ def rename_meta_anns(self, meta_anns2rename=dict(), meta_ann_values2rename=dict(
 
     def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict:
         device = torch.device(config.general['device'])  # Create a torch device
-        batch_size_eval = config.general['batch_size_eval']
-        pad_id = config.model['padding_idx']
-        ignore_cpos = config.model['ignore_cpos']
-        class_weights = config.train['class_weights']
+        batch_size_eval = config.general.batch_size_eval
+        pad_id = config.model.padding_idx
+        ignore_cpos = config.model.ignore_cpos
+        class_weights = config.train.class_weights
 
         if class_weights is not None:
             class_weights = torch.FloatTensor(class_weights).to(device)
@@ -319,7 +321,7 @@ def full_annotation_df(self) -> pd.DataFrame:
         for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']:
             meta_model = meta_model_card['Category Name']
             logger.info(f'Checking metacat model: {meta_model}')
-            _meta_model = MetaCAT.load(self.model_pack_path + '/meta_' + meta_model)
+            _meta_model = MetaCATAddon.load(self.model_pack_path + '/meta_' + meta_model)
             meta_results = self._eval(_meta_model, self.mct_export)
             _meta_values = {v: k for k, v in meta_results['meta_values'].items()}
             pred_meta_values = []
@@ -381,7 +383,7 @@ def meta_anns_concept_summary(self) -> pd.DataFrame:
         meta_anns_df['total_anns'] = meta_anns_df[col_lst].sum(axis=1)
         meta_anns_df = meta_anns_df.sort_values(by='total_anns', ascending=False)
         meta_anns_df = meta_anns_df.rename_axis('cui').reset_index(drop=False)
-        meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(self.cat.cdb.cui2preferred_name))
+        meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(self.cat.cdb.get_name))
         return meta_anns_df
 
     def generate_report(self, meta_ann=False):

From c854ea72feb1221003aab4eb86597a4c0f661fe3 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Fri, 27 Jun 2025 15:22:58 +0100
Subject: [PATCH 12/33] Do config parsing locally

---
 webapp/api/api/model_cache.py | 67 ++++++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 4 deletions(-)

diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index efac353d..e7d6ddb6 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -1,9 +1,12 @@
 import logging
 import os
-from typing import Dict
+from typing import Dict, Optional, Any
+
+from pydantic import ValidationError
 
 from medcat import __version__ as mct_version
 from medcat.cat import CAT
+from medcat.config.config import Config, SerialisableBaseModel
 from medcat.cdb import CDB
 from medcat.vocab import Vocab
 from medcat.utils.legacy.convert_cdb import get_cdb_from_old
@@ -74,7 +77,7 @@ def get_medcat_from_cdb_vocab(project,
 
             custom_config = os.getenv("MEDCAT_CONFIG_FILE")
             if custom_config is not None and os.path.exists(custom_config):
-                cdb.config.parse_config_file(path=custom_config)
+                _parse_config_file(cdb.config, custom_config)
             else:
                 logger.info("No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB")
             cdb_map[cdb_id] = cdb
@@ -91,6 +94,62 @@ def get_medcat_from_cdb_vocab(project,
     return cat
 
 
+def _parse_config_file(config: Config,
+                       custom_config_path: str):
+    # NOTE: the v2 mappings are a little different
+    mappings = {
+        "linking": "components.linking",
+        "ner": "components.ner",
+    }
+    mappings_key = {
+        "spacy_model": "nlp.modelname"
+    }
+    with open(custom_config_path) as f:
+        for line in f:
+            if not line.strip().startswith("cat"):
+                continue
+            line = line[4:]
+            left, right = line.split("=")
+            variable, key = left.split(".")
+            variable = variable.strip()
+            # map to v2
+            variable = mappings.get(variable, variable)
+            key = key.strip()
+            # key can also differ
+            key = mappings_key.get(key, key)
+            value = eval(right)
+            alt_value = set() if right.strip() in ({}, "{}") else None
+
+            # get (potentially nested in case of v2 mapping) attribute
+            cnf = config
+            while "." in variable:
+                current, variable = variable.split(".", 1)
+                cnf = getattr(cnf, current)
+            attr = getattr(cnf, variable)
+            while "." in key:
+                cur_key, key = key.split(".", 1)
+                attr = getattr(attr, cur_key)
+            if isinstance(attr, SerialisableBaseModel):
+                _set_value_or_alt(attr, key, value, alt_value)
+            elif isinstance(attr, dict):
+                attr[key] = value
+            else:
+                raise ValueError(f'Unknown attribute {attr} for "{line}"')
+
+
+def _set_value_or_alt(conf: SerialisableBaseModel, key: str, value: Any,
+                      alt_value: Any, err: Optional[ValidationError] = None) -> None:
+    try:
+        setattr(conf, key, value) # hoping for correct type
+    except ValidationError as ve:
+        if alt_value is not None:
+            _set_value_or_alt(conf, key, alt_value, None, err=ve)
+        elif err is not None:
+            raise err
+        else:
+            raise ve
+
+
 def get_medcat_from_model_pack(project, cat_map: Dict[str, CAT]=CAT_MAP) -> CAT:
     model_pack_obj = project.model_pack
     cat_id = 'mp' + str(model_pack_obj.id)
@@ -111,8 +170,8 @@ def get_medcat(project,
         else:
             cat = get_medcat_from_model_pack(project, cat_map)
         return cat
-    except AttributeError:
-        raise Exception('Failure loading Project ConceptDB, Vocab or Model Pack. Are these set correctly?')
+    except AttributeError as err:
+        raise Exception('Failure loading Project ConceptDB, Vocab or Model Pack. Are these set correctly?') from err
 
 
 def get_cached_medcat(project, cat_map: Dict[str, CAT]=CAT_MAP):

From 5ec59fc49a7c2eef9c2236ecc3501526c99ccfee Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Fri, 27 Jun 2025 15:36:51 +0100
Subject: [PATCH 13/33] Bump version to latest

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index 184c5f97..e230a260 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.0#subdirectory=medcat-v2
+medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.1#subdirectory=medcat-v2

From 3e6ea19bf277c323aeb202f9f0e7b4f9c39d9f8c Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Fri, 27 Jun 2025 16:19:36 +0100
Subject: [PATCH 14/33] Bump version to latest

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index e230a260..311d021e 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.1#subdirectory=medcat-v2
+medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.2#subdirectory=medcat-v2

From 84422fe60b6a4a7e5770c67847c62efb27df6b3b Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Fri, 27 Jun 2025 16:33:59 +0100
Subject: [PATCH 15/33] Update to correct attribute name

---
 webapp/api/api/utils.py | 8 ++++----
 webapp/api/api/views.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py
index f4500647..eab61afb 100644
--- a/webapp/api/api/utils.py
+++ b/webapp/api/api/utils.py
@@ -35,7 +35,7 @@ def remove_annotations(document, project, partial=False):
 
 
 def add_annotations(spacy_doc, user, project, document, existing_annotations, cat):
-    spacy_doc.final_ents.sort(key=lambda x: len(x.text), reverse=True)
+    spacy_doc.linked_ents.sort(key=lambda x: len(x.text), reverse=True)
 
     tkns_in = []
     ents = []
@@ -44,9 +44,9 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca
     # that can be produced are expected to have available models
     try:
         metatask2obj = {task_name: MetaTask.objects.get(name=task_name)
-                        for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
+                        for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
         metataskvals2obj = {task_name: {v.name: v for v in MetaTask.objects.get(name=task_name).values.all()}
-                            for task_name in spacy_doc.final_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
+                            for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
     except (AttributeError, IndexError):
         # IndexError: ignore if there are no annotations in this doc
         # AttributeError: ignore meta_anns that are not present - i.e. non model pack preds
@@ -65,7 +65,7 @@ def check_filters(cui, filters):
         else:
             return False
 
-    for ent in spacy_doc.final_ents:
+    for ent in spacy_doc.linked_ents:
         if not check_ents(ent) and check_filters(ent.cui, cat.config.components.linking.filters):
             to_add = True
             for tkn in ent:
diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py
index 729d4b2f..d53a6465 100644
--- a/webapp/api/api/views.py
+++ b/webapp/api/api/views.py
@@ -618,7 +618,7 @@ def annotate_text(request):
 
     ents = []
     anno_tkns = []
-    for ent in spacy_doc.final_ents:
+    for ent in spacy_doc.linked_ents:
         cnt = Entity.objects.filter(label=ent._.cui).count()
         inc_ent = all(tkn not in anno_tkns for tkn in ent)
         if inc_ent and cnt != 0:

From 3d94e55281c9b2fcc81ba4a7117f5535848e712f Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 30 Jun 2025 09:07:14 +0100
Subject: [PATCH 16/33] Update solr utils to v2

---
 webapp/api/api/solr_utils.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/webapp/api/api/solr_utils.py b/webapp/api/api/solr_utils.py
index 65c5183d..d7d19a77 100644
--- a/webapp/api/api/solr_utils.py
+++ b/webapp/api/api/solr_utils.py
@@ -6,6 +6,7 @@
 import requests
 from django.http import HttpResponseServerError
 from medcat.cdb import CDB
+from medcat.cdb.concepts import CUIInfo
 from rest_framework.response import Response
 
 from api.models import ConceptDB
@@ -128,14 +129,14 @@ def import_all_concepts(cdb: CDB, cdb_model: ConceptDB):
     if resp.status_code != 200:
         _solr_error_response(resp, 'Failure creating collection')
 
-    cui2name_iter = iter(cdb.cui2names.items())
+    cui2info_iter = iter(cdb.cui2info.items())
 
     payload = []
     try:
         while True:
             for i in range(5000):
-                cui, name = next(cui2name_iter)
-                concept_dct = _concept_dct(cui, cdb)
+                cui, info = next(cui2info_iter)
+                concept_dct = _concept_dct(cui, cdb, info)
                 payload.append(concept_dct)
             _upload_payload(f'{base_url}/{collection_name}/update', payload, collection_name)
             payload = []
@@ -175,7 +176,7 @@ def ensure_concept_searchable(cui, cdb: CDB, cdb_model: ConceptDB):
     resp = requests.get(url)
     if resp.status_code == 200:
         collections = json.loads(resp.text)['collections']
-        data = [_concept_dct(cui, cdb)]
+        data = [_concept_dct(cui, cdb, cdb.cui2info[cui])]
         if collection in collections:
             _upload_payload(f'{base_url}/{collection}/update', data, collection, commit=True)
 
@@ -190,14 +191,14 @@ def _upload_payload(update_url, data, collection, commit=False):
         _solr_error_response(resp, f'error updating {collection}')
 
 
-def _concept_dct(cui: str, cdb: CDB):
-    synonyms = list(cdb.addl_info.get('cui2original_names', {}).get(cui, set()))
+def _concept_dct(cui: str, cdb: CDB, info: CUIInfo):
+    synonyms = list(info['original_names'] or [])
     concept_dct = {
         'cui': str(cui),
         'pretty_name': cdb.get_name(cui),
         'name': re.sub(r'\([\w+\s]+\)', '', cdb.get_name(cui)).strip(),
-        'type_ids': list(cdb.cui2type_ids[cui]),
-        'desc': cdb.addl_info.get('cui2description', {}).get(cui, ''),
+        'type_ids': list(info['type_ids']),
+        'desc': info['description'],
         'synonyms': synonyms if len(synonyms) > 0 else [cdb.get_name(cui)]
     }
     return concept_dct

From bec4945870a12c7e57dff2ef2a175d40d320eb13 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 30 Jun 2025 09:09:01 +0100
Subject: [PATCH 17/33] Fix config access for v2

---
 webapp/api/api/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py
index c84ac6e8..37a3c4f4 100644
--- a/webapp/api/api/metrics.py
+++ b/webapp/api/api/metrics.py
@@ -237,7 +237,7 @@ def rename_meta_anns(self, meta_anns2rename=dict(), meta_ann_values2rename=dict(
         return
 
     def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict:
-        device = torch.device(config.general['device'])  # Create a torch device
+        device = torch.device(config.general.device)  # Create a torch device
         batch_size_eval = config.general.batch_size_eval
         pad_id = config.model.padding_idx
         ignore_cpos = config.model.ignore_cpos

From 239806b1c2ab9756deb033bbc5c94bf4c2b07ad2 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 11:22:10 +0100
Subject: [PATCH 18/33] Remove addons from CDB config upon load

---
 webapp/api/api/model_cache.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index e7d6ddb6..b3da8aa4 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -65,6 +65,10 @@ def get_medcat_from_cdb_vocab(project,
                 cdb_map[cdb_id] = cdb
                 cdb_path = project.concept_db.cdb_file.path
                 cdb_map[cdb_id] = cdb
+            # NOTE: when loading a CDB separately, we don't necessarily want to
+            #       load / create addons like MetaCAT as well
+            logger.info('Clearing addons for CDB upon load: %s', cdb_id)
+            cdb.config.components.addons.clear()
 
             except KeyError as ke:
                 mc_v = mct_version

From c40e17b4981889371a4457b913f25bbba4eb0d83 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 11:24:40 +0100
Subject: [PATCH 19/33] Fix syntax error

---
 webapp/api/api/model_cache.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index b3da8aa4..b52fa3de 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -65,10 +65,6 @@ def get_medcat_from_cdb_vocab(project,
                 cdb_map[cdb_id] = cdb
                 cdb_path = project.concept_db.cdb_file.path
                 cdb_map[cdb_id] = cdb
-            # NOTE: when loading a CDB separately, we don't necessarily want to
-            #       load / create addons like MetaCAT as well
-            logger.info('Clearing addons for CDB upon load: %s', cdb_id)
-            cdb.config.components.addons.clear()
 
             except KeyError as ke:
                 mc_v = mct_version
@@ -78,6 +74,10 @@ def get_medcat_from_cdb_vocab(project,
                                     'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                                     'MedCATTrainer Dev team if you believe this should work') from ke
                 raise
+            # NOTE: when loading a CDB separately, we don't necessarily want to
+            #       load / create addons like MetaCAT as well
+            logger.info('Clearing addons for CDB upon load: %s', cdb_id)
+            cdb.config.components.addons.clear()
 
             custom_config = os.getenv("MEDCAT_CONFIG_FILE")
             if custom_config is not None and os.path.exists(custom_config):

From 98efed8f9c6ef2c82c9f510b7cb16adb8d010323 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 11:35:44 +0100
Subject: [PATCH 20/33] Update Meta Annotation getting so as to avoid error if
 none set

---
 webapp/api/api/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py
index eab61afb..30c41ac2 100644
--- a/webapp/api/api/utils.py
+++ b/webapp/api/api/utils.py
@@ -10,6 +10,7 @@
 from django.dispatch import receiver
 from medcat.cat import CAT
 from medcat.components.ner.trf.deid import DeIdModel
+from medcat.tokenizing.tokens import UnregisteredDataPathException
 
 from .model_cache import get_medcat
 from .models import Entity, AnnotatedEntity, ProjectAnnotateEntities, \
@@ -47,7 +48,7 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca
                         for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
         metataskvals2obj = {task_name: {v.name: v for v in MetaTask.objects.get(name=task_name).values.all()}
                             for task_name in spacy_doc.linked_ents[0].get_addon_data('meta_cat_meta_anns').keys()}
-    except (AttributeError, IndexError):
+    except (AttributeError, IndexError, UnregisteredDataPathException):
         # IndexError: ignore if there are no annotations in this doc
         # AttributeError: ignore meta_anns that are not present - i.e. non model pack preds
         # or model pack preds with no meta_anns

From 6e2c1c05c598245d0a0933f1575ab816647783c4 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 11:50:36 +0100
Subject: [PATCH 21/33] Fix entity CUI / start/end char access

---
 webapp/api/api/views.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py
index d53a6465..26d94ce7 100644
--- a/webapp/api/api/views.py
+++ b/webapp/api/api/views.py
@@ -619,17 +619,17 @@ def annotate_text(request):
     ents = []
     anno_tkns = []
     for ent in spacy_doc.linked_ents:
-        cnt = Entity.objects.filter(label=ent._.cui).count()
+        cnt = Entity.objects.filter(label=ent.cui).count()
         inc_ent = all(tkn not in anno_tkns for tkn in ent)
         if inc_ent and cnt != 0:
             anno_tkns.extend([tkn for tkn in ent])
-            entity = Entity.objects.get(label=ent._.cui)
+            entity = Entity.objects.get(label=ent.cui)
             ents.append({
                 'entity': entity.id,
-                'value': ent.text,
-                'start_ind': ent.start_char,
-                'end_ind': ent.end_char,
-                'acc': ent._.context_similarity
+                'value': ent.base.text,
+                'start_ind': ent.base.start_char_index,
+                'end_ind': ent.base.end_char_index,
+                'acc': ent.context_similarity
             })
 
     ents.sort(key=lambda e: e['start_ind'])

From bac52940f240a373a3d07d68f62938dc4dc631c2 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 11:52:18 +0100
Subject: [PATCH 22/33] Fix some more entity detail access

---
 webapp/api/api/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py
index 30c41ac2..5319be2f 100644
--- a/webapp/api/api/utils.py
+++ b/webapp/api/api/utils.py
@@ -57,8 +57,8 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca
         pass
 
     def check_ents(ent):
-        return any((ea[0] < ent.start_char < ea[1]) or
-                   (ea[0] < ent.end_char < ea[1]) for ea in existing_annos_intervals)
+        return any((ea[0] < ent.start_char_index < ea[1]) or
+                   (ea[0] < ent.end_char_index < ea[1]) for ea in existing_annos_intervals)
 
     def check_filters(cui, filters):
         if cui in filters.cuis or not filters.cuis:

From 3ef9805222bb0531069f9e81ed750612902e6a56 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 16:42:09 +0100
Subject: [PATCH 23/33] Remove unigram table error (irrelevant / redundant)

---
 webapp/api/api/views.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py
index 26d94ce7..effccb70 100644
--- a/webapp/api/api/views.py
+++ b/webapp/api/api/views.py
@@ -460,16 +460,8 @@ def import_cdb_concepts(request):
 
 def _submit_document(project: ProjectAnnotateEntities, document: Document):
     if project.train_model_on_submit:
-        try:
-            cat = get_medcat(project=project)
-            train_medcat(cat, project, document)
-        except Exception as e:
-            if project.vocab.id:
-                if len(VOCAB_MAP[project.vocab.id].unigram_table) == 0:
-                    return Exception('Vocab is missing the unigram table. On the vocab instance '
-                                     'use vocab.make_unigram_table() to build')
-            else:
-                raise e
+        cat = get_medcat(project=project)
+        train_medcat(cat, project, document)
 
     # Add cuis to filter if they did not exist
     cuis = []

From c60692dd05dcd883221f34f3c61d287b64e01cee Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 16:46:57 +0100
Subject: [PATCH 24/33] Log more info regarding failure upon document
 preparation

---
 webapp/api/api/views.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py
index effccb70..4b61bbcb 100644
--- a/webapp/api/api/views.py
+++ b/webapp/api/api/views.py
@@ -303,6 +303,7 @@ def prepare_documents(request):
                 project.save()
 
     except Exception as e:
+        logger.warning('Error preparing documents for project %s', p_id, exc_info=e)
         stack = traceback.format_exc()
         return Response({'message': e.args[0] if len(e.args) > 0 else 'Internal Server Error',
                          'description': e.args[1] if len(e.args) > 1 else '',

From b9b3c3e25b875ea16dec7173c4db37c58c17d4e7 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 1 Jul 2025 20:57:57 +0100
Subject: [PATCH 25/33] Bump dependency version to latest (0.12.0)

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index 311d021e..bf538fba 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.11.2#subdirectory=medcat-v2
+medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.12.0#subdirectory=medcat-v2

From d394fb8133e3d533ed8c363320a6cc6c629614ae Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 2 Jul 2025 10:50:49 +0100
Subject: [PATCH 26/33] Centralising clearing CDB addons after explicit load

---
 webapp/api/api/admin/actions.py                |  2 ++
 .../api/migrations/0074_auto_20231211_1526.py  | 18 ++++++++++++++++++
 .../api/migrations/0090_merge_20250623_1330.py | 14 ++++++++++++++
 webapp/api/api/model_cache.py                  |  6 ++----
 webapp/api/api/utils.py                        |  7 +++++++
 5 files changed, 43 insertions(+), 4 deletions(-)
 create mode 100644 webapp/api/api/migrations/0074_auto_20231211_1526.py
 create mode 100644 webapp/api/api/migrations/0090_merge_20250623_1330.py

diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py
index 6123e9ef..e5f7afde 100644
--- a/webapp/api/api/admin/actions.py
+++ b/webapp/api/api/admin/actions.py
@@ -360,6 +360,7 @@ def dataset_document_counts(dataset):
 def _reset_cdb_filters(id):
     concept_db = ConceptDB.objects.get(id=id)
     cdb = CDB.load(concept_db.cdb_file.path)
+    # TODO: clear addons
     cdb.config.components.linking.filters = {'cuis': set()}
     cdb.save(concept_db.cdb_file.path)
 
@@ -368,6 +369,7 @@ def _reset_cdb_filters(id):
 def import_concepts_from_cdb(cdb_model_id: int):
     cdb_model = ConceptDB.objects.get(id=cdb_model_id)
     cdb = CDB.load(cdb_model.cdb_file.path)
+    # TODO: clear addons
     import_all_concepts(cdb, cdb_model)
 
 
diff --git a/webapp/api/api/migrations/0074_auto_20231211_1526.py b/webapp/api/api/migrations/0074_auto_20231211_1526.py
new file mode 100644
index 00000000..e910b17b
--- /dev/null
+++ b/webapp/api/api/migrations/0074_auto_20231211_1526.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.2.28 on 2023-12-11 15:26
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('api', '0073_auto_20231022_0028'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='projectmetrics',
+            name='projects',
+            field=models.ManyToManyField(blank=True, to='api.ProjectAnnotateEntities'),
+        ),
+    ]
diff --git a/webapp/api/api/migrations/0090_merge_20250623_1330.py b/webapp/api/api/migrations/0090_merge_20250623_1330.py
new file mode 100644
index 00000000..8c502e4a
--- /dev/null
+++ b/webapp/api/api/migrations/0090_merge_20250623_1330.py
@@ -0,0 +1,14 @@
+# Generated by Django 5.1.11 on 2025-06-23 13:30
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('api', '0074_auto_20231211_1526'),
+        ('api', '0089_projectannotateentities_deid_model_annotation_and_more'),
+    ]
+
+    operations = [
+    ]
diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index b52fa3de..e2526a87 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -12,6 +12,7 @@
 from medcat.utils.legacy.convert_cdb import get_cdb_from_old
 
 from api.models import ConceptDB
+from .utils import clear_cdb_cnf_addons
 
 """
 Module level caches for CDBs, Vocabs and CAT instances.
@@ -74,10 +75,7 @@ def get_medcat_from_cdb_vocab(project,
                                     'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                                     'MedCATTrainer Dev team if you believe this should work') from ke
                 raise
-            # NOTE: when loading a CDB separately, we don't necessarily want to
-            #       load / create addons like MetaCAT as well
-            logger.info('Clearing addons for CDB upon load: %s', cdb_id)
-            cdb.config.components.addons.clear()
+            clear_cdb_cnf_addons(cdb, cdb_id)
 
             custom_config = os.getenv("MEDCAT_CONFIG_FILE")
             if custom_config is not None and os.path.exists(custom_config):
diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py
index 5319be2f..6f619c67 100644
--- a/webapp/api/api/utils.py
+++ b/webapp/api/api/utils.py
@@ -9,6 +9,7 @@
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 from medcat.cat import CAT
+from medcat.cdb import CDB
 from medcat.components.ner.trf.deid import DeIdModel
 from medcat.tokenizing.tokens import UnregisteredDataPathException
 
@@ -130,6 +131,12 @@ def check_filters(cui, filters):
             #         logger.debug('Successfully saved %s', meta_anno_obj)
 
 
+def clear_cdb_cnf_addons(cdb: CDB, cdb_id: str):
+    # NOTE: when loading a CDB separately, we don't necessarily want to
+    #       load / create addons like MetaCAT as well
+    logger.info('Clearing addons for CDB upon load: %s', cdb_id)
+    cdb.config.components.addons.clear()
+
 
 def get_create_cdb_infos(cdb, concept, cui, cui_info_prop, code_prop, desc_prop, model_clazz):
     codes = [c[code_prop] for c in cdb.cui2info.get(cui, {}).get(cui_info_prop, []) if code_prop in c]

From 503b7c2637b8f083f89b20f8f6bd0da3b16228ba Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 2 Jul 2025 10:51:14 +0100
Subject: [PATCH 27/33] More specific import

---
 webapp/api/api/model_cache.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index e2526a87..5fdf37d7 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -12,7 +12,7 @@
 from medcat.utils.legacy.convert_cdb import get_cdb_from_old
 
 from api.models import ConceptDB
-from .utils import clear_cdb_cnf_addons
+from api.utils import clear_cdb_cnf_addons
 
 """
 Module level caches for CDBs, Vocabs and CAT instances.

From 34854466a90a822256b1e26858e117d6f2bc5f50 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 2 Jul 2025 10:57:32 +0100
Subject: [PATCH 28/33] Clear CDB config addons everywhere if/when applicable

---
 webapp/api/api/admin/actions.py | 5 +++--
 webapp/api/api/metrics.py       | 2 ++
 webapp/api/api/model_cache.py   | 1 +
 webapp/api/api/utils.py         | 2 +-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py
index e5f7afde..1ec12d16 100644
--- a/webapp/api/api/admin/actions.py
+++ b/webapp/api/api/admin/actions.py
@@ -12,6 +12,7 @@
 
 from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB
 from api.solr_utils import drop_collection, import_all_concepts
+from api.utils import clear_cdb_cnf_addons
 
 from medcat.cdb import CDB
 
@@ -360,7 +361,7 @@ def dataset_document_counts(dataset):
 def _reset_cdb_filters(id):
     concept_db = ConceptDB.objects.get(id=id)
     cdb = CDB.load(concept_db.cdb_file.path)
-    # TODO: clear addons
+    clear_cdb_cnf_addons(cdb, id)
     cdb.config.components.linking.filters = {'cuis': set()}
     cdb.save(concept_db.cdb_file.path)
 
@@ -369,7 +370,7 @@ def _reset_cdb_filters(id):
 def import_concepts_from_cdb(cdb_model_id: int):
     cdb_model = ConceptDB.objects.get(id=cdb_model_id)
     cdb = CDB.load(cdb_model.cdb_file.path)
-    # TODO: clear addons
+    clear_cdb_cnf_addons(cdb, cdb_model_id)
     import_all_concepts(cdb, cdb_model)
 
 
diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py
index 37a3c4f4..bcb8116f 100644
--- a/webapp/api/api/metrics.py
+++ b/webapp/api/api/metrics.py
@@ -26,6 +26,7 @@
 
 from api.admin import retrieve_project_data
 from api.models import AnnotatedEntity, ProjectAnnotateEntities, ProjectMetrics as AppProjectMetrics
+from api.utils import clear_cdb_cnf_addons
 from core.settings import MEDIA_ROOT
 
 _dt_fmt = '%Y-%m-%d %H:%M:%S.%f'
@@ -50,6 +51,7 @@ def calculate_metrics(project_ids: List[int], report_name: str):
     else:
         # assume the cdb / vocab is set in these projects
         cdb = CDB.load(projects[0].concept_db.cdb_file.path)
+        clear_cdb_cnf_addons(cdb, projects[0].concept_db.name)
         vocab = Vocab.load(projects[0].vocab.vocab_file.path)
         cat = CAT(cdb, vocab, config=cdb.config)
     project_data = retrieve_project_data(projects)
diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index 5fdf37d7..ad010641 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -204,6 +204,7 @@ def get_cached_cdb(cdb_id: str, cdb_map: Dict[str, CDB]=CDB_MAP) -> CDB:
     if cdb_id not in cdb_map:
         cdb_obj = ConceptDB.objects.get(id=cdb_id)
         cdb = CDB.load(cdb_obj.cdb_file.path)
+        clear_cdb_cnf_addons(cdb, cdb_id)
         cdb_map[cdb_id] = cdb
     return cdb_map[cdb_id]
 
diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py
index 6f619c67..483370d4 100644
--- a/webapp/api/api/utils.py
+++ b/webapp/api/api/utils.py
@@ -131,7 +131,7 @@ def check_filters(cui, filters):
             #         logger.debug('Successfully saved %s', meta_anno_obj)
 
 
-def clear_cdb_cnf_addons(cdb: CDB, cdb_id: str):
+def clear_cdb_cnf_addons(cdb: CDB, cdb_id: str | int):
     # NOTE: when loading a CDB separately, we don't necessarily want to
     #       load / create addons like MetaCAT as well
     logger.info('Clearing addons for CDB upon load: %s', cdb_id)

From 0ddfb61f5cf64804bf16ac8f3692e8d83ed1cf1a Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 2 Jul 2025 11:05:10 +0100
Subject: [PATCH 29/33] Avoid circular imports by importing dynamically

---
 webapp/api/api/model_cache.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py
index ad010641..e2104851 100644
--- a/webapp/api/api/model_cache.py
+++ b/webapp/api/api/model_cache.py
@@ -12,7 +12,6 @@
 from medcat.utils.legacy.convert_cdb import get_cdb_from_old
 
 from api.models import ConceptDB
-from api.utils import clear_cdb_cnf_addons
 
 """
 Module level caches for CDBs, Vocabs and CAT instances.
@@ -75,6 +74,8 @@ def get_medcat_from_cdb_vocab(project,
                                     'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                                     'MedCATTrainer Dev team if you believe this should work') from ke
                 raise
+            # NOTE: dynamic import to avoid circular imports
+            from api.utils import clear_cdb_cnf_addons
             clear_cdb_cnf_addons(cdb, cdb_id)
 
             custom_config = os.getenv("MEDCAT_CONFIG_FILE")

From be70c2cabd2ab15baaa0bab61786481b06bde8b5 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 2 Jul 2025 11:31:55 +0100
Subject: [PATCH 30/33] Correctly set CDB path within v2 model packs

---
 webapp/api/api/models.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/webapp/api/api/models.py b/webapp/api/api/models.py
index d2f1a210..82705af1 100644
--- a/webapp/api/api/models.py
+++ b/webapp/api/api/models.py
@@ -64,7 +64,12 @@ def save(self, *args, **kwargs):
             CAT.load_cdb(unpacked_model_pack_path)
             concept_db = ConceptDB()
             unpacked_file_name = self.model_pack.file.name.replace('.zip', '')
-            concept_db.cdb_file.name = os.path.join(unpacked_file_name, 'cdb.dat')
+            # cdb path for v2
+            cdb_path = os.path.join(unpacked_file_name, 'cdb')
+            if not os.path.exists(cdb_path):
+                # cdb path for v1
+                cdb_path = os.path.join(unpacked_file_name, 'cdb.dat')
+            concept_db.cdb_file.name = cdb_path
             concept_db.name = f'{self.name}_CDB'
             concept_db.save(skip_load=True)
             self.concept_db = concept_db

From 2ba64917ee0b56739c596804f8311faab565d017 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 5 Aug 2025 16:58:14 +0100
Subject: [PATCH 31/33] Update dependency to PyPI-based version

---
 webapp/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp/requirements.txt b/webapp/requirements.txt
index bf538fba..837a42d9 100644
--- a/webapp/requirements.txt
+++ b/webapp/requirements.txt
@@ -6,4 +6,4 @@ django-polymorphic==3.0.*
 djangorestframework==3.15.*
 django-background-tasks-updated==1.2.*
 openpyxl==3.1.2
-medcat[meta-cat,spacy,rel-cat,deid] @ git+https://github.com/CogStack/cogstack-nlp.git@refs/tags/medcat/v0.12.0#subdirectory=medcat-v2
+medcat[meta-cat,spacy,rel-cat,deid]~=2.0.0b4

From b82a1e36db502cf38048ee4c26d066c6c3859e39 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 6 Aug 2025 13:49:54 +0100
Subject: [PATCH 32/33] Update (very old) notebook to v2

---
 notebook_docs/Train_MedCAT_Models.ipynb | 36 +++++++++++++------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/notebook_docs/Train_MedCAT_Models.ipynb b/notebook_docs/Train_MedCAT_Models.ipynb
index 3360be55..5a263bf6 100644
--- a/notebook_docs/Train_MedCAT_Models.ipynb
+++ b/notebook_docs/Train_MedCAT_Models.ipynb
@@ -186,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T11:27:34.270631Z",
@@ -195,9 +195,11 @@
    },
    "outputs": [],
    "source": [
+    "import json\n",
+    "\n",
     "from medcat.cat import CAT\n",
     "from medcat.cdb import CDB\n",
-    "from medcat.utils.vocab import Vocab"
+    "from medcat.vocab import Vocab"
    ]
   },
   {
@@ -310,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T11:27:59.782731Z",
@@ -319,16 +321,14 @@
    },
    "outputs": [],
    "source": [
-    "cdb = CDB()\n",
-    "cdb.load_dict(cdb_path)\n",
-    "vocab = Vocab()\n",
-    "vocab.load_dict(vocab_path)\n",
+    "cdb = CDB.load(cdb_path)\n",
+    "vocab = Vocab.load(vocab_path)\n",
     "cat = CAT(cdb, vocab)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T11:37:38.546552Z",
@@ -1383,12 +1383,13 @@
     }
    ],
    "source": [
-    "cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\",\n",
-    "                     nepochs=1,\n",
-    "                     lr=0.1,\n",
-    "                     anneal=False, # Unless we are reseting the CDB or cui_count this is False\n",
-    "                     print_stats=True,\n",
-    "                     use_filters=True)"
+    "with open(\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\") as f:\n",
+    "    data = json.load(f)\n",
+    "cat.trainer.train_supervised_raw(\n",
+    "    data=data,\n",
+    "    nepochs=1,\n",
+    "    print_stats=True,\n",
+    "    use_filters=True)"
    ]
   },
   {
@@ -1402,7 +1403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T15:04:02.394607Z",
@@ -1411,14 +1412,14 @@
    },
    "outputs": [],
    "source": [
-    "from medcat.meta_cat import MetaCAT\n",
+    "from medcat.components.addons.meta_cat import MetaCAT\n",
     "from tokenizers import ByteLevelBPETokenizer\n",
     "from itertools import chain"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T14:46:39.070589Z",
@@ -1427,6 +1428,7 @@
    },
    "outputs": [],
    "source": [
+    "import numpy as np\n",
     "# Tokenizer instantiation\n",
     "tokenizer = ByteLevelBPETokenizer(vocab_file='data/medmen-vocab.json', merges_file='data/medmen-merges.txt')\n",
     "embeddings = np.load(open('data/embeddings.npy', 'rb'))"

From 1a4a18dc41030cfbfa22384b70e99513e7c71d8d Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 6 Aug 2025 13:50:58 +0100
Subject: [PATCH 33/33] Update (very old) notebook for v2 installation

---
 notebook_docs/Train_MedCAT_Models.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebook_docs/Train_MedCAT_Models.ipynb b/notebook_docs/Train_MedCAT_Models.ipynb
index 5a263bf6..3b3e74cb 100644
--- a/notebook_docs/Train_MedCAT_Models.ipynb
+++ b/notebook_docs/Train_MedCAT_Models.ipynb
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -177,7 +177,7 @@
    ],
    "source": [
     "# install medcat\n",
-    "!pip install medcat\n",
+    "!pip install \"medcat[spacy,meta-cat,rel-cat,deid]>=2.0.0\"\n",
     "# scispacy medium models\n",
     "!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz\n",
     "# ipywidgets\n",