CogStack · mart-r · Jun 12, 2025 · Jun 19, 2025 · Jun 23, 2025 · Jun 23, 2025
diff --git a/notebook_docs/API_Examples.ipynb b/notebook_docs/API_Examples.ipynb
@@ -200,11 +200,11 @@
     "for name, d_s in datasets:\n",
     "    payload = {\n",
     "        'dataset_name': name,   # Name that appears in each\n",
-    "        'dataset': d_s.loc[:, ['name', 'text']].to_dict(),  #  Dictionary representation of only  \n",
+    "        'dataset': d_s.loc[:, ['name', 'text']].to_dict(),  #  Dictionary representation of only the 'name' and 'text' columns\n",
     "        'description': f'{name} first 20 notes from each category' # Description that appears in the trainer\n",
     "    }\n",
     "    resp = requests.post(f'{URL}/api/create-dataset/', json=payload, headers=headers)\n",
-    "    dataset_ids.append(json.loads(resp.text)['dataset_id']) \n",
+    "    dataset_ids.append(json.loads(resp.text)['dataset_id'])\n",
     "# New datasets created in the trainer have the following IDs\n",
     "dataset_ids"
    ]
@@ -262,7 +262,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {
     "tags": []
    },
@@ -273,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {
     "tags": []
    },
@@ -290,7 +290,7 @@
     }
    ],
    "source": [
-    "CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
+    "cdb = CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
    ]
   },
   {
@@ -301,8 +301,8 @@
    },
    "outputs": [],
    "source": [
-    "txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
-    "                               data={'name': 'example_cdb', 'use_for_training': True}, \n",
+    "txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
+    "                               data={'name': 'example_cdb', 'use_for_training': True},\n",
     "                               files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
    ]
   },
@@ -342,8 +342,8 @@
    },
    "outputs": [],
    "source": [
-    "txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers, \n",
-    "                               data={'name': 'example_cdb-EDITED', 'use_for_training': True}, \n",
+    "txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers,\n",
+    "                               data={'name': 'example_cdb-EDITED', 'use_for_training': True},\n",
     "                               files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
    ]
   },
@@ -379,8 +379,8 @@
     }
    ],
    "source": [
-    "requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
-    "                               data={'name': 'example_cdb', 'use_for_training': True}, \n",
+    "requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
+    "                               data={'name': 'example_cdb', 'use_for_training': True},\n",
     "                               files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
    ]
   },
@@ -404,7 +404,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers, \n",
+    "txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers,\n",
     "                               files={'cdb_file': open('<<LOCATION OF vocab>>', 'rb')}).text)"
    ]
   },
@@ -465,7 +465,7 @@
     "all_cdbs = json.loads(requests.get(f'{URL}/api/concept-dbs/', headers=headers).text)['results']\n",
     "# the CDB ID we'll use for this example\n",
     "cdb_to_use = all_cdbs[0]['id']\n",
-    "# you might have many CDBs here. First 2 cdbs: \n",
+    "# you might have many CDBs here. First 2 cdbs:\n",
     "all_cdbs[0:2]"
    ]
   },
@@ -521,12 +521,12 @@
     "for d_id, p_name in zip(dataset_ids, project_names):\n",
     "    payload = {\n",
     "        'name': f'{p_name} Annotation Project',\n",
-    "        'description': 'Example projects', \n",
-    "        'cuis': '', \n",
+    "        'description': 'Example projects',\n",
+    "        'cuis': '',\n",
     "        'tuis': '',\n",
     "        'dataset': d_id,\n",
-    "        'concept_db': cdb_to_use, \n",
-    "        'vocab': vocab_to_use, \n",
+    "        'concept_db': cdb_to_use,\n",
+    "        'vocab': vocab_to_use,\n",
     "        'members': users_ids\n",
     "    }\n",
     "    project_ids.append(json.loads(requests.post(f'{URL}/api/project-annotate-entities/', json=payload, headers=headers).text))"

diff --git a/notebook_docs/Train_MedCAT_Models.ipynb b/notebook_docs/Train_MedCAT_Models.ipynb
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -177,7 +177,7 @@
    ],
    "source": [
     "# install medcat\n",
-    "!pip install medcat\n",
+    "!pip install \"medcat[spacy,meta-cat,rel-cat,deid]>=2.0.0\"\n",
     "# scispacy medium models\n",
     "!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz\n",
     "# ipywidgets\n",
@@ -186,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T11:27:34.270631Z",
@@ -195,9 +195,11 @@
    },
    "outputs": [],
    "source": [
+    "import json\n",
+    "\n",
     "from medcat.cat import CAT\n",
     "from medcat.cdb import CDB\n",
-    "from medcat.utils.vocab import Vocab"
+    "from medcat.vocab import Vocab"
    ]
   },
   {
@@ -310,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T11:27:59.782731Z",
@@ -319,16 +321,14 @@
    },
    "outputs": [],
    "source": [
-    "cdb = CDB()\n",
-    "cdb.load_dict(cdb_path)\n",
-    "vocab = Vocab()\n",
-    "vocab.load_dict(vocab_path)\n",
+    "cdb = CDB.load(cdb_path)\n",
+    "vocab = Vocab.load(vocab_path)\n",
     "cat = CAT(cdb, vocab)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T11:37:38.546552Z",
@@ -382,7 +382,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\r",
+      "\r\n",
       "Epoch: 0, Prec: 0.36538461538461536, Rec: 0.8444444444444444, F1: 0.6049145299145299\n",
       "\n",
       "Docs with false positives: Psych Text 1; Psych Text 2\n",
@@ -1383,12 +1383,13 @@
     }
    ],
    "source": [
-    "cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\", \n",
-    "                     nepochs=1,\n",
-    "                     lr=0.1,\n",
-    "                     anneal=False, # Unless we are reseting the CDB or cui_count this is False\n",
-    "                     print_stats=True, \n",
-    "                     use_filters=True)"
+    "with open(\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\") as f:\n",
+    "    data = json.load(f)\n",
+    "cat.trainer.train_supervised_raw(\n",
+    "    data=data,\n",
+    "    nepochs=1,\n",
+    "    print_stats=True,\n",
+    "    use_filters=True)"
    ]
   },
   {
@@ -1402,7 +1403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T15:04:02.394607Z",
@@ -1411,14 +1412,14 @@
    },
    "outputs": [],
    "source": [
-    "from medcat.meta_cat import MetaCAT\n",
+    "from medcat.components.addons.meta_cat import MetaCAT\n",
     "from tokenizers import ByteLevelBPETokenizer\n",
     "from itertools import chain"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2020-09-08T14:46:39.070589Z",
@@ -1427,6 +1428,7 @@
    },
    "outputs": [],
    "source": [
+    "import numpy as np\n",
     "# Tokenizer instantiation\n",
     "tokenizer = ByteLevelBPETokenizer(vocab_file='data/medmen-vocab.json', merges_file='data/medmen-merges.txt')\n",
     "embeddings = np.load(open('data/embeddings.npy', 'rb'))"
@@ -1443,7 +1445,7 @@
    },
    "outputs": [],
    "source": [
-    "metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings, \n",
+    "metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings,\n",
     "             pad_id=len(embeddings) -1, save_dir='mc_status', device='cpu')"
    ]
   },

diff --git a/webapp/api/api/admin/actions.py b/webapp/api/api/admin/actions.py
@@ -12,6 +12,9 @@
 
 from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB
 from api.solr_utils import drop_collection, import_all_concepts
+from api.utils import clear_cdb_cnf_addons
+
+from medcat.cdb import CDB
 
 logger = logging.getLogger(__name__)
 
@@ -356,20 +359,18 @@ def dataset_document_counts(dataset):
 
 @background(schedule=5)
 def _reset_cdb_filters(id):
-    from medcat.cdb import CDB
     concept_db = ConceptDB.objects.get(id=id)
     cdb = CDB.load(concept_db.cdb_file.path)
-    cdb.config.linking['filters'] = {'cuis': set()}
+    clear_cdb_cnf_addons(cdb, id)  # NOTE(review): passes the ConceptDB id; metrics.py passes concept_db.name -- confirm what the helper expects
+    cdb.config.components.linking.filters = {'cuis': set()}  # reset the linking filter to an empty CUI set, then save to the same path
     cdb.save(concept_db.cdb_file.path)
 
 
 @background(schedule=5)
 def import_concepts_from_cdb(cdb_model_id: int):
-    from medcat.cdb import CDB
-
     cdb_model = ConceptDB.objects.get(id=cdb_model_id)
     cdb = CDB.load(cdb_model.cdb_file.path)
-
+    clear_cdb_cnf_addons(cdb, cdb_model_id)  # presumably strips addon config from the loaded CDB before import (helper from api.utils) -- verify
     import_all_concepts(cdb, cdb_model)
 
 

diff --git a/webapp/api/api/metrics.py b/webapp/api/api/metrics.py
@@ -13,18 +13,20 @@
 from background_task.models import Task
 from django.contrib.auth.models import User
 from django.db.models import QuerySet
+from medcat.stats.stats import get_stats
 from medcat.cat import CAT
 from medcat.cdb import CDB
-from medcat.config_meta_cat import ConfigMetaCAT
-from medcat.meta_cat import MetaCAT
-from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase
-from medcat.utils.meta_cat.data_utils import prepare_from_json, encode_category_values
-from medcat.utils.meta_cat.ml_utils import create_batch_piped_data
+from medcat.config.config_meta_cat import ConfigMetaCAT
+from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon
+from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase
+from medcat.components.addons.meta_cat.data_utils import prepare_from_json, encode_category_values
+from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data
 from medcat.vocab import Vocab
 from torch import nn
 
 from api.admin import retrieve_project_data
 from api.models import AnnotatedEntity, ProjectAnnotateEntities, ProjectMetrics as AppProjectMetrics
+from api.utils import clear_cdb_cnf_addons
 from core.settings import MEDIA_ROOT
 
 _dt_fmt = '%Y-%m-%d %H:%M:%S.%f'
@@ -51,6 +53,7 @@ def calculate_metrics(project_ids: List[int], report_name: str):
     else:
         # assume the cdb / vocab is set in these projects
         cdb = CDB.load(projects[0].concept_db.cdb_file.path)
+        clear_cdb_cnf_addons(cdb, projects[0].concept_db.name)
         vocab = Vocab.load(projects[0].vocab.vocab_file.path)
         cat = CAT(cdb, vocab, config=cdb.config)
     project_data = retrieve_project_data(projects)
@@ -116,7 +119,7 @@ def annotation_df(self):
         """
         annotation_df = pd.DataFrame(self.annotations)
         if self.cat:
-            annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.cui2preferred_name))
+            annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.get_name))
         annotation_df['last_modified'] = pd.to_datetime(annotation_df['last_modified']).dt.tz_localize(None)
         return annotation_df
 
@@ -138,9 +141,10 @@ def concept_summary(self, extra_cui_filter=None):
         concept_count_df['count_variations_ratio'] = round(concept_count_df['concept_count'] /
                                                            concept_count_df['variations'], 3)
         if self.cat:
-            fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = self.cat._print_stats(data=self.mct_export,
-                                                                                                   use_project_filters=True,
-                                                                                                   extra_cui_filter=extra_cui_filter)
+            fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = get_stats(self.cat,
+                                                                                       data=self.mct_export,
+                                                                                       use_project_filters=True,
+                                                                                       extra_cui_filter=extra_cui_filter)
             # remap tps, fns, fps to specific user annotations
             examples = self.enrich_medcat_metrics(examples)
             concept_count_df['fps'] = concept_count_df['cui'].map(fps)
@@ -242,11 +246,11 @@ def rename_meta_anns(self, meta_anns2rename=dict(), meta_ann_values2rename=dict(
         return
 
     def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict:
-        device = torch.device(config.general['device'])  # Create a torch device
-        batch_size_eval = config.general['batch_size_eval']
-        pad_id = config.model['padding_idx']
-        ignore_cpos = config.model['ignore_cpos']
-        class_weights = config.train['class_weights']
+        device = torch.device(config.general.device)  # Create a torch device
+        batch_size_eval = config.general.batch_size_eval
+        pad_id = config.model.padding_idx
+        ignore_cpos = config.model.ignore_cpos
+        class_weights = config.train.class_weights
 
         if class_weights is not None:
             class_weights = torch.FloatTensor(class_weights).to(device)
@@ -323,9 +327,17 @@ def full_annotation_df(self) -> pd.DataFrame:
                           ~anns_df['killed'] & ~anns_df['irrelevant']]
         meta_df = meta_df.reset_index(drop=True)
 
-        for meta_model in self.cat._meta_cats:
-            logger.info(f'Checking metacat model: {meta_model}')
-            meta_model_task = meta_model.name
+        all_meta_cats = self.cat.get_addons_of_type(MetaCATAddon)
+
+        for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']:
+            meta_model_task = meta_model_card['Category Name']
+            logger.info(f'Checking metacat model: {meta_model_task}')
+            _meta_models = [mc for mc in all_meta_cats
+                           if mc.config.general.category_name == meta_model_task]
+            if not _meta_models:
+                logger.warning(f'MetaCAT model {meta_model_task} not found in the CAT instance.')
+                continue
+            meta_model = _meta_models[0]
             meta_results = self._eval(meta_model, self.mct_export)
             meta_values = {v: k for k, v in meta_results['meta_values'].items()}
             pred_meta_values = []
@@ -408,7 +420,7 @@ def meta_anns_concept_summary(self) -> List[Dict]:
             # Store results for this concept
             meta_performance[cui] = {
                 'cui': cui,
-                'concept_name': self.cat.cdb.cui2preferred_name[cui],
+                'concept_name': self.cat.cdb.cui2info[cui]['preferred_name'],
                 'meta_tasks': meta_task_results
             }