Merged
28 commits
7743236
interim changes for medcat-v2
Jun 12, 2025
29f3b76
updated TODOs
Jun 19, 2025
954608f
Update dependency to medcat v2
mart-r Jun 23, 2025
d143c9b
Update CDB/Vocab load to use the load classmethod again
mart-r Jun 23, 2025
9db1120
Move away from pkg_resources (deprecated)
mart-r Jun 25, 2025
530fbb0
Use v2 based API for loading addons (MetaCATs)
mart-r Jun 26, 2025
bc51626
Update MetaCAT loading
mart-r Jun 26, 2025
e77c14d
Update metrics to v2 format
mart-r Jun 27, 2025
570c14b
Do config parsing locally
mart-r Jun 27, 2025
13cf62a
Update to correct attribute name
mart-r Jun 27, 2025
1ca0cfa
Update solr utils to v2
mart-r Jun 30, 2025
fb20b87
Fix config access for v2
mart-r Jun 30, 2025
a26fad0
Remove addons from CDB config upon load
mart-r Jul 1, 2025
4cd8d62
Fix syntax error
mart-r Jul 1, 2025
ebe8dc2
Update Meta Annotation getting so as to avoid error if none set
mart-r Jul 1, 2025
4b40c76
Fix entity CUI / start/end char access
mart-r Jul 1, 2025
775261c
Fix some more entity detail access
mart-r Jul 1, 2025
181d668
Remove unigram table error (irrelevant / redundant)
mart-r Jul 1, 2025
bc0605d
Log more info regarding failure upon document preparation
mart-r Jul 1, 2025
3015c1e
Centralising clearing CDB addons after explicit load
mart-r Jul 2, 2025
5085bde
More specific import
mart-r Jul 2, 2025
e5b1a88
Clear CDB config addons everywhere if/when applicable
mart-r Jul 2, 2025
05ad081
Avoid circular imports by importing dynamically
mart-r Jul 2, 2025
9ea7a46
Correctly set CDB path within v2 model packs
mart-r Jul 2, 2025
6587f43
Update (very old) notebook to v2
mart-r Aug 6, 2025
d5b1861
Update (very old) notebook for v2 installation
mart-r Aug 6, 2025
c857e5c
CU-869aknppd: medcattrainer: upgrade dep
Sep 24, 2025
521aa7d
Merge branch 'main' into medcat-v2-testing
tomolopolis Sep 29, 2025
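The 28 commits above migrate MedCATtrainer from the medcat v1 API to v2: CDB and Vocab objects are restored via their `load` classmethods, MetaCAT models become addons loaded through the v2 API, config access moves from dict-style keys to attributes, and the deprecated `pkg_resources` usage is dropped. As orientation before the file diffs, here is a minimal sketch of the v2 loading pattern the changes converge on (the model paths are placeholders, not files from this repository):

```python
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.vocab import Vocab

# v2: concept database and vocabulary are restored with `load` classmethods
cdb = CDB.load("models/cdb.dat")        # placeholder path
vocab = Vocab.load("models/vocab.dat")  # placeholder path

# CAT is built from the loaded components, reusing the CDB's own config
cat = CAT(cdb, vocab, config=cdb.config)
```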
34 changes: 17 additions & 17 deletions medcat-trainer/notebook_docs/API_Examples.ipynb
@@ -200,11 +200,11 @@
"for name, d_s in datasets:\n",
" payload = {\n",
" 'dataset_name': name, # Name that appears in each\n",
" 'dataset': d_s.loc[:, ['name', 'text']].to_dict(), # Dictionary representation of only \n",
" 'dataset': d_s.loc[:, ['name', 'text']].to_dict(), # Dictionary representation of only\n",
" 'description': f'{name} first 20 notes from each category' # Description that appears in the trainer\n",
" }\n",
" resp = requests.post(f'{URL}/api/create-dataset/', json=payload, headers=headers)\n",
" dataset_ids.append(json.loads(resp.text)['dataset_id']) \n",
" dataset_ids.append(json.loads(resp.text)['dataset_id'])\n",
"# New datasets created in the trainer have the following IDs\n",
"dataset_ids"
]
@@ -262,7 +262,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"tags": []
},
@@ -273,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {
"tags": []
},
@@ -290,7 +290,7 @@
}
],
"source": [
"CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
"cdb = CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
]
},
{
@@ -301,8 +301,8 @@
},
"outputs": [],
"source": [
"txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
" data={'name': 'example_cdb', 'use_for_training': True}, \n",
"txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
" data={'name': 'example_cdb', 'use_for_training': True},\n",
" files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
]
},
@@ -342,8 +342,8 @@
},
"outputs": [],
"source": [
"txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers, \n",
" data={'name': 'example_cdb-EDITED', 'use_for_training': True}, \n",
"txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers,\n",
" data={'name': 'example_cdb-EDITED', 'use_for_training': True},\n",
" files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
]
},
@@ -379,8 +379,8 @@
}
],
"source": [
"requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
" data={'name': 'example_cdb', 'use_for_training': True}, \n",
"requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
" data={'name': 'example_cdb', 'use_for_training': True},\n",
" files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
]
},
@@ -404,7 +404,7 @@
"metadata": {},
"outputs": [],
"source": [
"txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers, \n",
"txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers,\n",
" files={'cdb_file': open('<<LOCATION OF vocab>>', 'rb')}).text)"
]
},
@@ -465,7 +465,7 @@
"all_cdbs = json.loads(requests.get(f'{URL}/api/concept-dbs/', headers=headers).text)['results']\n",
"# the CDB ID we'll use for this example\n",
"cdb_to_use = all_cdbs[0]['id']\n",
"# you might have many CDBs here. First 2 cdbs: \n",
"# you might have many CDBs here. First 2 cdbs:\n",
"all_cdbs[0:2]"
]
},
@@ -521,12 +521,12 @@
"for d_id, p_name in zip(dataset_ids, project_names):\n",
" payload = {\n",
" 'name': f'{p_name} Annotation Project',\n",
" 'description': 'Example projects', \n",
" 'cuis': '', \n",
" 'description': 'Example projects',\n",
" 'cuis': '',\n",
" 'tuis': '',\n",
" 'dataset': d_id,\n",
" 'concept_db': cdb_to_use, \n",
" 'vocab': vocab_to_use, \n",
" 'concept_db': cdb_to_use,\n",
" 'vocab': vocab_to_use,\n",
" 'members': users_ids\n",
" }\n",
" project_ids.append(json.loads(requests.post(f'{URL}/api/project-annotate-entities/', json=payload, headers=headers).text))"
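The edits to `API_Examples.ipynb` above are mostly trailing-whitespace clean-ups, reset `execution_count` fields, and a fix that assigns the loaded CDB to a variable instead of discarding the result. For context, a condensed sketch of the upload flow this notebook demonstrates, assuming `URL` and `headers` are configured as in the notebook's earlier login step (the URL, auth header format, and response handling here are assumptions, not taken from the diff):

```python
import json
import requests

URL = "http://localhost:8001"                     # assumed trainer URL
headers = {"Authorization": "Token <api-token>"}  # assumed auth header

# Upload a concept database file so it can be used for training projects
with open("models/cdb.dat", "rb") as cdb_f:       # placeholder path
    resp = requests.post(f"{URL}/api/concept-dbs/", headers=headers,
                         data={"name": "example_cdb", "use_for_training": True},
                         files={"cdb_file": cdb_f})
new_cdb = json.loads(resp.text)  # created object; exact fields depend on the trainer version
```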
44 changes: 23 additions & 21 deletions medcat-trainer/notebook_docs/Train_MedCAT_Models.ipynb
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -177,7 +177,7 @@
],
"source": [
"# install medcat\n",
"!pip install medcat\n",
"!pip install \"medcat[spacy,meta-cat,rel-cat,deid]>=2.0.0\"\n",
"# scispacy medium models\n",
"!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz\n",
"# ipywidgets\n",
@@ -186,7 +186,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T11:27:34.270631Z",
@@ -195,9 +195,11 @@
},
"outputs": [],
"source": [
"import json\n",
"\n",
"from medcat.cat import CAT\n",
"from medcat.cdb import CDB\n",
"from medcat.utils.vocab import Vocab"
"from medcat.vocab import Vocab"
]
},
{
@@ -310,7 +312,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T11:27:59.782731Z",
@@ -319,16 +321,14 @@
},
"outputs": [],
"source": [
"cdb = CDB()\n",
"cdb.load_dict(cdb_path)\n",
"vocab = Vocab()\n",
"vocab.load_dict(vocab_path)\n",
"cdb = CDB.load(cdb_path)\n",
"vocab = Vocab.load(vocab_path)\n",
"cat = CAT(cdb, vocab)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T11:37:38.546552Z",
@@ -382,7 +382,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"\r\n",
"Epoch: 0, Prec: 0.36538461538461536, Rec: 0.8444444444444444, F1: 0.6049145299145299\n",
"\n",
"Docs with false positives: Psych Text 1; Psych Text 2\n",
@@ -1383,12 +1383,13 @@
}
],
"source": [
"cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\", \n",
" nepochs=1,\n",
" lr=0.1,\n",
" anneal=False, # Unless we are reseting the CDB or cui_count this is False\n",
" print_stats=True, \n",
" use_filters=True)"
"with open(\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\") as f:\n",
" data = json.load(f)\n",
"cat.trainer.train_supervised_raw(\n",
" data=data,\n",
" nepochs=1,\n",
" print_stats=True,\n",
" use_filters=True)"
]
},
{
Expand All @@ -1402,7 +1403,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T15:04:02.394607Z",
@@ -1411,14 +1412,14 @@
},
"outputs": [],
"source": [
"from medcat.meta_cat import MetaCAT\n",
"from medcat.components.addons.meta_cat import MetaCAT\n",
"from tokenizers import ByteLevelBPETokenizer\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T14:46:39.070589Z",
@@ -1427,6 +1428,7 @@
},
"outputs": [],
"source": [
"import numpy as np\n",
"# Tokenizer instantiation\n",
"tokenizer = ByteLevelBPETokenizer(vocab_file='data/medmen-vocab.json', merges_file='data/medmen-merges.txt')\n",
"embeddings = np.load(open('data/embeddings.npy', 'rb'))"
@@ -1443,7 +1445,7 @@
},
"outputs": [],
"source": [
"metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings, \n",
"metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings,\n",
" pad_id=len(embeddings) -1, save_dir='mc_status', device='cpu')"
]
},
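The `Train_MedCAT_Models.ipynb` changes pin the v2 install with extras, switch to the v2 imports (`medcat.vocab.Vocab`, `medcat.components.addons.meta_cat.MetaCAT`), load the CDB/Vocab via `load`, and replace `cat.train_supervised(data_path=...)` with the v2 trainer call that takes already-parsed export data. A condensed before/after sketch of the supervised-training step, assuming the same export file as the notebook and placeholder model paths:

```python
import json

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.vocab import Vocab

cdb = CDB.load("models/cdb.dat")        # placeholder paths
vocab = Vocab.load("models/vocab.dat")
cat = CAT(cdb, vocab)

# v1 (removed): cat.train_supervised(data_path="export.json", nepochs=1, ...)
# v2 (added): parse the MedCATtrainer export yourself, then hand the dict to the trainer
with open("example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json") as f:
    data = json.load(f)

cat.trainer.train_supervised_raw(
    data=data,
    nepochs=1,
    print_stats=True,
    use_filters=True)
```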
11 changes: 6 additions & 5 deletions medcat-trainer/webapp/api/api/admin/actions.py
@@ -12,6 +12,9 @@

from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB
from api.solr_utils import drop_collection, import_all_concepts
from api.utils import clear_cdb_cnf_addons

from medcat.cdb import CDB

logger = logging.getLogger(__name__)

@@ -356,20 +359,18 @@ def dataset_document_counts(dataset):

@background(schedule=5)
def _reset_cdb_filters(id):
from medcat.cdb import CDB
concept_db = ConceptDB.objects.get(id=id)
cdb = CDB.load(concept_db.cdb_file.path)
cdb.config.linking['filters'] = {'cuis': set()}
clear_cdb_cnf_addons(cdb, id)
cdb.config.components.linking.filters = {'cuis': set()}
cdb.save(concept_db.cdb_file.path)


@background(schedule=5)
def import_concepts_from_cdb(cdb_model_id: int):
from medcat.cdb import CDB

cdb_model = ConceptDB.objects.get(id=cdb_model_id)
cdb = CDB.load(cdb_model.cdb_file.path)

clear_cdb_cnf_addons(cdb, cdb_model_id)
import_all_concepts(cdb, cdb_model)


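In `actions.py`, the `CDB` import moves to module level and both background tasks now call the in-repo `clear_cdb_cnf_addons` helper after loading a CDB, in line with the "Remove addons from CDB config upon load" commits. The filter reset also switches to v2 attribute-based config access; a minimal sketch mirroring the diff (the path is a placeholder):

```python
from medcat.cdb import CDB

cdb = CDB.load("models/cdb.dat")  # placeholder path

# v1 (removed): cdb.config.linking['filters'] = {'cuis': set()}
# v2 (added): config sections are attribute-based and grouped under `components`
cdb.config.components.linking.filters = {'cuis': set()}
cdb.save("models/cdb.dat")
```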
46 changes: 29 additions & 17 deletions medcat-trainer/webapp/api/api/metrics.py
@@ -13,18 +13,20 @@
from background_task.models import Task
from django.contrib.auth.models import User
from django.db.models import QuerySet
from medcat.stats.stats import get_stats
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.meta_cat import MetaCAT
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase
from medcat.utils.meta_cat.data_utils import prepare_from_json, encode_category_values
from medcat.utils.meta_cat.ml_utils import create_batch_piped_data
from medcat.config.config_meta_cat import ConfigMetaCAT
from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon
from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase
from medcat.components.addons.meta_cat.data_utils import prepare_from_json, encode_category_values
from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data
from medcat.vocab import Vocab
from torch import nn

from api.admin import retrieve_project_data
from api.models import AnnotatedEntity, ProjectAnnotateEntities, ProjectMetrics as AppProjectMetrics
from api.utils import clear_cdb_cnf_addons
from core.settings import MEDIA_ROOT

_dt_fmt = '%Y-%m-%d %H:%M:%S.%f'
Expand All @@ -51,6 +53,7 @@ def calculate_metrics(project_ids: List[int], report_name: str):
else:
# assume the cdb / vocab is set in these projects
cdb = CDB.load(projects[0].concept_db.cdb_file.path)
clear_cdb_cnf_addons(cdb, projects[0].concept_db.name)
vocab = Vocab.load(projects[0].vocab.vocab_file.path)
cat = CAT(cdb, vocab, config=cdb.config)
project_data = retrieve_project_data(projects)
@@ -116,7 +119,7 @@ def annotation_df(self):
"""
annotation_df = pd.DataFrame(self.annotations)
if self.cat:
annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.cui2preferred_name))
annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.get_name))
annotation_df['last_modified'] = pd.to_datetime(annotation_df['last_modified']).dt.tz_localize(None)
return annotation_df

@@ -138,9 +141,10 @@ def concept_summary(self, extra_cui_filter=None):
concept_count_df['count_variations_ratio'] = round(concept_count_df['concept_count'] /
concept_count_df['variations'], 3)
if self.cat:
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = self.cat._print_stats(data=self.mct_export,
use_project_filters=True,
extra_cui_filter=extra_cui_filter)
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = get_stats(self.cat,
data=self.mct_export,
use_project_filters=True,
extra_cui_filter=extra_cui_filter)
# remap tps, fns, fps to specific user annotations
examples = self.enrich_medcat_metrics(examples)
concept_count_df['fps'] = concept_count_df['cui'].map(fps)
@@ -242,11 +246,11 @@ def rename_meta_anns(self, meta_anns2rename=dict(), meta_ann_values2rename=dict(
return

def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict:
device = torch.device(config.general['device']) # Create a torch device
batch_size_eval = config.general['batch_size_eval']
pad_id = config.model['padding_idx']
ignore_cpos = config.model['ignore_cpos']
class_weights = config.train['class_weights']
device = torch.device(config.general.device) # Create a torch device
batch_size_eval = config.general.batch_size_eval
pad_id = config.model.padding_idx
ignore_cpos = config.model.ignore_cpos
class_weights = config.train.class_weights

if class_weights is not None:
class_weights = torch.FloatTensor(class_weights).to(device)
@@ -323,9 +327,17 @@ def full_annotation_df(self) -> pd.DataFrame:
~anns_df['killed'] & ~anns_df['irrelevant']]
meta_df = meta_df.reset_index(drop=True)

for meta_model in self.cat._meta_cats:
logger.info(f'Checking metacat model: {meta_model}')
meta_model_task = meta_model.name
all_meta_cats = self.cat.get_addons_of_type(MetaCATAddon)

for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']:
meta_model_task = meta_model_card['Category Name']
logger.info(f'Checking metacat model: {meta_model_task}')
_meta_models = [mc for mc in all_meta_cats
if mc.config.general.category_name == meta_model_task]
if not _meta_models:
logger.warning(f'MetaCAT model {meta_model_task} not found in the CAT instance.')
continue
meta_model = _meta_models[0]
meta_results = self._eval(meta_model, self.mct_export)
meta_values = {v: k for k, v in meta_results['meta_values'].items()}
pred_meta_values = []
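The `metrics.py` changes swap the v1 MetaCAT and meta-annotation modules for their v2 `medcat.components.addons.meta_cat` counterparts, use `get_stats` from `medcat.stats.stats` instead of the private `CAT._print_stats`, read config values as attributes, and look MetaCAT models up as addons rather than via `cat._meta_cats`. A condensed sketch of the new addon lookup, mirroring the added code (it assumes `cat` is an already-built v2 `CAT` instance with MetaCAT addons loaded):

```python
from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon

# assumes `cat` is a medcat v2 CAT instance with MetaCAT addons loaded
all_meta_cats = cat.get_addons_of_type(MetaCATAddon)

for card in cat.get_model_card(as_dict=True)['MetaCAT models']:
    task = card['Category Name']
    matches = [mc for mc in all_meta_cats
               if mc.config.general.category_name == task]
    if not matches:
        continue  # model card entry with no matching loaded addon
    meta_model = matches[0]
    # ... evaluate `meta_model` against the export, as metrics.py does
```

The final hunk below is a new Django migration that makes the `projects` field on `ProjectMetrics` optional (`blank=True`).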
@@ -0,0 +1,18 @@
# Generated by Django 2.2.28 on 2023-12-11 15:26

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('api', '0073_auto_20231022_0028'),
]

operations = [
migrations.AlterField(
model_name='projectmetrics',
name='projects',
field=models.ManyToManyField(blank=True, to='api.ProjectAnnotateEntities'),
),
]