Skip to content
This repository was archived by the owner on Sep 9, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
1edcdfb
interim changes for medcat-v2
Jun 12, 2025
63f51a4
updated TODOs
Jun 19, 2025
e10d594
Update dependency to medcat v2
mart-r Jun 23, 2025
eb676e7
Update CDB/Vocab load to use the load classmethod again
mart-r Jun 23, 2025
524a1cd
Update requirements - use latest (0.9.0) v2 beta
mart-r Jun 25, 2025
ad796cf
Move away from pkg_resources (deprecated)
mart-r Jun 25, 2025
e6097fe
Update install target to new version and monorepo install
mart-r Jun 26, 2025
27f08b6
Use v2 based API for loading addons (MetaCATs)
mart-r Jun 26, 2025
cdef1f2
Update MetaCAT loading
mart-r Jun 26, 2025
4c368ba
Update requirements to 0.11.0 of medcat v2 beta
mart-r Jun 27, 2025
232ec72
Update metrics to v2 format
mart-r Jun 27, 2025
c854ea7
Do config parsing locally
mart-r Jun 27, 2025
5ec59fc
Bump version to latest
mart-r Jun 27, 2025
3e6ea19
Bump version to latest
mart-r Jun 27, 2025
84422fe
Update to correct attribute name
mart-r Jun 27, 2025
3d94e55
Update solr utils to v2
mart-r Jun 30, 2025
bec4945
Fix config access for v2
mart-r Jun 30, 2025
239806b
Remove addons from CDB config upon load
mart-r Jul 1, 2025
c40e17b
Fix syntax error
mart-r Jul 1, 2025
98efed8
Update Meta Annotation getting so as to avoid error if none set
mart-r Jul 1, 2025
6e2c1c0
Fix entity CUI / start/end char access
mart-r Jul 1, 2025
bac5294
Fix some more entity detail access
mart-r Jul 1, 2025
3ef9805
Remove unigram table error (irrelevant / redundant)
mart-r Jul 1, 2025
c60692d
Log more info regarding failure upon document preparation
mart-r Jul 1, 2025
b9b3c3e
Bump dependency version to latest (0.12.0)
mart-r Jul 1, 2025
d394fb8
Centralising clearing CDB addons after explicit load
mart-r Jul 2, 2025
503b7c2
More specific import
mart-r Jul 2, 2025
3485446
Clear CDB config addons everywhere if/when applicable
mart-r Jul 2, 2025
0ddfb61
Avoid circular imports by importing dynamically
mart-r Jul 2, 2025
be70c2c
Correctly set CDB path within v2 model packs
mart-r Jul 2, 2025
694310a
Merge branch 'main' into medcat-v2-testing
mart-r Jul 2, 2025
2ba6491
Update dependency to PyPI-based version
mart-r Aug 5, 2025
508738e
Merge branch 'main' into medcat-v2-testing
mart-r Aug 6, 2025
b82a1e3
Update (very old) notebook to v2
mart-r Aug 6, 2025
1a4a18d
Update (very old) notebook for v2 installation
mart-r Aug 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions notebook_docs/API_Examples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,11 @@
"for name, d_s in datasets:\n",
" payload = {\n",
" 'dataset_name': name, # Name that appears in each\n",
" 'dataset': d_s.loc[:, ['name', 'text']].to_dict(), # Dictionary representation of only \n",
" 'dataset': d_s.loc[:, ['name', 'text']].to_dict(), # Dictionary representation of only\n",
" 'description': f'{name} first 20 notes from each category' # Description that appears in the trainer\n",
" }\n",
" resp = requests.post(f'{URL}/api/create-dataset/', json=payload, headers=headers)\n",
" dataset_ids.append(json.loads(resp.text)['dataset_id']) \n",
" dataset_ids.append(json.loads(resp.text)['dataset_id'])\n",
"# New datasets created in the trainer have the following IDs\n",
"dataset_ids"
]
Expand Down Expand Up @@ -262,7 +262,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"tags": []
},
Expand All @@ -273,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {
"tags": []
},
Expand All @@ -290,7 +290,7 @@
}
],
"source": [
"CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
"cdb = CDB.load('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat')"
]
},
{
Expand All @@ -301,8 +301,8 @@
},
"outputs": [],
"source": [
"txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
" data={'name': 'example_cdb', 'use_for_training': True}, \n",
"txt = json.loads(requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
" data={'name': 'example_cdb', 'use_for_training': True},\n",
" files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
]
},
Expand Down Expand Up @@ -342,8 +342,8 @@
},
"outputs": [],
"source": [
"txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers, \n",
" data={'name': 'example_cdb-EDITED', 'use_for_training': True}, \n",
"txt = json.loads(requests.put(f'{URL}/api/concept-dbs/21/', headers=headers,\n",
" data={'name': 'example_cdb-EDITED', 'use_for_training': True},\n",
" files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
]
},
Expand Down Expand Up @@ -379,8 +379,8 @@
}
],
"source": [
"requests.post(f'{URL}/api/concept-dbs/', headers=headers, \n",
" data={'name': 'example_cdb', 'use_for_training': True}, \n",
"requests.post(f'{URL}/api/concept-dbs/', headers=headers,\n",
" data={'name': 'example_cdb', 'use_for_training': True},\n",
" files={'cdb_file': open('../../medcat-models/deid_medcat_n2c2_modelpack/cdb.dat', 'rb')}).text)"
]
},
Expand All @@ -404,7 +404,7 @@
"metadata": {},
"outputs": [],
"source": [
"txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers, \n",
"txt = json.loads(requests.post(f'{URL}/api/vocab/', headers=headers,\n",
" files={'cdb_file': open('<<LOCATION OF vocab>>', 'rb')}).text)"
]
},
Expand Down Expand Up @@ -465,7 +465,7 @@
"all_cdbs = json.loads(requests.get(f'{URL}/api/concept-dbs/', headers=headers).text)['results']\n",
"# the CDB ID we'll use for this example\n",
"cdb_to_use = all_cdbs[0]['id']\n",
"# you might have many CDBs here. First 2 cdbs: \n",
"# you might have many CDBs here. First 2 cdbs:\n",
"all_cdbs[0:2]"
]
},
Expand Down Expand Up @@ -521,12 +521,12 @@
"for d_id, p_name in zip(dataset_ids, project_names):\n",
" payload = {\n",
" 'name': f'{p_name} Annotation Project',\n",
" 'description': 'Example projects', \n",
" 'cuis': '', \n",
" 'description': 'Example projects',\n",
" 'cuis': '',\n",
" 'tuis': '',\n",
" 'dataset': d_id,\n",
" 'concept_db': cdb_to_use, \n",
" 'vocab': vocab_to_use, \n",
" 'concept_db': cdb_to_use,\n",
" 'vocab': vocab_to_use,\n",
" 'members': users_ids\n",
" }\n",
" project_ids.append(json.loads(requests.post(f'{URL}/api/project-annotate-entities/', json=payload, headers=headers).text))"
Expand Down
44 changes: 23 additions & 21 deletions notebook_docs/Train_MedCAT_Models.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -177,7 +177,7 @@
],
"source": [
"# install medcat\n",
"!pip install medcat\n",
"!pip install \"medcat[spacy,meta-cat,rel-cat,deid]>=2.0.0\"\n",
"# scispacy medium models\n",
"!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz\n",
"# ipywidgets\n",
Expand All @@ -186,7 +186,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T11:27:34.270631Z",
Expand All @@ -195,9 +195,11 @@
},
"outputs": [],
"source": [
"import json\n",
"\n",
"from medcat.cat import CAT\n",
"from medcat.cdb import CDB\n",
"from medcat.utils.vocab import Vocab"
"from medcat.vocab import Vocab"
]
},
{
Expand Down Expand Up @@ -310,7 +312,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T11:27:59.782731Z",
Expand All @@ -319,16 +321,14 @@
},
"outputs": [],
"source": [
"cdb = CDB()\n",
"cdb.load_dict(cdb_path)\n",
"vocab = Vocab()\n",
"vocab.load_dict(vocab_path)\n",
"cdb = CDB.load(cdb_path)\n",
"vocab = Vocab.load(vocab_path)\n",
"cat = CAT(cdb, vocab)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T11:37:38.546552Z",
Expand Down Expand Up @@ -382,7 +382,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"\r\n",
"Epoch: 0, Prec: 0.36538461538461536, Rec: 0.8444444444444444, F1: 0.6049145299145299\n",
"\n",
"Docs with false positives: Psych Text 1; Psych Text 2\n",
Expand Down Expand Up @@ -1383,12 +1383,13 @@
}
],
"source": [
"cat.train_supervised(data_path=\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\", \n",
" nepochs=1,\n",
" lr=0.1,\n",
" anneal=False, # Unless we are reseting the CDB or cui_count this is False\n",
" print_stats=True, \n",
" use_filters=True)"
"with open(\"example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json\") as f:\n",
" data = json.load(f)\n",
"cat.trainer.train_supervised_raw(\n",
" data=data,\n",
" nepochs=1,\n",
" print_stats=True,\n",
" use_filters=True)"
]
},
{
Expand All @@ -1402,7 +1403,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T15:04:02.394607Z",
Expand All @@ -1411,14 +1412,14 @@
},
"outputs": [],
"source": [
"from medcat.meta_cat import MetaCAT\n",
"from medcat.components.addons.meta_cat import MetaCAT\n",
"from tokenizers import ByteLevelBPETokenizer\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-08T14:46:39.070589Z",
Expand All @@ -1427,6 +1428,7 @@
},
"outputs": [],
"source": [
"import numpy as np\n",
"# Tokenizer instantiation\n",
"tokenizer = ByteLevelBPETokenizer(vocab_file='data/medmen-vocab.json', merges_file='data/medmen-merges.txt')\n",
"embeddings = np.load(open('data/embeddings.npy', 'rb'))"
Expand All @@ -1443,7 +1445,7 @@
},
"outputs": [],
"source": [
"metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings, \n",
"metacat = MetaCAT(tokenizer=tokenizer, embeddings=embeddings,\n",
" pad_id=len(embeddings) -1, save_dir='mc_status', device='cpu')"
]
},
Expand Down
11 changes: 6 additions & 5 deletions webapp/api/api/admin/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@

from api.models import AnnotatedEntity, MetaAnnotation, EntityRelation, Document, ConceptDB
from api.solr_utils import drop_collection, import_all_concepts
from api.utils import clear_cdb_cnf_addons

from medcat.cdb import CDB

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -356,20 +359,18 @@ def dataset_document_counts(dataset):

@background(schedule=5)
def _reset_cdb_filters(id):
from medcat.cdb import CDB
concept_db = ConceptDB.objects.get(id=id)
cdb = CDB.load(concept_db.cdb_file.path)
cdb.config.linking['filters'] = {'cuis': set()}
clear_cdb_cnf_addons(cdb, id)
cdb.config.components.linking.filters = {'cuis': set()}
cdb.save(concept_db.cdb_file.path)


@background(schedule=5)
def import_concepts_from_cdb(cdb_model_id: int):
from medcat.cdb import CDB

cdb_model = ConceptDB.objects.get(id=cdb_model_id)
cdb = CDB.load(cdb_model.cdb_file.path)

clear_cdb_cnf_addons(cdb, cdb_model_id)
import_all_concepts(cdb, cdb_model)


Expand Down
48 changes: 30 additions & 18 deletions webapp/api/api/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,20 @@
from background_task.models import Task
from django.contrib.auth.models import User
from django.db.models import QuerySet
from medcat.stats.stats import get_stats
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.meta_cat import MetaCAT
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase
from medcat.utils.meta_cat.data_utils import prepare_from_json, encode_category_values
from medcat.utils.meta_cat.ml_utils import create_batch_piped_data
from medcat.config.config_meta_cat import ConfigMetaCAT
from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon
from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase
from medcat.components.addons.meta_cat.data_utils import prepare_from_json, encode_category_values
from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data
from medcat.vocab import Vocab
from torch import nn

from api.admin import retrieve_project_data
from api.models import AnnotatedEntity, ProjectAnnotateEntities, ProjectMetrics as AppProjectMetrics
from api.utils import clear_cdb_cnf_addons
from core.settings import MEDIA_ROOT

_dt_fmt = '%Y-%m-%d %H:%M:%S.%f'
Expand All @@ -51,6 +53,7 @@ def calculate_metrics(project_ids: List[int], report_name: str):
else:
# assume the cdb / vocab is set in these projects
cdb = CDB.load(projects[0].concept_db.cdb_file.path)
clear_cdb_cnf_addons(cdb, projects[0].concept_db.name)
vocab = Vocab.load(projects[0].vocab.vocab_file.path)
cat = CAT(cdb, vocab, config=cdb.config)
project_data = retrieve_project_data(projects)
Expand Down Expand Up @@ -116,7 +119,7 @@ def annotation_df(self):
"""
annotation_df = pd.DataFrame(self.annotations)
if self.cat:
annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.cui2preferred_name))
annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.get_name))
annotation_df['last_modified'] = pd.to_datetime(annotation_df['last_modified']).dt.tz_localize(None)
return annotation_df

Expand All @@ -138,9 +141,10 @@ def concept_summary(self, extra_cui_filter=None):
concept_count_df['count_variations_ratio'] = round(concept_count_df['concept_count'] /
concept_count_df['variations'], 3)
if self.cat:
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = self.cat._print_stats(data=self.mct_export,
use_project_filters=True,
extra_cui_filter=extra_cui_filter)
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = get_stats(self.cat,
data=self.mct_export,
use_project_filters=True,
extra_cui_filter=extra_cui_filter)
# remap tps, fns, fps to specific user annotations
examples = self.enrich_medcat_metrics(examples)
concept_count_df['fps'] = concept_count_df['cui'].map(fps)
Expand Down Expand Up @@ -242,11 +246,11 @@ def rename_meta_anns(self, meta_anns2rename=dict(), meta_ann_values2rename=dict(
return

def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict:
device = torch.device(config.general['device']) # Create a torch device
batch_size_eval = config.general['batch_size_eval']
pad_id = config.model['padding_idx']
ignore_cpos = config.model['ignore_cpos']
class_weights = config.train['class_weights']
device = torch.device(config.general.device) # Create a torch device
batch_size_eval = config.general.batch_size_eval
pad_id = config.model.padding_idx
ignore_cpos = config.model.ignore_cpos
class_weights = config.train.class_weights

if class_weights is not None:
class_weights = torch.FloatTensor(class_weights).to(device)
Expand Down Expand Up @@ -323,9 +327,17 @@ def full_annotation_df(self) -> pd.DataFrame:
~anns_df['killed'] & ~anns_df['irrelevant']]
meta_df = meta_df.reset_index(drop=True)

for meta_model in self.cat._meta_cats:
logger.info(f'Checking metacat model: {meta_model}')
meta_model_task = meta_model.name
all_meta_cats = self.cat.get_addons_of_type(MetaCATAddon)

for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']:
meta_model_task = meta_model_card['Category Name']
logger.info(f'Checking metacat model: {meta_model_task}')
_meta_models = [mc for mc in all_meta_cats
if mc.config.general.category_name == meta_model_task]
if not _meta_models:
logger.warning(f'MetaCAT model {meta_model_task} not found in the CAT instance.')
continue
meta_model = _meta_models[0]
meta_results = self._eval(meta_model, self.mct_export)
meta_values = {v: k for k, v in meta_results['meta_values'].items()}
pred_meta_values = []
Expand Down Expand Up @@ -408,7 +420,7 @@ def meta_anns_concept_summary(self) -> List[Dict]:
# Store results for this concept
meta_performance[cui] = {
'cui': cui,
'concept_name': self.cat.cdb.cui2preferred_name[cui],
'concept_name': self.cat.cdb.cui2info[cui]['preferred_name'],
'meta_tasks': meta_task_results
}

Expand Down
Loading