Skip to content

Commit b2b023b

Browse files
tomolopolisTom Searlemart-r
authored
Upload Export docs fix (#150)
* interim changes for medcat-v2 * updated TODOs * Update dependency to medcat v2 * Update CDB/Vocab load to use the load classmethod again * Move away from pkg_resources (deprecated) * Use v2 based API for loading addons (MetaCATs) * Update MetaCAT loading * Update metrics to v2 format * Do config parsing locally * Update to correct attribute name * Update solr utils to v2 * Fix config access for v2 * Remove addons from CDB config upon load * Fix syntax error * Update Meta Annotation getting so as to avoid error if none set * Fix entity CUI / start/end char access * Fix some more entity detail access * Remove unigram table error (irrelevant / redundant) * Log more info regarding failure upon document preparation * Centralising clearnig CDB addons afer explicit load * More specific import * Clear CDB config addons everywhere if/when applicable * Avoid circular imports by importing dynamically * Correctly set CDB path within v2 model packs * Update (very old) notebook to v2 * Update (very old) notebook for v2 installation * CU-869aknppd: medcattrainer: upgrade dep * CU-869an9w8y: medcat-trainer: fix: imported exported projects needs to have an available model for project validation requirements --------- Co-authored-by: Tom Searle <[email protected]> Co-authored-by: mart-r <[email protected]>
1 parent 764eeeb commit b2b023b

File tree

9 files changed

+231
-25
lines changed

9 files changed

+231
-25
lines changed

medcat-trainer/client/mctclient.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
44
import os
55
from abc import ABC
6-
from typing import List, Tuple, Union
6+
from typing import Any, Dict, List, Tuple, Union
77

88
import requests
99

@@ -495,6 +495,24 @@ def get_models(self) -> Tuple[List[str], List[str]]:
495495
mct_vocabs = [MCTVocab(id=v['id'], name=v['name'], vocab_file=v['vocab_file']) for v in vocabs]
496496
return mct_cdbs, mct_vocabs
497497

498+
def get_concept_dbs(self) -> List[MCTConceptDB]:
499+
"""Get all concept databases in the MedCATTrainer instance.
500+
501+
Returns:
502+
List[MCTConceptDB]: A list of all concept databases in the MedCATTrainer instance
503+
"""
504+
cdbs = json.loads(requests.get(f'{self.server}/api/concept-dbs/', headers=self.headers).text)['results']
505+
return [MCTConceptDB(id=cdb['id'], name=cdb['name'], conceptdb_file=cdb['cdb_file']) for cdb in cdbs]
506+
507+
def get_vocabs(self) -> List[MCTVocab]:
508+
"""Get all vocabularies in the MedCATTrainer instance.
509+
510+
Returns:
511+
List[MCTVocab]: A list of all vocabularies in the MedCATTrainer instance
512+
"""
513+
vocabs = json.loads(requests.get(f'{self.server}/api/vocabs/', headers=self.headers).text)['results']
514+
return [MCTVocab(id=v['id'], name=v['name'], vocab_file=v['vocab_file']) for v in vocabs]
515+
498516
def get_model_packs(self) -> List[MCTModelPack]:
499517
"""Get all MedCAT model packs in the MedCATTrainer instance.
500518
@@ -559,7 +577,7 @@ def get_datasets(self) -> List[MCTDataset]:
559577
return mct_datasets
560578

561579
def get_project_annos(self, projects: List[MCTProject]):
562-
"""Get the annotations for a list of projects. Schema is documented here: https://github.com/medcat/MedCATtrainer/blob/main/docs/api.md#download-annotations
580+
"""Get the annotations for a list of projects.
563581
564582
Args:
565583
projects (List[MCTProject]): A list of projects to get annotations for
@@ -574,6 +592,44 @@ def get_project_annos(self, projects: List[MCTProject]):
574592
headers=self.headers).text)
575593
return resp
576594

595+
def upload_projects_export(self, projects: Dict[str, Any],
596+
cdb: Union[MCTConceptDB, str]=None,
597+
vocab: Union[MCTVocab, str]=None,
598+
modelpack: Union[MCTModelPack, str]=None):
599+
"""Upload Trainer export as a list of projects to a MedCATTrainer instance.
600+
601+
Args:
602+
projects (Dict[str, Any]): The Trainer export (a dict of exported projects, e.g. loaded from an export .json file) to upload
603+
cdb (Union[MCTConceptDB, str]): The concept database to be used in the project - CDB name or the MCTConceptDB Object
604+
vocab (Union[MCTVocab, str]): The vocabulary to be used in the project - Vocab name or the MCTVocab Object
605+
modelpack (Union[MCTModelPack, str]): The model pack to be used in the project - ModelPack name or the MCTModelPack Object
606+
"""
607+
if isinstance(cdb, str):
608+
cdb = [c for c in self.get_concept_dbs() if c.name == cdb].pop()
609+
if isinstance(vocab, str):
610+
vocab = [v for v in self.get_vocabs() if v.name == vocab].pop()
611+
if isinstance(modelpack, str):
612+
modelpack = [m for m in self.get_model_packs() if m.name == modelpack].pop()
613+
614+
payload = {
615+
'exported_projects': projects
616+
}
617+
618+
if cdb and vocab:
619+
payload['cdb_id'] = cdb.id
620+
payload['vocab_id'] = vocab.id
621+
elif modelpack:
622+
payload['modelpack_id'] = modelpack.id
623+
else:
624+
raise MCTUtilsException('No cdb, vocab, or modelpack provided, use a ')
625+
626+
resp = requests.post(f'{self.server}/api/upload-deployment/', headers=self.headers,
627+
json=payload)
628+
if 200 <= resp.status_code < 300:
629+
return resp.json()
630+
else:
631+
raise MCTUtilsException(f'Failed to upload projects export: {resp.text}')
632+
577633
def __str__(self) -> str:
578634
return f'{self.server} \t {self.username} \t {self.password}'
579635

medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb

Lines changed: 102 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
},
5151
{
5252
"cell_type": "code",
53-
"execution_count": 5,
53+
"execution_count": 3,
5454
"metadata": {},
5555
"outputs": [],
5656
"source": [
@@ -63,7 +63,7 @@
6363
},
6464
{
6565
"cell_type": "code",
66-
"execution_count": 6,
66+
"execution_count": 4,
6767
"metadata": {},
6868
"outputs": [],
6969
"source": [
@@ -90,7 +90,7 @@
9090
},
9191
{
9292
"cell_type": "code",
93-
"execution_count": 7,
93+
"execution_count": 3,
9494
"metadata": {},
9595
"outputs": [
9696
{
@@ -304,7 +304,37 @@
304304
"cell_type": "code",
305305
"execution_count": null,
306306
"metadata": {},
307-
"outputs": [],
307+
"outputs": [
308+
{
309+
"name": "stdout",
310+
"output_type": "stream",
311+
"text": [
312+
"Users:\n",
313+
"2 : annotator1\n",
314+
"1 : admin\n",
315+
"\n",
316+
"Datasets:\n",
317+
"1 : Example Annotation Project - Model pack \t http://localhost:8001/media/cardio.csv\n",
318+
"2 : Example Project - SNOMED CT All IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20IMPORTED_dataset.csv\n",
319+
"3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n",
320+
"\n",
321+
"Concept DBs:\n",
322+
"\n",
323+
"Vocabularies:\n",
324+
"\n",
325+
"ModelPacks:\n",
326+
"1 : snomed_2023_htn_modelpack \t http://localhost:8001/media/snomed_2023_base_model_dm_htn_copd_only_f86505ba72beff08.zipv2_48299cf9ff983030.zip\n",
327+
"\n",
328+
"Meta Tasks:\n",
329+
"1 : Presence\n",
330+
"2 : Subject\n",
331+
"3 : Time\n",
332+
"\n",
333+
"Relation Tasks:\n",
334+
"1 : Spatial\n"
335+
]
336+
}
337+
],
308338
"source": [
309339
"# Get users\n",
310340
"users = session.get_users()\n",
@@ -378,7 +408,15 @@
378408
"cell_type": "code",
379409
"execution_count": null,
380410
"metadata": {},
381-
"outputs": [],
411+
"outputs": [
412+
{
413+
"name": "stdout",
414+
"output_type": "stream",
415+
"text": [
416+
"Created project with model pack: 2 : Demo General Medical Annotation \t Annotation of neurology medical conditions \t 3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n"
417+
]
418+
}
419+
],
382420
"source": [
383421
"# Method 2: Create a project with a modelpack\n",
384422
"\n",
@@ -408,7 +446,17 @@
408446
"cell_type": "code",
409447
"execution_count": null,
410448
"metadata": {},
411-
"outputs": [],
449+
"outputs": [
450+
{
451+
"name": "stdout",
452+
"output_type": "stream",
453+
"text": [
454+
"Downloaded annotations for 2 projects:\n",
455+
"Example Project - SNOMED CT All - ModelPack IMPORTED\n",
456+
"Demo General Medical Annotation\n"
457+
]
458+
}
459+
],
412460
"source": [
413461
"# Get all projects\n",
414462
"mct_projects = session.get_projects()\n",
@@ -436,15 +484,22 @@
436484
"metadata": {},
437485
"source": [
438486
"## 6. Saving Annotations for Analysis\n",
439-
"\n",
440-
"Finally, let's save the annotations to a file for later analysis:"
487+
"Once annotations have been collected they can be downloaded."
441488
]
442489
},
443490
{
444491
"cell_type": "code",
445-
"execution_count": null,
492+
"execution_count": 12,
446493
"metadata": {},
447-
"outputs": [],
494+
"outputs": [
495+
{
496+
"name": "stdout",
497+
"output_type": "stream",
498+
"text": [
499+
"Annotations saved to ./example_data/medical_annotations.json\n"
500+
]
501+
}
502+
],
448503
"source": [
449504
"# Save MCT export / annotations to a file\n",
450505
"with open(\"./example_data/medical_annotations.json\", \"w\") as f:\n",
@@ -453,6 +508,41 @@
453508
"print(\"Annotations saved to ./example_data/medical_annotations.json\")"
454509
]
455510
},
511+
{
512+
"cell_type": "markdown",
513+
"metadata": {},
514+
"source": [
515+
"## 7. Save Annotations as a Project\n",
516+
"Annotations can be 'imported' into a trainer instance:"
517+
]
518+
},
519+
{
520+
"cell_type": "code",
521+
"execution_count": 5,
522+
"metadata": {},
523+
"outputs": [],
524+
"source": [
525+
"projects = json.load(open('./example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json'))"
526+
]
527+
},
528+
{
529+
"cell_type": "code",
530+
"execution_count": 6,
531+
"metadata": {},
532+
"outputs": [],
533+
"source": [
534+
"model_pack = session.get_model_packs()"
535+
]
536+
},
537+
{
538+
"cell_type": "code",
539+
"execution_count": null,
540+
"metadata": {},
541+
"outputs": [],
542+
"source": [
543+
"session.upload_projects_export(projects, modelpack=model_pack[0])"
544+
]
545+
},
456546
{
457547
"cell_type": "markdown",
458548
"metadata": {},
@@ -463,9 +553,9 @@
463553
],
464554
"metadata": {
465555
"kernelspec": {
466-
"display_name": "bioext-medcat-env",
556+
"display_name": "Python [conda env:cattrainer]",
467557
"language": "python",
468-
"name": "python3"
558+
"name": "conda-env-cattrainer-py"
469559
},
470560
"language_info": {
471561
"codemirror_mode": {

medcat-trainer/webapp/api/api/admin/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ class VocabularyAdmin(admin.ModelAdmin):
164164
model = Vocabulary
165165
list_display = ('name', 'create_time', 'last_modified', 'last_modified_by')
166166
fields = ('name', 'vocab_file', 'create_time', 'last_modified', 'last_modified_by')
167-
167+
168168
def save_model(self, request, obj, form, change):
169169
obj.last_modified_by = request.user
170170
super().save_model(request, obj, form, change)

medcat-trainer/webapp/api/api/data_utils.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def delete_orphan_docs(dataset: Dataset):
6666
Document.objects.filter(dataset__id=dataset.id).delete()
6767

6868

69-
def upload_projects_export(medcat_export: Dict):
69+
def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, modelpack_id: str):
7070
for proj in medcat_export['projects']:
7171
p = ProjectAnnotateEntities()
7272
p.name = proj['name'] + ' IMPORTED'
@@ -79,6 +79,14 @@ def upload_projects_export(medcat_export: Dict):
7979
else:
8080
p.cuis = proj['cuis']
8181

82+
if cdb_id is not None and vocab_id is not None:
83+
p.concept_db = ConceptDB.objects.get(id=cdb_id)
84+
p.vocab = Vocabulary.objects.get(id=vocab_id)
85+
elif modelpack_id is not None:
86+
p.model_pack = ModelPack.objects.get(id=modelpack_id)
87+
else:
88+
raise InvalidParameterError("No cdb, vocab, or modelpack provided")
89+
8290
# ensure the current deployment has the necessary Entity, MetaTask and Relation objects, and warn about User objects that are not present.
8391
ent_labels, meta_tasks, rels, unavailable_users, available_users = set(), defaultdict(set), set(), set(), dict()
8492
for doc in proj['documents']:
@@ -196,13 +204,13 @@ def upload_projects_export(medcat_export: Dict):
196204
# link relations with start and end anno ents
197205
er.start_entity = anno_to_doc_ind[relation['start_entity_start_idx']]
198206
er.end_entity = anno_to_doc_ind[relation['end_entity_start_idx']]
199-
try:
207+
if relation.get('create_time') is not None:
200208
er.create_time = datetime.strptime(relation['create_time'], _dt_fmt)
201-
except ValueError:
209+
else:
202210
er.create_time = datetime.now()
203-
try:
211+
if relation.get('last_modified_time') is not None:
204212
er.last_modified = datetime.strptime(relation['last_modified_time'], _dt_fmt)
205-
except ValueError:
213+
else:
206214
er.last_modified = datetime.now()
207215
er.save()
208216
logger.info(f"Finished annotation import for project {proj['name']}")
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Generated by Django 5.1.7 on 2025-09-29 16:16
2+
3+
import django.db.models.deletion
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
8+
9+
dependencies = [
10+
('api', '0090_merge_20250623_1330'),
11+
]
12+
13+
operations = [
14+
migrations.AddField(
15+
model_name='exportedproject',
16+
name='cdb_id',
17+
field=models.ForeignKey(blank=True, default=None, help_text='The ConceptDB to be set for this exported project', null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.conceptdb'),
18+
),
19+
migrations.AddField(
20+
model_name='exportedproject',
21+
name='modelpack_id',
22+
field=models.ForeignKey(blank=True, default=None, help_text='The ModelPack to be set for this exported project', null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.modelpack'),
23+
),
24+
migrations.AddField(
25+
model_name='exportedproject',
26+
name='vocab_id',
27+
field=models.ForeignKey(blank=True, default=None, help_text='The Vocabulary to be set for this exported project', null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.vocabulary'),
28+
),
29+
]

medcat-trainer/webapp/api/api/models.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,9 +524,12 @@ def __str__(self):
524524

525525
class ExportedProject(models.Model):
526526
trainer_export_file = models.FileField(help_text='Previously exported MedCATtrainer .json file')
527+
cdb_id = models.ForeignKey('ConceptDB', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The ConceptDB to be set for this exported project')
528+
vocab_id = models.ForeignKey('Vocabulary', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The Vocabulary to be set for this exported project')
529+
modelpack_id = models.ForeignKey('ModelPack', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The ModelPack to be set for this exported project')
527530

528531
def __str__(self):
529-
return self.trainer_export_file.name
532+
return f'{self.trainer_export_file.name} - {self.cdb_id} - {self.vocab_id} - {self.modelpack_id}'
530533

531534

532535
class ProjectMetrics(models.Model):

medcat-trainer/webapp/api/api/signals.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,19 @@ def remove_dataset_file(sender, instance, **kwargs):
4343
def save_exported_projects(sender, instance, **kwargs):
4444
if not instance.trainer_export_file.path.endswith('.json'):
4545
raise Exception("Please make sure the file is a .json file")
46-
upload_projects_export(json.load(open(instance.trainer_export_file.path)))
46+
cdb = instance.cdb_id
47+
vocab = instance.vocab_id
48+
modelpack = instance.modelpack_id
49+
50+
cdb = None if cdb is None else cdb.id
51+
vocab = None if vocab is None else vocab.id
52+
modelpack = None if modelpack is None else modelpack.id
53+
54+
upload_projects_export(
55+
json.load(open(instance.trainer_export_file.path)),
56+
cdb_id=cdb,
57+
vocab_id=vocab,
58+
modelpack_id=modelpack)
4759

4860

4961
@receiver(pre_delete, sender=ModelPack)

0 commit comments

Comments
 (0)