Upload Export docs fix (#150)

tomolopolis · Tom Searle · mart-r · web-flow · commit b2b023b90a10 · 2025-09-30T11:12:17.000+01:00
* interim changes for medcat-v2

* updated TODOs

* Update dependency to medcat v2

* Update CDB/Vocab load to use the load classmethod again

* Move away from pkg_resources (deprecated)

* Use v2 based API for loading addons (MetaCATs)

* Update MetaCAT loading

* Update metrics to v2 format

* Do config parsing locally

* Update to correct attribute name

* Update solr utils to v2

* Fix config access for v2

* Remove addons from CDB config upon load

* Fix syntax error

* Update Meta Annotation getting so as to avoid error if none set

* Fix entity CUI / start/end char access

* Fix some more entity detail access

* Remove unigram table error (irrelevant / redundant)

* Log more info regarding failure upon document preparation

* Centralising clearing CDB addons after explicit load

* More specific import

* Clear CDB config addons everywhere if/when applicable

* Avoid circular imports by importing dynamically

* Correctly set CDB path within v2 model packs

* Update (very old) notebook to v2

* Update (very old) notebook for v2 installation

* CU-869aknppd: medcattrainer: upgrade dep

* CU-869an9w8y: medcat-trainer: fix: imported exported projects need to have an available model for project validation requirements

---------

Co-authored-by: Tom Searle <tom@cogstack.org>
Co-authored-by: mart-r <mart.ratas@gmail.com>
diff --git a/medcat-trainer/client/mctclient.py b/medcat-trainer/client/mctclient.py
@@ -3,7 +3,7 @@
 import json
 import os
 from abc import ABC
-from typing import List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import requests
 
@@ -495,6 +495,24 @@ def get_models(self) -> Tuple[List[str], List[str]]:
         mct_vocabs = [MCTVocab(id=v['id'], name=v['name'], vocab_file=v['vocab_file']) for v in vocabs]
         return mct_cdbs, mct_vocabs
 
+    def get_concept_dbs(self) -> List[MCTConceptDB]:
+        """Get all concept databases in the MedCATTrainer instance.
+
+        Returns:
+            List[MCTConceptDB]: A list of all concept databases in the MedCATTrainer instance
+        """
+        cdbs = json.loads(requests.get(f'{self.server}/api/concept-dbs/', headers=self.headers).text)['results']
+        return [MCTConceptDB(id=cdb['id'], name=cdb['name'], conceptdb_file=cdb['cdb_file']) for cdb in cdbs]
+
+    def get_vocabs(self) -> List[MCTVocab]:
+        """Get all vocabularies in the MedCATTrainer instance.
+
+        Returns:
+            List[MCTVocab]: A list of all vocabularies in the MedCATTrainer instance
+        """
+        vocabs = json.loads(requests.get(f'{self.server}/api/vocabs/', headers=self.headers).text)['results']
+        return [MCTVocab(id=v['id'], name=v['name'], vocab_file=v['vocab_file']) for v in vocabs]
+
     def get_model_packs(self) -> List[MCTModelPack]:
         """Get all MedCAT model packs in the MedCATTrainer instance.
 
@@ -559,7 +577,7 @@ def get_datasets(self) -> List[MCTDataset]:
         return mct_datasets
 
     def get_project_annos(self, projects: List[MCTProject]):
-        """Get the annotations for a list of projects. Schema is documented here: https://github.com/medcat/MedCATtrainer/blob/main/docs/api.md#download-annotations
+        """Get the annotations for a list of projects.
 
         Args:
             projects (List[MCTProject]): A list of projects to get annotations for
@@ -574,6 +592,44 @@ def get_project_annos(self, projects: List[MCTProject]):
                                        headers=self.headers).text)
         return resp
 
+    def upload_projects_export(self, projects: Dict[str, Any],
+                               cdb: Union[MCTConceptDB, str]=None,
+                               vocab: Union[MCTVocab, str]=None,
+                               modelpack: Union[MCTModelPack, str]=None):
+        """Upload Trainer export as a list of projects to a MedCATTrainer instance.
+
+        Args:
+            projects (List[MCTProject]): A list of projects to upload
+            cdb (Union[MCTConceptDB, str]): The concept database to be used in the project - CDB name or the MCTCDB Object
+            vocab (Union[MCTVocab, str]): The vocabulary to be used in the project - Vocab name or the MCTVocab Object
+            modelpack (Union[MCTModelPack, str]): The model pack to be used in the project - ModelPack name or the MCTModelPack Object
+        """
+        if isinstance(cdb, str):
+            cdb = [c for c in self.get_concept_dbs() if c.name == cdb].pop()
+        if isinstance(vocab, str):
+            vocab = [v for v in self.get_vocabs() if v.name == vocab].pop()
+        if isinstance(modelpack, str):
+            modelpack = [m for m in self.get_model_packs() if m.name == modelpack].pop()
+
+        payload = {
+            'exported_projects': projects
+        }
+
+        if cdb and vocab:
+            payload['cdb_id'] = cdb.id
+            payload['vocab_id'] = vocab.id
+        elif modelpack:
+            payload['modelpack_id'] = modelpack.id
+        else:
+            raise MCTUtilsException('No cdb, vocab, or modelpack provided, use a ')
+
+        resp = requests.post(f'{self.server}/api/upload-deployment/', headers=self.headers,
+                             json=payload)
+        if 200 <= resp.status_code < 300:
+            return resp.json()
+        else:
+            raise MCTUtilsException(f'Failed to upload projects export: {resp.text}')
+
     def __str__(self) -> str:
         return f'{self.server} \t {self.username} \t {self.password}'
 
diff --git a/medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb b/medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -63,7 +63,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -90,7 +90,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -304,7 +304,37 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Users:\n",
+      "2 : annotator1\n",
+      "1 : admin\n",
+      "\n",
+      "Datasets:\n",
+      "1 : Example Annotation Project - Model pack \t http://localhost:8001/media/cardio.csv\n",
+      "2 : Example Project - SNOMED CT All IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20IMPORTED_dataset.csv\n",
+      "3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n",
+      "\n",
+      "Concept DBs:\n",
+      "\n",
+      "Vocabularies:\n",
+      "\n",
+      "ModelPacks:\n",
+      "1 : snomed_2023_htn_modelpack \t http://localhost:8001/media/snomed_2023_base_model_dm_htn_copd_only_f86505ba72beff08.zipv2_48299cf9ff983030.zip\n",
+      "\n",
+      "Meta Tasks:\n",
+      "1 : Presence\n",
+      "2 : Subject\n",
+      "3 : Time\n",
+      "\n",
+      "Relation Tasks:\n",
+      "1 : Spatial\n"
+     ]
+    }
+   ],
    "source": [
     "# Get users\n",
     "users = session.get_users()\n",
@@ -378,7 +408,15 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Created project with model pack: 2 : Demo General Medical Annotation \t Annotation of neurology medical conditions \t 3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n"
+     ]
+    }
+   ],
    "source": [
     "# Method 2: Create a project with a modelpack\n",
     "\n",
@@ -408,7 +446,17 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloaded annotations for 2 projects:\n",
+      "Example Project - SNOMED CT All - ModelPack IMPORTED\n",
+      "Demo General Medical Annotation\n"
+     ]
+    }
+   ],
    "source": [
     "# Get all projects\n",
     "mct_projects = session.get_projects()\n",
@@ -436,15 +484,22 @@
    "metadata": {},
    "source": [
     "## 6. Saving Annotations for Analysis\n",
-    "\n",
-    "Finally, let's save the annotations to a file for later analysis:"
+    "Once annotations have been collected they can be downloaded."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Annotations saved to ./example_data/medical_annotations.json\n"
+     ]
+    }
+   ],
    "source": [
     "# Save MCT export / annotations to a file\n",
     "with open(\"./example_data/medical_annotations.json\", \"w\") as f:\n",
@@ -453,6 +508,41 @@
     "print(\"Annotations saved to ./example_data/medical_annotations.json\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Save Annotations as a Project\n",
+    "Annotations can be 'imported' into a trainer instance:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects = json.load(open('./example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_pack = session.get_model_packs()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "session.upload_projects_export(projects, modelpack=model_pack[0])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -463,9 +553,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "bioext-medcat-env",
+   "display_name": "Python [conda env:cattrainer]",
    "language": "python",
-   "name": "python3"
+   "name": "conda-env-cattrainer-py"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/medcat-trainer/webapp/api/api/admin/models.py b/medcat-trainer/webapp/api/api/admin/models.py
@@ -164,7 +164,7 @@ class VocabularyAdmin(admin.ModelAdmin):
     model = Vocabulary
     list_display = ('name', 'create_time', 'last_modified', 'last_modified_by')
     fields = ('name', 'vocab_file', 'create_time', 'last_modified', 'last_modified_by')
-    
+
     def save_model(self, request, obj, form, change):
         obj.last_modified_by = request.user
         super().save_model(request, obj, form, change)
diff --git a/medcat-trainer/webapp/api/api/data_utils.py b/medcat-trainer/webapp/api/api/data_utils.py
@@ -66,7 +66,7 @@ def delete_orphan_docs(dataset: Dataset):
     Document.objects.filter(dataset__id=dataset.id).delete()
 
 
-def upload_projects_export(medcat_export: Dict):
+def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, modelpack_id: str):
     for proj in medcat_export['projects']:
         p = ProjectAnnotateEntities()
         p.name = proj['name'] + ' IMPORTED'
@@ -79,6 +79,14 @@ def upload_projects_export(medcat_export: Dict):
         else:
             p.cuis = proj['cuis']
 
+        if cdb_id is not None and vocab_id is not None:
+            p.concept_db = ConceptDB.objects.get(id=cdb_id)
+            p.vocab = Vocabulary.objects.get(id=vocab_id)
+        elif modelpack_id is not None:
+            p.model_pack = ModelPack.objects.get(id=modelpack_id)
+        else:
+            raise InvalidParameterError("No cdb, vocab, or modelpack provided")
+
         # ensure current deployment has the neccessary - Entity, MetaTak, Relation, and warn on not present User objects.
         ent_labels, meta_tasks, rels, unavailable_users, available_users = set(), defaultdict(set), set(), set(), dict()
         for doc in proj['documents']:
@@ -196,13 +204,13 @@ def upload_projects_export(medcat_export: Dict):
                 # link relations with start and end anno ents
                 er.start_entity = anno_to_doc_ind[relation['start_entity_start_idx']]
                 er.end_entity = anno_to_doc_ind[relation['end_entity_start_idx']]
-                try:
+                if relation.get('create_time') is not None:
                     er.create_time = datetime.strptime(relation['create_time'], _dt_fmt)
-                except ValueError:
+                else:
                     er.create_time = datetime.now()
-                try:
+                if relation.get('last_modified_time') is not None:
                     er.last_modified = datetime.strptime(relation['last_modified_time'], _dt_fmt)
-                except ValueError:
+                else:
                     er.last_modified = datetime.now()
                 er.save()
         logger.info(f"Finished annotation import for project {proj['name']}")
diff --git a/medcat-trainer/webapp/api/api/migrations/0091_exportedproject_cdb_id_exportedproject_modelpack_id_and_more.py b/medcat-trainer/webapp/api/api/migrations/0091_exportedproject_cdb_id_exportedproject_modelpack_id_and_more.py
@@ -0,0 +1,29 @@
+# Generated by Django 5.1.7 on 2025-09-29 16:16
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('api', '0090_merge_20250623_1330'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='exportedproject',
+            name='cdb_id',
+            field=models.ForeignKey(blank=True, default=None, help_text='The ConceptDB to be set for this exported project', null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.conceptdb'),
+        ),
+        migrations.AddField(
+            model_name='exportedproject',
+            name='modelpack_id',
+            field=models.ForeignKey(blank=True, default=None, help_text='The ModelPack to be set for this exported project', null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.modelpack'),
+        ),
+        migrations.AddField(
+            model_name='exportedproject',
+            name='vocab_id',
+            field=models.ForeignKey(blank=True, default=None, help_text='The Vocabulary to be set for this exported project', null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.vocabulary'),
+        ),
+    ]
diff --git a/medcat-trainer/webapp/api/api/models.py b/medcat-trainer/webapp/api/api/models.py
@@ -524,9 +524,12 @@ def __str__(self):
 
 class ExportedProject(models.Model):
     trainer_export_file = models.FileField(help_text='Previously exported MedCATtrainer .json file')
+    cdb_id = models.ForeignKey('ConceptDB', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The ConceptDB to be set for this exported project')
+    vocab_id = models.ForeignKey('Vocabulary', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The Vocabulary to be set for this exported project')
+    modelpack_id = models.ForeignKey('ModelPack', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The ModelPack to be set for this exported project')
 
     def __str__(self):
-        return self.trainer_export_file.name
+        return f'{self.trainer_export_file.name} - {self.cdb_id} - {self.vocab_id} - {self.modelpack_id}'
 
 
 class ProjectMetrics(models.Model):
diff --git a/medcat-trainer/webapp/api/api/signals.py b/medcat-trainer/webapp/api/api/signals.py
@@ -43,7 +43,19 @@ def remove_dataset_file(sender, instance, **kwargs):
 def save_exported_projects(sender, instance, **kwargs):
     if not instance.trainer_export_file.path.endswith('.json'):
         raise Exception("Please make sure the file is a .json file")
-    upload_projects_export(json.load(open(instance.trainer_export_file.path)))
+    cdb = instance.cdb_id
+    vocab = instance.vocab_id
+    modelpack = instance.modelpack_id
+
+    cdb = None if cdb is None else cdb.id
+    vocab = None if vocab is None else vocab.id
+    modelpack = None if modelpack is None else modelpack.id
+
+    upload_projects_export(
+        json.load(open(instance.trainer_export_file.path)),
+        cdb_id=cdb,
+        vocab_id=vocab,
+        modelpack_id=modelpack)
 
 
 @receiver(pre_delete, sender=ModelPack)
diff --git a/medcat-trainer/webapp/api/api/views.py b/medcat-trainer/webapp/api/api/views.py
diff --git a/medcat-trainer/webapp/requirements.txt b/medcat-trainer/webapp/requirements.txt