fix: preserve the original order of texts during bulk processing

baixiac · baixiac · commit 0982f949e751 · 2025-03-12T12:07:01.000Z
ci: use the GHA cache during CMS docker build
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
@@ -50,14 +50,6 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
-      - name: Cache Docker layers
-        uses: actions/cache@v4
-        with:
-          path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
-          restore-keys: |
-            ${{ runner.os }}-buildx-
-
       - name: Build and push CMS
         uses: docker/build-push-action@v6
         id: build_and_push_cms
@@ -68,8 +60,8 @@ jobs:
           push: true
           tags: |
             ${{ env.REGISTRY }}/${{ env.DOCKER_IMAGE_NAME }}:dev
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache,mode=max
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
 
       - name: Attest image artifacts
         uses: actions/attest-build-provenance@v2
diff --git a/app/api/routers/training_operations.py b/app/api/routers/training_operations.py
@@ -45,7 +45,7 @@ async def train_eval_info(request: Request,
 
 
 @router.post("/evaluate",
-             tags=[Tags.Evaluating.name],
+             tags=[Tags.Training.name],
              response_class=JSONResponse,
              dependencies=[Depends(cms_globals.props.current_active_user)],
              description="Evaluate the model being served with a trainer export")
diff --git a/app/model_services/medcat_model.py b/app/model_services/medcat_model.py
@@ -112,6 +112,7 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
             nproc=max(int(cpu_count() / 2), 1),
             addl_info=["cui2icd10", "cui2ontologies", "cui2snomed", "cui2athena_ids"]
         )
+        docs = dict(sorted(docs.items(), key=lambda x: x[0]))
         annotations_list = []
         for _, doc in docs.items():
             annotations_list.append([Annotation.parse_obj(record) for record in self.get_records_from_doc(doc)])
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -1,4 +1,3 @@
 import os
 
 os.environ["PYTHONPATH"] = os.path.join(os.path.dirname(__file__), "..", "..")
-print(os.environ["PYTHONPATH"])
diff --git a/tests/integration/features/serving.feature b/tests/integration/features/serving.feature
@@ -1,6 +1,7 @@
 Feature:
   CogStack ModelServe APIs
 
+  @status
   Scenario Outline: Get general information about server healthiness, readiness and the running model
     Given CMS app is up and running
     When I send a GET request to <endpoint>
@@ -12,92 +13,103 @@ Feature:
       | /readyz   | medcat_umls                 | 200         |
       | /info     | "model_type":"medcat_umls"  | 200         |
 
+  @ner
   Scenario: Extract entities from free texts
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint        | data             | content_type |
       | /process        | Spinal stenosis  | text/plain   |
     Then the response should contain annotations
 
+  @ner
   Scenario: Extract entities from JSON Lines
     Given CMS app is up and running
     When I send a POST request with the following jsonlines content
       | endpoint        | data                                                                                            | content_type          |
       | /process_jsonl  | {"name": "doc1", "text": "Spinal stenosis"}\n{"name": "doc2", "text": "Spinal stenosis"}        | application/x-ndjson  |
     Then the response should contain json lines
 
-  @skip
+  @ner
   Scenario: Extract entities from bulk texts
     Given CMS app is up and running
     When I send a POST request with the following content
-      | endpoint        | data                                        | content_type          |
-      | /process_bulk   | ["Spinal stenosis", "Spinal stenosis"]      | application/json      |
+      | endpoint        | data                                                          | content_type          |
+      | /process_bulk   | ["Spinal stenosis", "Intracerebral hemorrhage", "Cerebellum"] | application/json      |
     Then the response should contain bulk annotations
 
-  @skip
+  @ner
   Scenario: Extract entities from a file with bulk texts
     Given CMS app is up and running
     When I send a POST request with the following content where data as a file
-      | endpoint             | data                                        | content_type          |
-      | /process_bulk_file   | ["Spinal stenosis", "Spinal stenosis"]      | multipart/form-data   |
+      | endpoint             | data                                                           | content_type          |
+      | /process_bulk_file   | ["Spinal stenosis", "Intracerebral hemorrhage", "Cerebellum"]  | multipart/form-data   |
     Then the response should contain bulk annotations
 
+  @redaction
   Scenario: Extract and redact entities from free texts
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint        | data             | content_type |
       | /redact         | Spinal stenosis  | text/plain   |
     Then the response should contain text [spinal stenosis]
 
+  @redaction
   Scenario: Extract and redact entities from free texts with a mask
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint          | data             | content_type |
       | /redact?mask=***  | Spinal stenosis  | text/plain   |
     Then the response should contain text ***
 
+  @redaction
   Scenario: Extract and redact entities from free texts with a hash
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint                    | data             | content_type |
       | /redact?mask=any&hash=true  | Spinal stenosis  | text/plain   |
     Then the response should contain text 4c86af83314100034ad83fae3227e595fc54cb864c69ea912cd5290b8d0f41a4
 
+  @redaction
   Scenario: Warn when no entities are detected for redaction
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint                          | data            | content_type |
       | /redact?warn_on_no_redaction=true | abcdefgh  | text/plain   |
     Then the response should contain text warning: no entities were detected for redaction.
 
+  @redaction
   Scenario: Extract and redact entities if not filtered out
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint                          | data             | content_type |
       | /redact?concepts_to_keep=C0037944 | Spinal stenosis  | text/plain   |
     Then the response should contain text spinal stenosis
 
+  @redaction
   Scenario: Extract and redact entities with encryption
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint                | data                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |  content_type       |
       | /redact_with_encryption | {"text": "Spinal stenosis", "public_key_pem": "-----BEGIN PUBLIC KEY-----\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA3ITkTP8Tm/5FygcwY2EQ7LgVsuCF0OH7psUqvlXnOPNCfX86CobHBiSFjG9o5ZeajPtTXaf1thUodgpJZVZSqpVTXwGKo8r0COMO87IcwYigkZZgG/WmZgoZART+AA0+JvjFGxflJAxSv7puGlf82E+u5Wz2psLBSDO5qrnmaDZTvPh5eX84cocahVVI7X09/kI+sZiKauM69yoy1bdx16YIIeNm0M9qqS3tTrjouQiJfZ8jUKSZ44Na/81LMVw5O46+5GvwD+OsR43kQ0TexMwgtHxQQsiXLWHCDNy2ZzkzukDYRwA3V2lwVjtQN0WjxHg24BTBDBM+v7iQ7cbweQIDAQAB\n-----END PUBLIC KEY-----"} |  application/json   |
     Then the response should contain encrypted labels
 
+  @preview
   Scenario: Extract and preview entities
     Given CMS app is up and running
     When I send a POST request with the following content
       | endpoint        | data             | content_type |
       | /preview        | Spinal stenosis  | text/plain   |
     Then the response should contain a preview page
 
+  @preview
   Scenario: Preview trainer export
     Given CMS app is up and running
     When I send a POST request with the following trainer export
       | endpoint                                                | file_name           | content_type        |
       | /preview_trainer_export?project_id=14&document_id=3204  | trainer_export.json | multipart/form-data |
     Then the response should contain a preview page
 
+  @train
   Scenario: Train supervised
     Given CMS app is up and running
     When I send a POST request with the following trainer export
@@ -109,6 +121,7 @@ Feature:
     When I send a GET request to /train_eval_metrics with that ID
     Then the response should contain the supervised evaluation metrics
 
+  @train
   Scenario: Train unsupervised
     Given CMS app is up and running
     When I send a POST request with the following training data
@@ -120,6 +133,7 @@ Feature:
     When I send a GET request to /train_eval_metrics with that ID
     Then the response should contain the unsupervised evaluation metrics
 
+  @train
   Scenario: Evaluate served model
     Given CMS app is up and running
     When I send a POST request with the following trainer export
@@ -128,3 +142,41 @@ Feature:
     Then the response should contain the evaluation ID
     When I send a GET request to /train_eval_info with that ID
     Then the response should contain the evaluation information
+
+  @misc
+  Scenario: Sanity check the model with a trainer export
+      Given CMS app is up and running
+      When I send a POST request with the following trainer export
+        | endpoint      | file_name           | content_type        |
+        | /sanity-check | trainer_export.json | multipart/form-data |
+      Then the response should contain evaluation metrics per concept
+
+  @misc
+  Scenario Outline: Calculate Inter Annotator Agreement (IAA) scores between two annotation projects
+    Given CMS app is up and running
+    When I send a POST request with the following trainer export
+      | endpoint                                                                       | file_name                                       | content_type     |
+      | /iaa-scores?annotator_a_project_id=14&annotator_b_project_id=15&scope=<scope>  | trainer_export.json,another_trainer_export.json | application/json |
+    Then the response should contain IAA scores
+
+    Examples:
+      | scope         |
+      | per_concept   |
+      | per_document  |
+      | per_span      |
+
+  @misc
+  Scenario: Concatenate multiple trainer export files into a single file
+    Given CMS app is up and running
+    When I send a POST request with the following trainer export
+      | endpoint                 | file_name                                       | content_type        |
+      | /concat_trainer_exports  | trainer_export.json,another_trainer_export.json | multipart/form-data |
+    Then the response should contain a concatenated trainer export
+
+  @misc
+  Scenario: Get annotation stats of trainer export files
+    Given CMS app is up and running
+    When I send a POST request with the following trainer export
+      | endpoint                 | file_name                                       | content_type        |
+      | /annotation-stats        | trainer_export.json,another_trainer_export.json | multipart/form-data |
+    Then the response should contain annotation stats
diff --git a/tests/integration/features/serving_stream.feature b/tests/integration/features/serving_stream.feature
@@ -1,13 +1,15 @@
 Feature:
   CogStack ModelServe Stream APIs
 
+  @ner-stream
   Scenario: Stream entities extracted from free texts
       Given CMS stream app is up and running
       When I send an async POST request with the following jsonlines content
         | endpoint        | data                                                                                      | content_type          |
         | /stream/process | {"name": "doc1", "text": "Spinal stenosis"}\n{"name": "doc2", "text": "Spinal stenosis"}  | application/x-ndjson  |
       Then the response should contain annotation stream
 
+  @ner-chat
   Scenario: Interactively extract entities from free texts
       Given CMS stream app is up and running
       When I send a piece of text to the WS endpoint
diff --git a/tests/integration/test_steps.py b/tests/integration/test_steps.py
@@ -147,17 +147,22 @@ def check_response_bulk(context):
     assert context["response"].headers["Content-Type"] == "application/json"
     bulk_results = context["response"].json()
     assert isinstance(bulk_results, list)
-    assert len(bulk_results) == 2
+    assert len(bulk_results) == 3
     assert bulk_results[0]["text"] == "Spinal stenosis"
     assert bulk_results[0]["annotations"][0]["start"] == 0
     assert bulk_results[0]["annotations"][0]["end"] == 15
     assert bulk_results[0]["annotations"][0]["label_name"].lower() == "spinal stenosis"
     assert isinstance(bulk_results[0]["annotations"][0]["label_id"], str)
-    assert bulk_results[1]["text"] == "Spinal stenosis"
+    assert bulk_results[1]["text"] == "Intracerebral hemorrhage"
     assert bulk_results[1]["annotations"][0]["start"] == 0
-    assert bulk_results[1]["annotations"][0]["end"] == 15
-    assert bulk_results[1]["annotations"][0]["label_name"].lower() == "spinal stenosis"
+    assert bulk_results[1]["annotations"][0]["end"] == 24
+    assert bulk_results[1]["annotations"][0]["label_name"].lower() == "cerebral hemorrhage"
     assert isinstance(bulk_results[1]["annotations"][0]["label_id"], str)
+    assert bulk_results[2]["text"] == "Cerebellum"
+    assert bulk_results[2]["annotations"][0]["start"] == 0
+    assert bulk_results[2]["annotations"][0]["end"] == 10
+    assert bulk_results[2]["annotations"][0]["label_name"].lower() == "cerebellum"
+    assert isinstance(bulk_results[2]["annotations"][0]["label_id"], str)
     context["response"].close()
 
 @then(parsers.parse("the response should contain text {redaction}"))
@@ -174,10 +179,16 @@ def check_response_previewed(context):
 
 @when(data_table("I send a POST request with the following trainer export", fixture="request", orient="dict"))
 def send_post_training_request_file(context, request):
-    trainer_export_path = os.path.join(os.path.dirname(__file__), "..", "resources", "fixture", request[0]["file_name"])
-    with open(trainer_export_path, "rb") as f:
-        context["response"] = requests.post(f"{context['base_url']}{request[0]['endpoint']}",
-                                            files=[("trainer_export", f)])
+    trainer_export_names = request[0]["file_name"].split(",")
+
+    files = []
+    for trainer_export_name in trainer_export_names:
+        trainer_export_path = os.path.join(os.path.dirname(__file__), "..", "resources", "fixture", trainer_export_name)
+        file = open(trainer_export_path, "rb")
+        files.append(("trainer_export", file))
+
+    context["response"] = requests.post(f"{context['base_url']}{request[0]['endpoint']}", files=files)
+    [file.close() for _, file in files]
 
 @when(data_table("I send a POST request with the following training data", fixture="request", orient="dict"))
 def send_post_training_request_file(context, request):
@@ -245,6 +256,38 @@ def check_response_training_id(context):
     assert "encryption" in response_json["encryptions"][0]
     context["response"].close()
 
+@then("the response should contain evaluation metrics per concept")
+def check_response_sanity_check(context):
+    assert context["response"].status_code == 200
+    assert context["response"].headers["Content-Type"] == "text/csv; charset=utf-8"
+    response_lines = context["response"].content.decode("utf-8").splitlines()
+    assert len(response_lines) > 1
+    assert "concept,name,precision,recall,f1" == response_lines[0]
+    context["response"].close()
+
+@then("the response should contain IAA scores")
+def check_response_iaa(context):
+    assert context["response"].status_code == 200
+    assert context["response"].headers["Content-Type"] == "text/csv; charset=utf-8"
+    response_lines = context["response"].content.decode("utf-8").splitlines()
+    assert "iaa_percentage,cohens_kappa,iaa_percentage_meta,cohens_kappa_meta" in response_lines[0]
+    context["response"].close()
+
+@then("the response should contain a concatenated trainer export")
+def check_response_concatenated_trainer_export(context):
+    assert context["response"].status_code == 200
+    assert context["response"].headers["Content-Type"] == "application/json; charset=utf-8"
+    assert len(context["response"].text) == 36918
+
+@then("the response should contain annotation stats")
+def check_response_annotation_stats(context):
+    assert context["response"].status_code == 200
+    assert context["response"].headers["Content-Type"] == "text/csv; charset=utf-8"
+    response_lines = context["response"].content.decode("utf-8").splitlines()
+    assert len(response_lines) > 1
+    assert "concept,anno_count,anno_unique_counts,anno_ignorance_counts" == response_lines[0]
+    context["response"].close()
+
 @when(data_table("I send an async POST request with the following jsonlines content", fixture="request", orient="dict"))
 @async_to_sync
 async def send_async_post_request(context_stream, request):

Original file line number	Diff line number	Diff line change
`@@ -112,6 +112,7 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:`
`112`	`112`	`nproc=max(int(cpu_count() / 2), 1),`
`113`	`113`	`addl_info=["cui2icd10", "cui2ontologies", "cui2snomed", "cui2athena_ids"]`
`114`	`114`	`)`
	`115`	`+ docs = dict(sorted(docs.items(), key=lambda x: x[0]))`
`115`	`116`	`annotations_list = []`
`116`	`117`	`for _, doc in docs.items():`
`117`	`118`	`annotations_list.append([Annotation.parse_obj(record) for record in self.get_records_from_doc(doc)])`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`	`1`	`import os`
`2`	`2`
`3`	`3`	`os.environ["PYTHONPATH"] = os.path.join(os.path.dirname(__file__), "..", "..")`
`4`		`-print(os.environ["PYTHONPATH"])`