feat(LAB-3098): dynamic llm export completion level classification (#1828)

RuellePaul · paulruelle · web-flow · commit 04f8b6383f4d · 2024-12-11T09:47:32.000+01:00
Co-authored-by: paulruelle <paul.ruelle@kili-technology.com>
diff --git a/src/kili/llm/services/export/dynamic.py b/src/kili/llm/services/export/dynamic.py
@@ -28,6 +28,8 @@
     "modelName",
 ]
 
+DEFAULT_JOB_LEVEL = "round"
+
 
 class LLMDynamicExporter:
     """Handle exports of LLM_RLHF projects."""
@@ -72,8 +74,10 @@ def export(
                         "label_type": label["labelType"],
                         "label": {},
                     }
-                    if formatted_response["turn"]:
-                        label_data["label"]["turn"] = formatted_response["turn"]
+                    if formatted_response["round"]:
+                        label_data["label"]["round"] = formatted_response["round"]
+                    if formatted_response["completion"]:
+                        label_data["label"]["completion"] = formatted_response["completion"]
                     if step == total_rounds - 1 and formatted_response["conversation"]:
                         label_data["label"]["conversation"] = formatted_response["conversation"]
 
@@ -238,7 +242,7 @@ def _format_comparison_annotation(annotation, completions, job, obfuscated_model
 def _format_json_response(
     jobs_config: Dict, annotations: List[Dict], completions: List[Dict], obfuscated_models: Dict
 ) -> Dict[str, Dict[str, Union[str, List[str]]]]:
-    result = {"turn": {}, "conversation": {}}
+    result = {"round": {}, "conversation": {}, "completion": {}}
     for annotation in annotations:
         formatted_response = None
         job = jobs_config[annotation["job"]]
@@ -251,14 +255,20 @@ def _format_json_response(
                 annotation, completions, job, obfuscated_models
             )
 
+        job_level = job.get("level", DEFAULT_JOB_LEVEL)
+
         if formatted_response is None:
             logging.warning(
                 f"Annotation with job {annotation['job']} with mlTask {job['mlTask']} not supported. Ignored in the export."
             )
-        elif "level" in job and job["level"] == "conversation":
-            result["conversation"][annotation["job"]] = formatted_response
+
+        elif job_level == "completion":
+            result.setdefault(job_level, {}).setdefault(annotation["job"], {})[
+                annotation["chatItemId"]
+            ] = formatted_response
+
         else:
-            result["turn"][annotation["job"]] = formatted_response
+            result[job_level][annotation["job"]] = formatted_response
 
     return result
 
diff --git a/tests/unit/llm/services/export/test_dynamic.py b/tests/unit/llm/services/export/test_dynamic.py
@@ -281,7 +281,10 @@
                     "created_at": "2024-08-06T12:30:42.122Z",
                     "label_type": "DEFAULT",
                     "label": {
-                        "turn": {"COMPARISON_JOB": "A_3", "CLASSIFICATION_JOB": ["BOTH_ARE_GOOD"]},
+                        "round": {
+                            "COMPARISON_JOB": "A_3",
+                            "CLASSIFICATION_JOB": ["BOTH_ARE_GOOD"],
+                        },
                     },
                 }
             ],
@@ -358,7 +361,7 @@
                     "created_at": "2024-08-06T12:30:42.122Z",
                     "label_type": "DEFAULT",
                     "label": {
-                        "turn": {"COMPARISON_JOB": "B_1"},
+                        "round": {"COMPARISON_JOB": "B_1"},
                     },
                 }
             ],
@@ -449,7 +452,7 @@
                     "created_at": "2024-08-06T12:30:42.122Z",
                     "label_type": "DEFAULT",
                     "label": {
-                        "turn": {"COMPARISON_JOB": "A_2"},
+                        "round": {"COMPARISON_JOB": "A_2"},
                     },
                 }
             ],
@@ -709,3 +712,256 @@ def test_export_dynamic_with_conversation_level(mocker):
         project_id="project_id",
     )
     assert result == updated_expected_export
+
+
+def test_export_dynamic_with_completion_level(mocker):
+    updated_mock_json_interface = copy.deepcopy(mock_json_interface)
+
+    updated_mock_json_interface["jobs"].update(
+        {
+            "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL": {
+                "content": {
+                    "categories": {
+                        "TOO_SHORT": {"children": [], "name": "Too short", "id": "category1"},
+                        "JUST_RIGHT": {"children": [], "name": "Just right", "id": "category2"},
+                        "TOO_VERBOSE": {"children": [], "name": "Too verbose", "id": "category3"},
+                    },
+                    "input": "radio",
+                },
+                "instruction": "Verbosity",
+                "level": "completion",
+                "mlTask": "CLASSIFICATION",
+                "required": 0,
+                "isChild": False,
+                "isNew": False,
+            },
+            "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_1": {
+                "content": {
+                    "categories": {
+                        "NO_ISSUES": {"children": [], "name": "No issues", "id": "category4"},
+                        "MINOR_ISSUES": {
+                            "children": [],
+                            "name": "Minor issue(s)",
+                            "id": "category5",
+                        },
+                        "MAJOR_ISSUES": {
+                            "children": [],
+                            "name": "Major issue(s)",
+                            "id": "category6",
+                        },
+                    },
+                    "input": "radio",
+                },
+                "instruction": "Instructions Following",
+                "level": "completion",
+                "mlTask": "CLASSIFICATION",
+                "required": 0,
+                "isChild": False,
+                "isNew": False,
+            },
+            "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_2": {
+                "content": {
+                    "categories": {
+                        "NO_ISSUES": {"children": [], "name": "No issues", "id": "category7"},
+                        "MINOR_INACCURACY": {
+                            "children": [],
+                            "name": "Minor inaccuracy",
+                            "id": "category8",
+                        },
+                        "MAJOR_INACCURACY": {
+                            "children": [],
+                            "name": "Major inaccuracy",
+                            "id": "category9",
+                        },
+                    },
+                    "input": "radio",
+                },
+                "instruction": "Truthfulness",
+                "level": "completion",
+                "mlTask": "CLASSIFICATION",
+                "required": 0,
+                "isChild": False,
+                "isNew": False,
+            },
+            "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_3": {
+                "content": {
+                    "categories": {
+                        "NO_ISSUES": {"children": [], "name": "No issues", "id": "category10"},
+                        "MINOR_SAFETY_CONCERN": {
+                            "children": [],
+                            "name": "Minor safety concern",
+                            "id": "category11",
+                        },
+                        "MAJOR_SAFETY_CONCERN": {
+                            "children": [],
+                            "name": "Major safety concern",
+                            "id": "category12",
+                        },
+                    },
+                    "input": "radio",
+                },
+                "instruction": "Harmlessness/Safety",
+                "level": "completion",
+                "mlTask": "CLASSIFICATION",
+                "required": 0,
+                "isChild": False,
+                "isNew": False,
+            },
+        }
+    )
+
+    updated_mock_fetch_assets = copy.deepcopy(mock_fetch_assets)
+    updated_mock_fetch_assets[0]["labels"][0]["annotations"].extend(
+        [
+            {
+                "id": "20241209092703759-1",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhlc005a7tc9bx6f0mb5",
+                "annotationValue": {
+                    "categories": ["TOO_SHORT"],
+                    "id": "20241209092703759-1",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+            {
+                "id": "20241209092704576-2",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_1",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhlc005a7tc9bx6f0mb5",
+                "annotationValue": {
+                    "categories": ["MINOR_ISSUES"],
+                    "id": "20241209092704576-2",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+            {
+                "id": "20241209092705314-3",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_2",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhlc005a7tc9bx6f0mb5",
+                "annotationValue": {
+                    "categories": ["MAJOR_INACCURACY"],
+                    "id": "20241209092705314-3",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+            {
+                "id": "20241209092706381-4",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_3",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhlc005a7tc9bx6f0mb5",
+                "annotationValue": {
+                    "categories": ["MAJOR_SAFETY_CONCERN"],
+                    "id": "20241209092706381-4",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+            {
+                "id": "20241209092707543-5",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhm1005b7tc9b747clxw",
+                "annotationValue": {
+                    "categories": ["JUST_RIGHT"],
+                    "id": "20241209092707543-5",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+            {
+                "id": "20241209092710361-6",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_1",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhm1005b7tc9b747clxw",
+                "annotationValue": {
+                    "categories": ["NO_ISSUES"],
+                    "id": "20241209092710361-6",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+            {
+                "id": "20241209092711511-7",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_2",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhm1005b7tc9b747clxw",
+                "annotationValue": {
+                    "categories": ["NO_ISSUES"],
+                    "id": "20241209092711511-7",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+            {
+                "id": "20241209092713123-8",
+                "job": "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_3",
+                "path": [],
+                "labelId": "clzief6q2003e7tc91jm46uii",
+                "chatItemId": "clzieuhm1005b7tc9b747clxw",
+                "annotationValue": {
+                    "categories": ["NO_ISSUES"],
+                    "id": "20241209092713123-8",
+                    "isPrediction": False,
+                    "__typename": "ClassificationAnnotationValue",
+                },
+                "__typename": "ClassificationAnnotation",
+            },
+        ]
+    )
+
+    updated_expected_export = copy.deepcopy(expected_export)
+    updated_expected_export[0]["2"]["labels"][0]["label"]["completion"] = {
+        "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL": {
+            "clzieuhlc005a7tc9bx6f0mb5": ["TOO_SHORT"],
+            "clzieuhm1005b7tc9b747clxw": ["JUST_RIGHT"],
+        },
+        "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_1": {
+            "clzieuhlc005a7tc9bx6f0mb5": ["MINOR_ISSUES"],
+            "clzieuhm1005b7tc9b747clxw": ["NO_ISSUES"],
+        },
+        "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_2": {
+            "clzieuhlc005a7tc9bx6f0mb5": ["MAJOR_INACCURACY"],
+            "clzieuhm1005b7tc9b747clxw": ["NO_ISSUES"],
+        },
+        "CLASSIFICATION_JOB_AT_COMPLETION_LEVEL_3": {
+            "clzieuhlc005a7tc9bx6f0mb5": ["MAJOR_SAFETY_CONCERN"],
+            "clzieuhm1005b7tc9b747clxw": ["NO_ISSUES"],
+        },
+    }
+    get_project_return_val = {
+        "jsonInterface": updated_mock_json_interface,
+        "inputType": "LLM_INSTR_FOLLOWING",
+        "title": "Test project with classifications at completion level",
+        "id": "project_id",
+        "dataConnections": None,
+    }
+    kili_api_gateway = mocker.MagicMock()
+    kili_api_gateway.count_assets.return_value = 3
+    kili_api_gateway.get_project.return_value = get_project_return_val
+    kili_api_gateway.list_assets.return_value = updated_mock_fetch_assets
+
+    kili_llm = LlmClientMethods(kili_api_gateway)
+
+    result = kili_llm.export(
+        project_id="project_id",
+    )
+    assert result == updated_expected_export