@@ -13,6 +13,7 @@
13 | 13 | from graphql.error.base import GraphQLError |
14 | 14 | from submodules.model import enums, events |
15 | 15 | from submodules.model.business_objects import ( |
| 16 | +    attribute,
16 | 17 |     information_source,
17 | 18 |     embedding,
18 | 19 |     labeling_task,
@@ -33,7 +34,10 @@
33 | 34 |     get_label_ids_by_names,
34 | 35 | ) |
35 | 36 | from submodules.model.business_objects.payload import get_max_token, get |
36 | | -from submodules.model.business_objects.tokenization import get_doc_bin_progress |
| 37 | +from submodules.model.business_objects.tokenization import ( |
| 38 | +    get_doc_bin_progress,
| 39 | +    get_doc_bin_table_to_json,
| 40 | +) |
37 | 41 | from submodules.model.models import ( |
38 | 42 |     InformationSource,
39 | 43 |     InformationSourceStatisticsExclusion,
@@ -694,3 +698,108 @@ def add_information_source_statistics_exclusion( |
694 | 698 |         if idx % 2 == 0
695 | 699 |     ]
696 | 700 |     general.add_all(exclusions, with_commit=True)
| 701 | + |
| 702 | + |
| 703 | +def prepare_sample_records_doc_bin( |
| 704 | +    project_id: str, information_source_id: str
| 705 | +) -> Tuple[str, list]:
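| | +    # builds a doc bin JSON for a small sample of records and uploads it to object
| | +    # storage; sample_records are DB rows (r[0] is the record id), hence the loose
| | +    # `list` annotation above rather than List[str]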
| 706 | +    sample_records = record.get_attribute_calculation_sample_records(project_id)
| 707 | +
| 708 | +    sample_records_doc_bin = get_doc_bin_table_to_json(
| 709 | +        project_id=project_id,
| 710 | +        missing_columns=get_missing_columns_tokenization(project_id),
| 711 | +        record_ids=[r[0] for r in sample_records],
| 712 | +    )
| 713 | +    project_item = project.get(project_id)
| 714 | +    org_id = str(project_item.organization_id)
| 715 | +    prefixed_doc_bin = f"{information_source_id}_doc_bin.json"
| 716 | +    s3.put_object(
| 717 | +        org_id,
| 718 | +        project_id + "/" + prefixed_doc_bin,
| 719 | +        sample_records_doc_bin,
| 720 | +    )
| 721 | + |
| 722 | +    return prefixed_doc_bin, sample_records
| 723 | + |
| 724 | + |
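| | +# a typical flow (sketch, not part of this diff): prepare the sample doc bin first,
| | +# then hand its name to the exec-env runner:
| | +#   prefixed_doc_bin, sample_records = prepare_sample_records_doc_bin(project_id, information_source_id)
| | +#   labels, logs, has_errors = run_labeling_function_exec_env(project_id, information_source_id, prefixed_doc_bin)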
| 725 | +def run_labeling_function_exec_env( |
| 726 | +    project_id: str, information_source_id: str, prefixed_doc_bin: str
| 727 | +) -> Tuple[dict, List[str], bool]:
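| | +    # runs the labeling function inside the exec-env container and returns the
| | +    # parsed result payload, the container log lines and an error flag; `dict` is
| | +    # used above because the payload comes from json.loads (flat List[str] logs)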
| 728 | + |
| 729 | +    information_source_item = information_source.get(project_id, information_source_id)
| 730 | +
| 731 | +    prefixed_function_name = f"{information_source_id}_fn"
| 732 | +    prefixed_payload = f"{information_source_id}_payload.json"
| 733 | +    prefixed_knowledge_base = f"{information_source_id}_knowledge"
| 734 | +    project_item = project.get(project_id)
| 735 | +    org_id = str(project_item.organization_id)
| 736 | + |
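| | +    # upload the function's source code and the project's knowledge bases so the
| | +    # exec env can download them via the access links built below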
| 737 | +    s3.put_object(
| 738 | +        org_id,
| 739 | +        project_id + "/" + prefixed_function_name,
| 740 | +        information_source_item.source_code,
| 741 | +    )
| 742 | +
| 743 | +    s3.put_object(
| 744 | +        org_id,
| 745 | +        project_id + "/" + prefixed_knowledge_base,
| 746 | +        knowledge_base.build_knowledge_base_from_project(project_id),
| 747 | +    )
| 748 | + |
| 749 | +    tokenization_progress = get_doc_bin_progress(project_id)
| 750 | + |
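| | +    # positional arguments consumed by the exec-env entrypoint; the order matters
| | +    # and must match what the lf-exec-env image expects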
| 751 | +    command = [
| 752 | +        s3.create_access_link(org_id, project_id + "/" + prefixed_doc_bin),
| 753 | +        s3.create_access_link(org_id, project_id + "/" + prefixed_function_name),
| 754 | +        s3.create_access_link(org_id, project_id + "/" + prefixed_knowledge_base),
| 755 | +        tokenization_progress,
| 756 | +        project_item.tokenizer_blank,
| 757 | +        s3.create_file_upload_link(org_id, project_id + "/" + prefixed_payload),
| 758 | +    ]
| 759 | + |
| 760 | +    container = client.containers.run(
| 761 | +        image=lf_exec_env_image,
| 762 | +        command=command,
| 763 | +        remove=True,
| 764 | +        detach=True,
| 765 | +        network=exec_env_network,
| 766 | +    )
| 767 | + |
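| | +    # stream the logs while the container runs; with remove=True the container is
| | +    # gone once it exits, so the logs have to be consumed here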
| 768 | +    container_logs = [
| 769 | +        line.decode("utf-8").strip("\n")
| 770 | +        for line in container.logs(
| 771 | +            stream=True, stdout=True, stderr=True, timestamps=True
| 772 | +        )
| 773 | +    ]
| 774 | + |
| 775 | +    code_has_errors = False
| 776 | + |
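| | +    # the exec env uploads its results via the file upload link; failing to fetch
| | +    # or parse the payload is treated as an error in the user's function code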
| 777 | +    try:
| 778 | +        payload = s3.get_object(org_id, project_id + "/" + prefixed_payload)
| 779 | +        calculated_labels = json.loads(payload)
| 780 | +    except Exception:
| 781 | +        print("Could not grab data from s3 -- labeling function")
| 782 | +        code_has_errors = True
| 783 | +        calculated_labels = {}
| 784 | + |
| 785 | +    if prefixed_doc_bin != "docbin_full":
| 786 | +        # sample records doc bin is temporary and deleted after the calculation;
| 787 | +        # "docbin_full" is the shared full doc bin and must be kept
| 788 | +        s3.delete_object(org_id, project_id + "/" + prefixed_doc_bin)
| 789 | +    s3.delete_object(org_id, project_id + "/" + prefixed_function_name)
| 790 | +    s3.delete_object(org_id, project_id + "/" + prefixed_payload)
| 791 | +    s3.delete_object(org_id, project_id + "/" + prefixed_knowledge_base)
| 791 | + |
| 792 | +    return calculated_labels, container_logs, code_has_errors
| 793 | + |
| 794 | + |
| 795 | +def get_missing_columns_tokenization(project_id: str) -> str: |
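| | +    # builds the SQL fragment that selects all non-text attributes directly from
| | +    # the record's JSON data (text attributes live in the doc bin instead)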
| 796 | +    missing_columns = [
| 797 | +        attribute_item.name
| 798 | +        for attribute_item in attribute.get_all(project_id)
| 799 | +        if attribute_item.data_type != enums.DataTypes.TEXT.value
| 800 | +    ]
| 801 | +    missing_columns_str = ",\n".join(
| 802 | +        [f"'{k}',r.data->'{k}'" for k in missing_columns]
| 803 | +    )
| 804 | + |
| 805 | +    return missing_columns_str