 from typing import List, Dict, Any, Optional
+import os
+import copy

 from graphql_api.types import ExtendedSearch
 from submodules.model import Record, Attribute
     user_session,
     embedding,
     attribute,
+    general,
+    tokenization,
+    task_queue,
+    record_label_association,
 )
 from service.search import search
+from submodules.model import enums

+from controller.embedding import connector as embedding_connector
 from controller.record import neural_search_connector
 from controller.embedding import manager as embedding_manager
+from controller.tokenization import tokenization_service
 from util import daemon
+from util.miscellaneous_functions import chunk_list
+import time
+import traceback


 def get_record(project_id: str, record_id: str) -> Record:
@@ -113,3 +124,176 @@ def __reupload_embeddings(project_id: str) -> None: |

 def get_unique_values_by_attributes(project_id: str) -> Dict[str, List[str]]:
     return attribute.get_unique_values_by_attributes(project_id)
+
+
+def edit_records(
+    user_id: str, project_id: str, changes: Dict[str, Any]
+) -> Optional[List[str]]:
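+    """Applies the given value changes to records and keeps labels,
+    tokenization and embeddings in sync. Returns None on success or
+    a list of error messages."""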
+    prepped = __check_and_prep_edit_records(project_id, changes)
+    if "errors_found" in prepped:
+        return prepped["errors_found"]
+
+    records = prepped["records"]
+
+    for key in changes:
+        record = records[changes[key]["recordId"]]
+        # a new object is needed so the change is detected on commit
+        new_data = copy.deepcopy(record.data)
+        if "subKey" in changes[key]:
+            new_data[changes[key]["attributeName"]][changes[key]["subKey"]] = changes[
+                key
+            ]["newValue"]
+        else:
+            new_data[changes[key]["attributeName"]] = changes[key]["newValue"]
+        record.data = new_data
+    general.commit()
+
+    # remove labels that referenced the old attribute values
+    for chunk in chunk_list(prepped["rla_delete_tuples"], 1):
+        record_label_association.delete_by_record_attribute_tuples(project_id, chunk)
+
+    general.commit()
+
+    try:
+        # tokenization currently does a complete rebuild of the docbins of the
+        # touched records; this could be optimized by only rebuilding the changed
+        # record & attribute combinations and re-uploading them
+        tokenization.delete_record_docbins_by_id(project_id, records.keys(), True)
+        tokenization.delete_token_statistics_by_id(project_id, records.keys(), True)
+        tokenization_service.request_tokenize_project(project_id, user_id)
+        time.sleep(1)
+        # wait for tokenization to finish; the endpoint itself handles missing docbins
+        while tokenization.is_doc_bin_creation_running(project_id):
+            time.sleep(0.5)

+    except Exception:
+        __revert_record_data_changes(records, prepped["record_data_backup"])
+        print(traceback.format_exc(), flush=True)
+        return ["tokenization failed"]
+
+    try:
+        embedding_connector.request_re_embed_records(
+            project_id, prepped["embedding_rebuilds"]
+        )

+    except Exception:
+        __revert_record_data_changes(records, prepped["record_data_backup"])
+        print(traceback.format_exc(), flush=True)
+        return ["embedding failed"]
+
+    return None
+
+
+def __revert_record_data_changes(
+    records: Dict[str, Record], data_backup: Dict[str, Any]
+) -> None:
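+    # restore the data snapshots taken in __check_and_prep_edit_records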
+    for record_id in data_backup:
+        records[record_id].data = data_backup[record_id]
+    general.commit()
+
+
+def __check_and_prep_edit_records(
+    project_id: str, changes: Dict[str, Any]
+) -> Dict[str, Any]:
+    # key example: <record_id>@<attribute_name>[@<sub_key>]
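+    # illustrative shape of the changes dict (attribute names and values are made up):
+    # {
+    #     "<record_id>@headline": {
+    #         "recordId": "<record_id>",
+    #         "attributeName": "headline",
+    #         "newValue": "new text",
+    #     },
+    #     "<record_id>@tokens@2": {
+    #         "recordId": "<record_id>",
+    #         "attributeName": "tokens",
+    #         "subKey": 2,
+    #         "newValue": "new token",
+    #     },
+    # }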
+
+    errors_found = []  # list of strings
+    useable_embeddings = {}  # dict of UUID(attribute_id): [embedding_item]
+    attributes = None  # dict of attribute_name: attribute_item
+    records = None  # dict of str(record_id): record_item
+    record_data_backup = None  # dict of str(record_id): record_data
+    embedding_rebuilds = {}  # dict of str(embedding_id): [str(record_id)]
+    record_ids = {changes[key]["recordId"] for key in changes}
+    attribute_names = {changes[key]["attributeName"] for key in changes}
+
+    records = record.get_by_record_ids(project_id, record_ids)
+    if len(record_ids) != len(records):
+        errors_found.append("can't match record ids to project")
+    records = {str(r.id): r for r in records}
+
+    attributes = attribute.get_all_by_names(project_id, attribute_names)
+    if len(attribute_names) != len(attributes):
+        errors_found.append("can't match attributes to project")
+    attributes = {a.name: a for a in attributes}
+
+    tmp = [
+        f"sub_key {changes[key]['subKey']} out of bounds for attribute {changes[key]['attributeName']} of record {changes[key]['recordId']}"
+        for key in changes
+        if "subKey" in changes[key]
+        and changes[key]["subKey"]
+        >= len(records[changes[key]["recordId"]].data[changes[key]["attributeName"]])
+    ]
+    if tmp:
+        errors_found += tmp
+
+    # queued embeddings are not checked here since they haven't run yet and
+    # therefore have nothing that would need rebuilding
+    embeddings = embedding.get_all_by_attribute_ids(
+        project_id, [a.id for a in attributes.values()]
+    )
+    for embedding_item in embeddings:
+        if embedding_item.state == enums.EmbeddingState.FAILED.value:
+            # can be ignored since nothing exists to rebuild yet
+            continue
+
+        if embedding_item.state != enums.EmbeddingState.FINISHED.value:
+            errors_found.append(
+                f"embedding {embedding_item.name} is not finished. Wait for it to finish before editing records."
+            )
+            continue
+
+        emb_path = os.path.join(
+            "/inference", project_id, f"embedder-{str(embedding_item.id)}.pkl"
+        )
+        if not os.path.exists(emb_path):
+            errors_found.append(
+                f"can't find embedding PCA for {embedding_item.name}. Try rebuilding or removing the embeddings on the settings page."
+            )
+            continue
+        if embedding_item.attribute_id not in useable_embeddings:
+            useable_embeddings[embedding_item.attribute_id] = []
+        useable_embeddings[embedding_item.attribute_id].append(embedding_item)
+
+    if tokenization.is_doc_bin_creation_running(project_id):
+        errors_found.append(
+            "tokenization is currently running. Wait for it to finish before editing records."
+        )
+
+    if task_queue.get_by_tokenization(project_id) is not None:
+        errors_found.append(
+            "tokenization is currently queued. Wait for it to finish before editing records."
+        )
+
+    if errors_found:
+        return {"errors_found": errors_found}
+
+    record_data_backup = {str(r.id): copy.deepcopy(r.data) for r in records.values()}
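+    # collect (record_id, attribute_id) tuples for full-value changes of text
+    # attributes; their existing labels reference the old value and are removed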
+    rla_delete_tuples = [
+        (c["recordId"], str(attributes[c["attributeName"]].id))
+        for c in changes.values()
+        if "subKey" not in c
+        and attributes[c["attributeName"]].data_type == enums.DataTypes.TEXT.value
+    ]
+
+    if useable_embeddings:
+        for change in changes.values():
+            attribute_id = attributes[change["attributeName"]].id
+            if attribute_id not in useable_embeddings:
+                continue
+            for embedding_item in useable_embeddings[attribute_id]:
+                embedding_id = str(embedding_item.id)
+                if embedding_id not in embedding_rebuilds:
+                    embedding_rebuilds[embedding_id] = []
+                changed_record_info = {
+                    "record_id": change["recordId"],
+                    "attribute_name": change["attributeName"],
+                }
+                if "subKey" in change:
+                    changed_record_info["sub_key"] = change["subKey"]
+                embedding_rebuilds[embedding_id].append(changed_record_info)
+
+    return {
+        "records": records,
+        "record_data_backup": record_data_backup,
+        "rla_delete_tuples": rla_delete_tuples,
+        "embedding_rebuilds": embedding_rebuilds,
+    }