
Commit 5ba1e12

Embedding list prep (#148)

* Embedding lists

1 parent 7118ae7 commit 5ba1e12

File tree

8 files changed, +70 -21 lines changed
Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+"""adds embedding tensor subkey
+
+Revision ID: 53c561be097d
+Revises: 0714589d508e
+Create Date: 2023-08-28 08:50:20.167644
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '53c561be097d'
+down_revision = '0714589d508e'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('embedding_tensor', sa.Column('sub_key', sa.Integer(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('embedding_tensor', 'sub_key')
+    # ### end Alembic commands ###
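
For orientation, a minimal sketch of the migrated table as a SQLAlchemy model; the class below, its name, and the extra columns are assumptions for illustration, not the repository's actual model definition:

# Minimal sketch, assuming a SQLAlchemy model mirroring the migrated table.
import sqlalchemy as sa
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class EmbeddingTensor(Base):  # hypothetical model name
    __tablename__ = "embedding_tensor"

    id = sa.Column(sa.Integer, primary_key=True)
    # new in this commit: position of the tensor inside an embedding list;
    # nullable so existing single-embedding rows stay valid (sub_key = NULL)
    sub_key = sa.Column(sa.Integer, nullable=True)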

controller/attribute/manager.py

Lines changed: 23 additions & 2 deletions
@@ -2,14 +2,20 @@
 from controller.tokenization.tokenization_service import (
     request_reupload_docbins,
 )
+import json
 from submodules.model.business_objects import (
     attribute,
     record,
     tokenization,
     general,
 )
 from submodules.model.models import Attribute
-from submodules.model.enums import AttributeState, DataTypes, RecordTokenizationScope
+from submodules.model.enums import (
+    AttributeState,
+    DataTypes,
+    RecordTokenizationScope,
+    AttributeVisibility,
+)
 from util import daemon, notification

 from controller.task_queue import manager as task_queue_manager

@@ -68,6 +74,9 @@ def create_user_attribute(project_id: str, name: str, data_type: str) -> Attribute
         relative_position = 1
     else:
         relative_position = prev_relative_position + 1
+    visibility = None  # default
+    if data_type == DataTypes.EMBEDDING_LIST.value:
+        visibility = AttributeVisibility.HIDE.value

     attribute_item: Attribute = attribute.create(
         project_id,

@@ -77,6 +86,7 @@ def create_user_attribute(project_id: str, name: str, data_type: str) -> Attribute
         is_primary_key=False,
         user_created=True,
         state=AttributeState.INITIAL.value,
+        visibility=visibility,
         with_commit=True,
     )
     notification.send_organization_update(

@@ -355,4 +365,15 @@ def calculate_user_attribute_sample_records(
     calculated_attributes = util.run_attribute_calculation_exec_env(
         attribute_id=attribute_id, project_id=project_id, doc_bin=doc_bin_samples
     )
-    return list(calculated_attributes.keys()), list(calculated_attributes.values())
+    values = None
+    if (
+        attribute.get(project_id, attribute_id).data_type
+        == DataTypes.EMBEDDING_LIST.value
+    ):
+        # values are JSON-serialized so they can be easily transferred to the frontend.
+        # Since the return type is a list of strings, without json.dumps a str(xxxx)
+        # would be called, which can't be reliably deserialized if special characters
+        # are in the string.
+        values = [json.dumps(v) for v in list(calculated_attributes.values())]
+    else:
+        values = list(calculated_attributes.values())
+    return list(calculated_attributes.keys()), values
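
The serialization branch above is worth unpacking: for embedding lists each sample value is presumably itself a Python list, and str() on a list produces repr-style quoting that is not valid JSON. A small standalone illustration (the sample value is invented):

import json

# a hypothetical embedding-list sample value containing special characters
value = ["it's", 'a "quoted" token']

print(str(value))         # ["it's", 'a "quoted" token'] -> repr quoting, not JSON
print(json.dumps(value))  # ["it's", "a \"quoted\" token"] -> valid JSON

json.loads(json.dumps(value))  # round-trips cleanly
# json.loads(str(value)) would raise json.JSONDecodeError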

controller/record/manager.py

Lines changed: 2 additions & 1 deletion
@@ -25,9 +25,10 @@ def get_records_by_similarity_search(
     embedding_id: str,
     record_id: str,
     att_filter: Optional[List[Dict[str, Any]]] = None,
+    record_sub_key: Optional[int] = None,
 ) -> ExtendedSearch:
     record_ids = neural_search_connector.request_most_similar_record_ids(
-        project_id, embedding_id, record_id, 100, att_filter
+        project_id, embedding_id, record_id, 100, att_filter, record_sub_key
     )
     if not len(record_ids):
         record_ids = [record_id]

controller/record/neural_search_connector.py

Lines changed: 3 additions & 0 deletions
@@ -12,8 +12,11 @@ def request_most_similar_record_ids(
     record_id: str,
     limit: int,
     att_filter: Optional[List[Dict[str, Any]]] = None,
+    record_sub_key: Optional[int] = None,
 ) -> List[str]:
     url = f"{BASE_URI}/most_similar?project_id={project_id}&embedding_id={embedding_id}&record_id={record_id}&limit={limit}"
+    if record_sub_key is not None:
+        url += f"&record_sub_key={record_sub_key}"

     result = service_requests.post_call_or_raise(url, att_filter)
     return result
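
To make the wire format concrete, a hedged sketch of the URL this builds when a sub key is supplied; BASE_URI and all IDs below are invented placeholders:

# Illustrative only: mirrors the connector's URL construction above.
BASE_URI = "http://neural-search:80"  # placeholder service address

url = (
    f"{BASE_URI}/most_similar"
    "?project_id=p-123&embedding_id=e-456&record_id=r-789&limit=100"
)
record_sub_key = 2
if record_sub_key is not None:
    url += f"&record_sub_key={record_sub_key}"

print(url)
# http://neural-search:80/most_similar?project_id=p-123&embedding_id=e-456&record_id=r-789&limit=100&record_sub_key=2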

controller/transfer/project_transfer_manager.py

Lines changed: 9 additions & 15 deletions
@@ -150,30 +150,20 @@ def import_file(
     send_progress_update_throttle(project_id, task_id, 0)
     project_item = project.get(project_id)
     if not project_item.name:
-        project_item.name = data.get(
-            "project_details_data",
-        ).get(
+        project_item.name = data.get("project_details_data",).get(
             "name",
         )
-    project_item.description = data.get(
-        "project_details_data",
-    ).get(
+    project_item.description = data.get("project_details_data",).get(
         "description",
     )
-    project_item.tokenizer = data.get(
-        "project_details_data",
-    ).get(
+    project_item.tokenizer = data.get("project_details_data",).get(
         "tokenizer",
     )
-    spacy_language = data.get(
-        "project_details_data",
-    ).get(
+    spacy_language = data.get("project_details_data",).get(
         "tokenizer",
     )[:2]
     project_item.tokenizer_blank = spacy_language
-    project_item.status = data.get(
-        "project_details_data",
-    ).get(
+    project_item.status = data.get("project_details_data",).get(
         "status",
     )
     old_project_id = data.get(

@@ -429,6 +419,9 @@ def __transform_embedding_by_name(embedding_name: str):
         data=embedding_tensor_item.get(
             "data",
         ),
+        sub_key=embedding_tensor_item.get(
+            "sub_key",
+        ),
     )

 def __replace_embedding_name(

@@ -1278,6 +1271,7 @@ def get_project_export_dump(
                 "embedding_id": str(embedding_tensor_item[0]),
                 "record_id": str(embedding_tensor_item[1]),
                 "data": embedding_tensor_item[2],
+                "sub_key": embedding_tensor_item[3],
             }
             for embedding_tensor_item in embedding_tensors
         ]
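
Putting the export and import sides together, a hedged sketch of one embedding-tensor entry in a project dump after this change; all values are invented and real dumps carry full UUIDs and tensors:

# Illustrative export entry only; field names follow the dump code above.
embedding_tensor_entry = {
    "embedding_id": "0f8b-placeholder",
    "record_id": "a3c1-placeholder",
    "data": [0.12, -0.98, 0.44],
    "sub_key": 0,  # presumably None for classic (non-list) embeddings
}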

graphql_api/query/record.py

Lines changed: 3 additions & 1 deletion
@@ -46,6 +46,7 @@ class RecordQuery(graphene.ObjectType):
         embedding_id=graphene.ID(required=True),
         record_id=graphene.ID(required=True),
         att_filter=graphene.JSONString(required=False),
+        record_sub_key=graphene.Int(required=False),  # only for embedding lists
     )

     tokenize_record = graphene.Field(

@@ -112,12 +113,13 @@ def resolve_search_records_by_similarity(
         embedding_id: str,
         record_id: str,
         att_filter: Optional[List[Dict[str, Any]]] = None,
+        record_sub_key: Optional[int] = None,
     ) -> ExtendedSearch:
         auth.check_demo_access(info)
         auth.check_project_access(info, project_id)
         user_id = auth.get_user_by_info(info).id
         return manager.get_records_by_similarity_search(
-            project_id, user_id, embedding_id, record_id, att_filter
+            project_id, user_id, embedding_id, record_id, att_filter, record_sub_key
         )

     def resolve_tokenize_record(self, info, record_id: str) -> TokenizedRecord:
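
From the API side, a hedged sketch of how a caller might exercise the new argument by invoking the manager directly; the leading positional parameters are inferred from the resolver call above, and all IDs are placeholders:

# Hypothetical direct call into controller/record/manager; illustrative only.
extended_search = manager.get_records_by_similarity_search(
    "project-id",    # project_id
    "user-id",       # user_id
    "embedding-id",  # embedding_id
    "record-id",     # record_id
    None,            # att_filter
    0,               # record_sub_key: anchor on the first embedding-list entry (assumption)
)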

submodules/s3

Submodule s3 updated 1 file
