code-kern-ai
diff --git a/‎alembic/versions/1a25c862801f_adds_agreements_gdpr_flags_and_platform_.py‎
Lines changed: 62 additions & 0 deletions b/‎alembic/versions/1a25c862801f_adds_agreements_gdpr_flags_and_platform_.py‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎api/transfer.py‎
Lines changed: 15 additions & 88 deletions b/‎api/transfer.py‎
Lines changed: 15 additions & 88 deletions
diff --git a/‎controller/embedding/connector.py‎
Lines changed: 4 additions & 20 deletions b/‎controller/embedding/connector.py‎
Lines changed: 4 additions & 20 deletions
@@ -0,0 +1,62 @@
+"""Adds agreements, gdpr flags and platform support for embeddings
+
+Revision ID: 1a25c862801f
+Revises: 03d19eada266
+Create Date: 2023-06-06 13:58:16.634066
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '1a25c862801f'
+down_revision = '03d19eada266'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Apply revision 1a25c862801f.
+
+    Creates the ``agreement`` table and its indexes, adds the columns
+    ``created_by``, ``api_token``, ``model`` and ``platform`` to ``embedding``
+    (``created_by`` is indexed and references ``user.id`` with
+    ``ON DELETE CASCADE``), and adds ``gdpr_compliant`` to ``organization``.
+    """
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('agreement',
+    sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+    sa.Column('project_id', postgresql.UUID(as_uuid=True), nullable=True),
+    sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+    sa.Column('xfkey', postgresql.UUID(as_uuid=True), nullable=True),
+    sa.Column('xftype', sa.String(), nullable=True),
+    sa.Column('terms_text', sa.String(), nullable=True),
+    sa.Column('terms_accepted', sa.Boolean(), nullable=True),
+    sa.Column('created_at', sa.DateTime(), nullable=True),
+    sa.ForeignKeyConstraint(['project_id'], ['project.id'], ondelete='CASCADE'),
+    sa.ForeignKeyConstraint(['user_id'], ['user.id'], ondelete='CASCADE'),
+    sa.PrimaryKeyConstraint('id')
+    )
+    op.create_index(op.f('ix_agreement_project_id'), 'agreement', ['project_id'], unique=False)
+    op.create_index(op.f('ix_agreement_user_id'), 'agreement', ['user_id'], unique=False)
+    op.create_index(op.f('ix_agreement_xfkey'), 'agreement', ['xfkey'], unique=False)
+    op.create_index(op.f('ix_agreement_xftype'), 'agreement', ['xftype'], unique=False)
+    op.add_column('embedding', sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True))
+    op.add_column('embedding', sa.Column('api_token', sa.String(), nullable=True))
+    op.add_column('embedding', sa.Column('model', sa.String(), nullable=True))
+    op.add_column('embedding', sa.Column('platform', sa.String(), nullable=True))
+    op.create_index(op.f('ix_embedding_created_by'), 'embedding', ['created_by'], unique=False)
+    # Autogenerate emitted ``None`` as the constraint name, which leaves
+    # downgrade() without a name to drop. The explicit name below is the one
+    # PostgreSQL assigns by default to an unnamed foreign key on
+    # embedding.created_by; op.f() marks it as final so that no naming
+    # convention is applied on top of it.
+    # NOTE(review): assumes no metadata naming convention was in effect when
+    # this revision was first applied -- confirm against existing databases.
+    op.create_foreign_key(op.f('embedding_created_by_fkey'), 'embedding', 'user', ['created_by'], ['id'], ondelete='CASCADE')
+    op.add_column('organization', sa.Column('gdpr_compliant', sa.Boolean(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Revert revision 1a25c862801f.
+
+    Undoes upgrade() in reverse order: drops ``organization.gdpr_compliant``,
+    the foreign key, index and added columns on ``embedding``, and finally the
+    ``agreement`` indexes and table.
+    """
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('organization', 'gdpr_compliant')
+    # drop_constraint() needs a concrete name; this must stay in sync with the
+    # name passed to create_foreign_key() in upgrade().
+    op.drop_constraint(op.f('embedding_created_by_fkey'), 'embedding', type_='foreignkey')
+    op.drop_index(op.f('ix_embedding_created_by'), table_name='embedding')
+    op.drop_column('embedding', 'platform')
+    op.drop_column('embedding', 'model')
+    op.drop_column('embedding', 'api_token')
+    op.drop_column('embedding', 'created_by')
+    op.drop_index(op.f('ix_agreement_xftype'), table_name='agreement')
+    op.drop_index(op.f('ix_agreement_xfkey'), table_name='agreement')
+    op.drop_index(op.f('ix_agreement_user_id'), table_name='agreement')
+    op.drop_index(op.f('ix_agreement_project_id'), table_name='agreement')
+    op.drop_table('agreement')
+    # ### end Alembic commands ###
@@ -8,6 +8,7 @@
 from controller.embedding import connector as embedding_connector
 from starlette.endpoints import HTTPEndpoint
 from starlette.responses import PlainTextResponse, JSONResponse
+from controller.embedding.manager import recreate_embeddings
 
 from controller.transfer.labelstudio import import_preperator
 from submodules.s3 import controller as s3
@@ -234,7 +235,12 @@ def init_file_import(task: UploadTask, project_id: str, is_global_update: bool)
             import_preperator.prepare_label_studio_import(project_id, task)
         else:
             transfer_manager.import_records_from_file(project_id, task)
-        calculate_missing_attributes(project_id, task.user_id)
+        daemon.run(
+            __recalculate_missing_attributes_and_embeddings,
+            project_id,
+            str(task.user_id),
+        )
+
     elif "project" in task.file_type:
         transfer_manager.import_project(project_id, task)
     elif "knowledge_base" in task.file_type:
@@ -284,12 +290,9 @@ def file_import_error_handling(
     )
 
 
-def calculate_missing_attributes(project_id: str, user_id: str) -> None:
-    daemon.run(
-        __calculate_missing_attributes,
-        project_id,
-        user_id,
-    )
+def __recalculate_missing_attributes_and_embeddings(project_id: str, user_id: str) -> None:
+    """Recalculate missing attributes, then recreate the project's embeddings.
+
+    The two steps run sequentially in the calling thread; the record-import
+    path schedules this function through ``daemon.run``.
+
+    Args:
+        project_id: Project to process; passed to both steps.
+        user_id: Only forwarded to the attribute calculation.
+    """
+    __calculate_missing_attributes(project_id, user_id)
+    # Embeddings are recreated only after the attribute calculation returns.
+    recreate_embeddings(project_id)
 
 
 def __calculate_missing_attributes(project_id: str, user_id: str) -> None:
@@ -305,6 +308,7 @@ def __calculate_missing_attributes(project_id: str, user_id: str) -> None:
     )
     if len(attributes_usable) == 0:
         return
+
     # stored as list so connection results do not affect
     attribute_ids = [str(att_usable.id) for att_usable in attributes_usable]
     for att_id in attribute_ids:
@@ -313,7 +317,6 @@ def __calculate_missing_attributes(project_id: str, user_id: str) -> None:
     notification.send_organization_update(
         project_id=project_id, message="calculate_attribute:started:all"
     )
-
     try:
         # first check project tokenization completed
         i = 0
@@ -323,7 +326,7 @@ def __calculate_missing_attributes(project_id: str, user_id: str) -> None:
                 i = 0
                 ctx_token = general.remove_and_refresh_session(ctx_token, True)
             if tokenization.is_doc_bin_creation_running(project_id):
-                time.sleep(5)
+                time.sleep(2)
                 continue
             else:
                 break
@@ -350,15 +353,15 @@ def __calculate_missing_attributes(project_id: str, user_id: str) -> None:
                 if tokenization.is_doc_bin_creation_running_for_attribute(
                     project_id, current_att.name
                 ):
-                    time.sleep(5)
+                    time.sleep(2)
                     continue
                 else:
                     attribute_ids.pop(0)
                     notification.send_organization_update(
                         project_id=project_id,
                         message=f"calculate_attribute:finished:{current_att_id}",
                     )
-            time.sleep(5)
+            time.sleep(2)
     except Exception as e:
         print(
             f"Error while recreating attribute calculation for {project_id} when new records are uploaded : {e}"
@@ -381,80 +384,4 @@ def __calculate_missing_attributes(project_id: str, user_id: str) -> None:
             message="calculate_attribute:finished:all",
         )
         general.remove_and_refresh_session(ctx_token, False)
-        calculate_missing_embedding_tensors(project_id, user_id)
-
-
-def calculate_missing_embedding_tensors(project_id: str, user_id: str) -> None:
-    daemon.run(
-        __calculate_missing_embedding_tensors,
-        project_id,
-        user_id,
-    )
-
-
-def __calculate_missing_embedding_tensors(project_id: str, user_id: str) -> None:
-    ctx_token = general.get_ctx_token()
-    embeddings = embedding.get_finished_embeddings_by_started_at(project_id)
-    if len(embeddings) == 0:
-        return
-
-    embedding_ids = [str(embed.id) for embed in embeddings]
-    for embed_id in embedding_ids:
-        embedding.update_embedding_state_waiting(project_id, embed_id)
-    general.commit()
-
-    try:
-        ctx_token = __create_embeddings(project_id, embedding_ids, user_id, ctx_token)
-    except Exception as e:
-        print(
-            f"Error while recreating embeddings for {project_id} when new records are uploaded : {e}"
-        )
-        get_waiting_embeddings = embedding.get_waiting_embeddings(project_id)
-        for embed in get_waiting_embeddings:
-            embedding.update_embedding_state_failed(project_id, str(embed.id))
-        general.commit()
-    finally:
-        notification.send_organization_update(
-            project_id=project_id, message="embedding:finished:all"
-        )
-        general.remove_and_refresh_session(ctx_token, False)
-
-
-def __create_embeddings(
-    project_id: str,
-    embedding_ids: List[str],
-    user_id: str,
-    ctx_token: Any,
-) -> Any:
-    notification.send_organization_update(
-        project_id=project_id, message="embedding:started:all"
-    )
-    for embedding_id in embedding_ids:
-        ctx_token = general.remove_and_refresh_session(ctx_token, request_new=True)
-        embedding_item = embedding.get(project_id, embedding_id)
-        if not embedding_item:
-            continue
-
-        embedding_connector.request_deleting_embedding(project_id, embedding_id)
-
-        attribute_id = str(embedding_item.attribute_id)
-        attribute_name = attribute.get(project_id, attribute_id).name
-        if embedding_item.type == enums.EmbeddingType.ON_ATTRIBUTE.value:
-            prefix = f"{attribute_name}-classification-"
-            config_string = embedding_item.name[len(prefix) :]
-            embedding_connector.request_creating_attribute_level_embedding(
-                project_id, attribute_id, user_id, config_string
-            )
-        else:
-            prefix = f"{attribute_name}-extraction-"
-            config_string = embedding_item.name[len(prefix) :]
-            embedding_connector.request_creating_token_level_embedding(
-                project_id, attribute_id, user_id, config_string
-            )
-        time.sleep(5)
-        while embedding_util.has_encoder_running(project_id):
-            if embedding_item.state == enums.EmbeddingState.WAITING.value:
-                break
-            time.sleep(1)
-    return ctx_token
-
+        
@@ -10,29 +10,13 @@ def request_listing_recommended_encoders() -> Any:
     url = f"{BASE_URI}/classification/recommend/TEXT"  # TODO does here have to be a data type?
     return service_requests.get_call_or_raise(url)
 
-
-def request_creating_attribute_level_embedding(
-    project_id: str, attribute_id: str, user_id: str, config_string: str
-) -> Any:
-    url = f"{BASE_URI}/classification/encode"
-    data = {
-        "project_id": str(project_id),
-        "attribute_id": str(attribute_id),
-        "user_id": str(user_id),
-        "config_string": config_string,
-    }
-    return service_requests.post_call_or_raise(url, data)
-
-
-def request_creating_token_level_embedding(
-    project_id: str, attribute_id: str, user_id: str, config_string: str
+def request_embedding(
+    project_id: str, embedding_id: str
 ) -> Any:
-    url = f"{BASE_URI}/extraction/encode"
+    url = f"{BASE_URI}/embed"
     data = {
         "project_id": str(project_id),
-        "attribute_id": str(attribute_id),
-        "user_id": str(user_id),
-        "config_string": config_string,
+        "embedding_id": str(embedding_id),
     }
     return service_requests.post_call_or_raise(url, data)