
Commit 3777401

Adds basic queue (#125)
* Adds basic queue for embeddings
* Adds queue for information sources
* Removes user object from doc ock to prevent session issues
* Adds tokenization queue
* Adds attribute calculation
* Comments
* Adds attribute tokenization to queueing logic
* PR comments
* Better error message
* Adds PR comments
* Changes condition for tokenization start task

---------

Co-authored-by: FelixKirschKern <[email protected]>
1 parent 9b4d961 commit 3777401

43 files changed: +922 −89 lines
Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+"""Adds task queue table
+
+Revision ID: bb87177d46b5
+Revises: 546e5cd7feaa
+Create Date: 2023-04-26 10:03:46.597003
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "bb87177d46b5"
+down_revision = "546e5cd7feaa"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "task_queue",
+        sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column("task_type", sa.String(), nullable=True),
+        sa.Column("task_info", sa.JSON(), nullable=True),
+        sa.Column("priority", sa.Boolean(), nullable=True),
+        sa.Column("is_active", sa.Boolean(), nullable=True),
+        sa.Column("created_at", sa.DateTime(), nullable=True),
+        sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True),
+        sa.ForeignKeyConstraint(
+            ["created_by"],
+            ["user.id"],
+        ),
+        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("task_queue")
+    # ### end Alembic commands ###
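
For orientation, a row in the new task_queue table carries an id, an optional project reference, a task type, a JSON task_info payload, a priority flag, an is_active flag, and creation metadata. Mirrored as an ORM model it would look roughly like the sketch below; the class name, module placement, and defaults are assumptions for illustration only (the actual model lives in the shared submodules.model package and is not part of this diff).

# Hypothetical sketch only - mirrors the columns of the "task_queue" table created above.
import datetime
import uuid

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class TaskQueue(Base):  # assumed class name, not taken from this commit
    __tablename__ = "task_queue"

    id = sa.Column(postgresql.UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    project_id = sa.Column(
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey("project.id", ondelete="CASCADE"),
    )
    task_type = sa.Column(sa.String)  # e.g. TaskType.TOKENIZATION.value
    task_info = sa.Column(sa.JSON)  # task-specific payload, see the add_task calls below
    priority = sa.Column(sa.Boolean)
    is_active = sa.Column(sa.Boolean)
    created_at = sa.Column(sa.DateTime, default=datetime.datetime.utcnow)
    created_by = sa.Column(postgresql.UUID(as_uuid=True), sa.ForeignKey("user.id"))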

api/project.py

Lines changed: 14 additions & 2 deletions

@@ -12,6 +12,9 @@
 from submodules.s3.controller import bucket_exists, create_bucket
 from util import doc_ock, notification, adapter

+from controller.task_queue import manager as task_queue_manager
+from submodules.model.enums import TaskType, RecordTokenizationScope
+
 logging.basicConfig(level=logging.DEBUG)


@@ -74,13 +77,22 @@ async def post(self, request_body) -> JSONResponse:
             user_id=user.id, project_id=project.id, file_name=name, data=data
         )

-        tokenization_service.request_tokenize_project(str(project.id), str(user.id))
+        task_queue_manager.add_task(
+            str(project.id),
+            TaskType.TOKENIZATION,
+            str(user.id),
+            {
+                "scope": RecordTokenizationScope.PROJECT.value,
+                "include_rats": True,
+                "only_uploaded_attributes": False,
+            },
+        )

         notification.send_organization_update(
             project.id, f"project_created:{str(project.id)}", True
         )
         doc_ock.post_event(
-            user,
+            str(user.id),
             events.CreateProject(Name=f"{name}-{project.id}", Description=description),
         )

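
The controller/task_queue/manager.py module itself is not part of this excerpt. Judging only from the call sites in this commit, add_task takes a project id, a TaskType enum member, a user id, and a JSON-serializable task_info dict; a minimal interface sketch under that assumption (the body is intentionally a stub, not the actual implementation):

# Inferred interface sketch only - the real manager module is not shown in this diff.
from typing import Any, Dict

from submodules.model.enums import TaskType


def add_task(
    project_id: str,
    task_type: TaskType,
    user_id: str,
    task_info: Dict[str, Any],
) -> None:
    # Presumably persists a row in the new task_queue table (task_type, task_info,
    # created_by, ...) so a queue worker can pick it up later; details are assumptions.
    ...

The task_info dict varies with the scope: project-wide tokenization passes scope, include_rats and only_uploaded_attributes (as above), while attribute-scoped tokenization additionally passes an attribute_id (see controller/attribute/manager.py below).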

api/transfer.py

Lines changed: 15 additions & 6 deletions

@@ -10,7 +10,6 @@
 from starlette.responses import PlainTextResponse, JSONResponse

 from controller.transfer.labelstudio import import_preperator
-from submodules.model.business_objects.tokenization import is_doc_bin_creation_running
 from submodules.s3 import controller as s3
 from submodules.model.business_objects import (
     attribute,
@@ -31,10 +30,13 @@

 from submodules.model import enums, exceptions
 from util.notification import create_notification
-from submodules.model.enums import AttributeState, NotificationType, UploadStates
-from submodules.model.models import Embedding, UploadTask
+from submodules.model.enums import NotificationType
+from submodules.model.models import UploadTask
 from util import daemon, notification
-from controller.tokenization import tokenization_service
+
+from controller.task_queue import manager as task_queue_manager
+from submodules.model.enums import TaskType, RecordTokenizationScope
+

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -247,8 +249,15 @@ def init_file_import(task: UploadTask, project_id: str, is_global_update: bool)
     )
     if task.file_type != "knowledge_base":
         only_usable_attributes = task.file_type == "records_add"
-        tokenization_service.request_tokenize_project(
-            project_id, str(task.user_id), True, only_usable_attributes
+        task_queue_manager.add_task(
+            project_id,
+            TaskType.TOKENIZATION,
+            task.user_id,
+            {
+                "scope": RecordTokenizationScope.PROJECT.value,
+                "include_rats": True,
+                "only_uploaded_attributes": only_usable_attributes,
+            },
         )


app.py

Lines changed: 4 additions & 0 deletions

@@ -18,6 +18,8 @@
 from starlette.routing import Route

 from graphql_api import schema
+from controller.task_queue.task_queue import init_task_queue
+

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -48,3 +50,5 @@
 middleware = [Middleware(DatabaseSessionHandler)]

 app = Starlette(routes=routes, middleware=middleware)
+
+init_task_queue()
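
init_task_queue() is invoked once at import time, right after the Starlette app is constructed. Its implementation is not shown in this commit; a plausible reading, given the new task_queue table, is that it resumes tasks that were persisted but not finished before a restart. Purely as an illustration (everything below except the function name is an assumption):

# Hypothetical sketch - not the actual controller/task_queue/task_queue.py.
def init_task_queue() -> None:
    # e.g. load rows from task_queue that are still marked is_active, ordered by
    # priority and created_at, and hand each one back to its task-type handler.
    ...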

controller/attribute/manager.py

Lines changed: 26 additions & 8 deletions

@@ -1,14 +1,19 @@
 from typing import List, Tuple
 from controller.tokenization.tokenization_service import (
-    request_tokenize_calculated_attribute,
-    request_tokenize_project,
     request_reupload_docbins,
 )
-from submodules.model.business_objects import attribute, record, tokenization, general
+from submodules.model.business_objects import (
+    attribute,
+    record,
+    tokenization,
+    general,
+)
 from submodules.model.models import Attribute
-from submodules.model.enums import AttributeState, DataTypes
+from submodules.model.enums import AttributeState, DataTypes, RecordTokenizationScope
 from util import daemon, notification

+from controller.task_queue import manager as task_queue_manager
+from submodules.model.enums import TaskType
 from . import util
 from sqlalchemy import sql

@@ -141,10 +146,15 @@ def add_running_id(
         project_id, attribute_name, for_retokenization, with_commit=True
     )
     if for_retokenization:
-        daemon.run(
-            request_tokenize_project,
+        task_queue_manager.add_task(
             project_id,
+            TaskType.TOKENIZATION,
             user_id,
+            {
+                "scope": RecordTokenizationScope.PROJECT.value,
+                "include_rats": True,
+                "only_uploaded_attributes": False,
+            },
         )


@@ -261,9 +271,17 @@ def __calculate_user_attribute_all_records(
         project_id, attribute_id, "Triggering tokenization."
     )
     try:
-        request_tokenize_calculated_attribute(
-            project_id, user_id, attribute_item.id, include_rats
+        task_queue_manager.add_task(
+            project_id,
+            TaskType.TOKENIZATION,
+            user_id,
+            {
+                "scope": RecordTokenizationScope.ATTRIBUTE.value,
+                "attribute_id": str(attribute_item.id),
+                "include_rats": include_rats,
+            },
         )
+
     except Exception:
         record.delete_user_created_attribute(
             project_id=project_id,

controller/embedding/manager.py

Lines changed: 23 additions & 0 deletions

@@ -7,6 +7,7 @@
 from . import connector
 from controller.misc import manager as misc
 from controller.model_provider import manager as model_manager
+from submodules.model.business_objects import attribute


 def get_recommended_encoders() -> List[Any]:
@@ -112,3 +113,25 @@ def __embed_one_by_one_helper(
     time.sleep(5)
     while util.has_encoder_running(project_id):
         time.sleep(5)
+
+
+def get_embedding_name(
+    project_id: str, attribute_id: str, level: str, embedding_handle: str
+) -> str:
+    if level not in [
+        enums.EmbeddingType.ON_ATTRIBUTE.value,
+        enums.EmbeddingType.ON_TOKEN.value,
+    ]:
+        raise ValueError("level must be either attribute or token")
+    embedding_type = (
+        "classification"
+        if level == enums.EmbeddingType.ON_ATTRIBUTE.value
+        else "extraction"
+    )
+
+    attribute_item = attribute.get(project_id, attribute_id)
+    if attribute_item is None:
+        raise ValueError("attribute not found")
+    attribute_name = attribute_item.name
+
+    return f"{attribute_name}-{embedding_type}-{embedding_handle}"
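
The new helper derives an embedding's name from the attribute it is built on, the embedding level (attribute vs. token), and the model handle. A small usage sketch; the UUIDs and handle below are made-up placeholder values, not values from this commit:

# Example only - placeholder arguments.
name = get_embedding_name(
    project_id="00000000-0000-0000-0000-000000000001",  # placeholder project UUID
    attribute_id="00000000-0000-0000-0000-000000000002",  # placeholder attribute UUID
    level=enums.EmbeddingType.ON_ATTRIBUTE.value,
    embedding_handle="distilbert-base-uncased",
)
# -> "<attribute name>-classification-distilbert-base-uncased"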

controller/notification/notification_data.py

Lines changed: 7 additions & 0 deletions

@@ -158,6 +158,13 @@
         "page": enums.Pages.SETTINGS.value,
         "docs": enums.DOCS.INFORMATION_SOURCES.value,
     },
+    enums.NotificationType.INFORMATION_SOURCE_S3_DOCBIN_MISSING.value: {
+        "message_template": "Tokenization docs missing in S3 storage. Docs are present once tokenization process is started (not queued).",
+        "title": "Heuristic execution",
+        "level": enums.Notification.ERROR.value,
+        "page": enums.Pages.SETTINGS.value,
+        "docs": enums.DOCS.INFORMATION_SOURCES.value,
+    },
     enums.NotificationType.WEAK_SUPERVISION_TASK_STARTED.value: {
         "message_template": "Started weak supervision.",
         "title": "Weak supervision",

controller/payload/payload_scheduler.py

Lines changed: 30 additions & 15 deletions

@@ -96,7 +96,6 @@ def create_payload(
     )


 def prepare_and_run_execution_pipeline(
-    user: User,
     payload_id: str,
     project_id: str,
     information_source_item: InformationSource,
@@ -107,7 +106,6 @@ def prepare_and_run_execution_pipeline(
         information_source_item
     )
     execution_pipeline(
-        user,
         payload_id,
         project_id,
         information_source_item,
@@ -132,11 +130,19 @@ def prepare_and_run_execution_pipeline(
 def prepare_input_data_for_payload(
     information_source_item: InformationSource,
 ) -> Tuple[str, Dict[str, Any]]:
+    org_id = organization.get_id_by_project_id(project_id)
     if (
         information_source_item.type
         == enums.InformationSourceType.LABELING_FUNCTION.value
     ):
-        # isn't collected every time but rather whenever tokenization needs to run again --> accesslink to the docbin file on s3
+        # check if docbins exist
+        if not s3.object_exists(org_id, project_id + "/" + "docbin_full"):
+            notification = create_notification(
+                enums.NotificationType.INFORMATION_SOURCE_S3_DOCBIN_MISSING,
+                user_id,
+                project_id,
+            )
+            raise ValueError(notification.message)
         return None, None

     elif (
@@ -158,7 +164,6 @@ def prepare_input_data_for_payload(
         )
         embedding_file_name = f"embedding_tensors_{embedding_id}.csv.bz2"
         embedding_item = embedding.get(project_id, embedding_id)
-        org_id = organization.get_id_by_project_id(project_id)
         if not s3.object_exists(org_id, project_id + "/" + embedding_file_name):
             notification = create_notification(
                 enums.NotificationType.INFORMATION_SOURCE_S3_EMBEDDING_MISSING,
@@ -200,7 +205,6 @@ def prepare_input_data_for_payload(
     return embedding_file_name, input_data

 def execution_pipeline(
-    user: User,
     payload_id: str,
     project_id: str,
     information_source_item: InformationSource,
@@ -309,7 +313,7 @@ def execution_pipeline(

     project_item = project.get(project_id)
     doc_ock.post_event(
-        user,
+        user_id,
         events.AddInformationSourceRun(
             ProjectName=f"{project_item.name}-{project_item.id}",
             Type=information_source_item.type,
@@ -319,18 +323,15 @@ def execution_pipeline(
         ),
     )

-    user = user_manager.get_user(user_id)
     if asynchronous:
         daemon.run(
             prepare_and_run_execution_pipeline,
-            user,
             payload.id,
             project_id,
             information_source_item,
         )
     else:
         prepare_and_run_execution_pipeline(
-            user,
             payload.id,
             project_id,
             information_source_item,
@@ -468,20 +469,33 @@ def read_container_logs_thread(
     payload_id: str,
     docker_container: Any,
 ):
+
+    ctx_token = general.get_ctx_token()
     # needs to be refetched since it is not thread safe
     information_source_payload = information_source.get_payload(project_id, payload_id)
     previous_progress = -1
     last_timestamp = None
+    c = 0
     while name in __containers_running:
         time.sleep(1)
+        c += 1
+        if c > 100:
+            ctx_token = general.remove_and_refresh_session(ctx_token, True)
+            information_source_payload = information_source.get_payload(
+                project_id, payload_id
+            )
         if not name in __containers_running:
             break
-        log_lines = docker_container.logs(
-            stdout=True,
-            stderr=True,
-            timestamps=True,
-            since=last_timestamp,
-        )
+        try:
+            log_lines = docker_container.logs(
+                stdout=True,
+                stderr=True,
+                timestamps=True,
+                since=last_timestamp,
+            )
+        except:
+            # failsafe for containers that shut down during the read
+            break
         current_logs = [
             l for l in str(log_lines.decode("utf-8")).split("\n") if len(l.strip()) > 0
         ]
@@ -506,6 +520,7 @@ def read_container_logs_thread(
         set_payload_progress(
             project_id, information_source_payload, last_entry, factor=0.8
         )
+    general.remove_and_refresh_session(ctx_token)


 def get_inference_dir() -> str:
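
A side note on read_container_logs_thread: the thread now opens its own database session context via general.get_ctx_token(), swaps it for a fresh one after roughly 100 polling iterations, wraps the docker log read in a try/except so a container that shuts down mid-read no longer crashes the loop, and removes the session once the container finishes. A stripped-down sketch of that session-refresh pattern; the general helpers come from the diff, while the work object and the loop itself are placeholders:

# Simplified illustration of the session-refresh pattern used above; not the actual code.
import time

from submodules.model.business_objects import general


def long_running_loop(work) -> None:
    ctx_token = general.get_ctx_token()  # bind a fresh DB session to this thread
    iterations = 0
    try:
        while work.still_running():  # placeholder condition
            time.sleep(1)
            iterations += 1
            if iterations > 100:
                # drop the stale session and continue with a new one
                ctx_token = general.remove_and_refresh_session(ctx_token, True)
                iterations = 0
            work.poll()  # placeholder for the actual log reading
    finally:
        general.remove_and_refresh_session(ctx_token)  # always clean up the session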

0 commit comments
