
Commit bd28090

Unprototyping ETL min (#179)

* creates revision to extend cognition markdown tables
* create new revision after merging additional changes into the submodule model
* transfer changes from the etl branch
* removes debug print
* Adds env var
* replace last db revision, adds env var column to md dataset
* Adds threading logic
* dev submodule change

Co-authored-by: JWittmeyer <[email protected]>
1 parent 37d1da6 commit bd28090

7 files changed: 301 additions, 1 deletion
New file: Alembic migration, revision f6bca8990840

Lines changed: 210 additions & 0 deletions

```python
"""extends cognition markdown tables

Revision ID: f6bca8990840
Revises: 3d0e01981f06
Create Date: 2023-12-20 10:54:14.354971

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "f6bca8990840"
down_revision = "3d0e01981f06"
branch_labels = None
depends_on = None


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "markdown_dataset",
        sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("refinery_project_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column(
            "environment_variable_id", postgresql.UUID(as_uuid=True), nullable=True
        ),
        sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=True),
        sa.Column("name", sa.String(), nullable=True),
        sa.Column("description", sa.String(), nullable=True),
        sa.Column("tokenizer", sa.String(), nullable=True),
        sa.Column("category_origin", sa.String(), nullable=True),
        sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"),
        sa.ForeignKeyConstraint(
            ["environment_variable_id"],
            ["cognition.environment_variable.id"],
            ondelete="SET NULL",
        ),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(
            ["refinery_project_id"], ["project.id"], ondelete="SET NULL"
        ),
        sa.PrimaryKeyConstraint("id"),
        schema="cognition",
    )
    op.create_index(
        op.f("ix_cognition_markdown_dataset_created_by"),
        "markdown_dataset",
        ["created_by"],
        unique=False,
        schema="cognition",
    )
    op.create_index(
        op.f("ix_cognition_markdown_dataset_environment_variable_id"),
        "markdown_dataset",
        ["environment_variable_id"],
        unique=False,
        schema="cognition",
    )
    op.create_index(
        op.f("ix_cognition_markdown_dataset_organization_id"),
        "markdown_dataset",
        ["organization_id"],
        unique=False,
        schema="cognition",
    )
    op.create_index(
        op.f("ix_cognition_markdown_dataset_refinery_project_id"),
        "markdown_dataset",
        ["refinery_project_id"],
        unique=False,
        schema="cognition",
    )
    op.create_table(
        "markdown_llm_logs",
        sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("markdown_file_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=True),
        sa.Column("finished_at", sa.DateTime(), nullable=True),
        sa.Column("model_used", sa.String(), nullable=True),
        sa.Column("input", sa.String(), nullable=True),
        sa.Column("output", sa.String(), nullable=True),
        sa.Column("error", sa.String(), nullable=True),
        sa.ForeignKeyConstraint(
            ["markdown_file_id"], ["cognition.markdown_file.id"], ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id"),
        schema="cognition",
    )
    op.create_index(
        op.f("ix_cognition_markdown_llm_logs_markdown_file_id"),
        "markdown_llm_logs",
        ["markdown_file_id"],
        unique=False,
        schema="cognition",
    )
    op.add_column(
        "environment_variable",
        sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True),
        schema="cognition",
    )
    op.create_index(
        op.f("ix_cognition_environment_variable_organization_id"),
        "environment_variable",
        ["organization_id"],
        unique=False,
        schema="cognition",
    )
    op.create_foreign_key(
        None,
        "environment_variable",
        "organization",
        ["organization_id"],
        ["id"],
        source_schema="cognition",
        ondelete="CASCADE",
    )
    op.add_column(
        "markdown_file",
        sa.Column("dataset_id", postgresql.UUID(as_uuid=True), nullable=True),
        schema="cognition",
    )
    op.add_column(
        "markdown_file",
        sa.Column("started_at", sa.DateTime(), nullable=True),
        schema="cognition",
    )
    op.add_column(
        "markdown_file",
        sa.Column("finished_at", sa.DateTime(), nullable=True),
        schema="cognition",
    )
    op.add_column(
        "markdown_file",
        sa.Column("state", sa.String(), nullable=True),
        schema="cognition",
    )
    op.create_index(
        op.f("ix_cognition_markdown_file_dataset_id"),
        "markdown_file",
        ["dataset_id"],
        unique=False,
        schema="cognition",
    )
    op.create_foreign_key(
        None,
        "markdown_file",
        "markdown_dataset",
        ["dataset_id"],
        ["id"],
        source_schema="cognition",
        referent_schema="cognition",
        ondelete="CASCADE",
    )
    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_constraint(None, "markdown_file", schema="cognition", type_="foreignkey")
    op.drop_index(
        op.f("ix_cognition_markdown_file_dataset_id"),
        table_name="markdown_file",
        schema="cognition",
    )
    op.drop_column("markdown_file", "state", schema="cognition")
    op.drop_column("markdown_file", "finished_at", schema="cognition")
    op.drop_column("markdown_file", "started_at", schema="cognition")
    op.drop_column("markdown_file", "dataset_id", schema="cognition")
    op.drop_constraint(
        None, "environment_variable", schema="cognition", type_="foreignkey"
    )
    op.drop_index(
        op.f("ix_cognition_environment_variable_organization_id"),
        table_name="environment_variable",
        schema="cognition",
    )
    op.drop_column("environment_variable", "organization_id", schema="cognition")
    op.drop_index(
        op.f("ix_cognition_markdown_llm_logs_markdown_file_id"),
        table_name="markdown_llm_logs",
        schema="cognition",
    )
    op.drop_table("markdown_llm_logs", schema="cognition")
    op.drop_index(
        op.f("ix_cognition_markdown_dataset_refinery_project_id"),
        table_name="markdown_dataset",
        schema="cognition",
    )
    op.drop_index(
        op.f("ix_cognition_markdown_dataset_organization_id"),
        table_name="markdown_dataset",
        schema="cognition",
    )
    op.drop_index(
        op.f("ix_cognition_markdown_dataset_environment_variable_id"),
        table_name="markdown_dataset",
        schema="cognition",
    )
    op.drop_index(
        op.f("ix_cognition_markdown_dataset_created_by"),
        table_name="markdown_dataset",
        schema="cognition",
    )
    op.drop_table("markdown_dataset", schema="cognition")
    # ### end Alembic commands ###
```
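One reversibility caveat: `upgrade()` creates both foreign keys unnamed (`op.create_foreign_key(None, ...)`), so PostgreSQL auto-generates their names, while `downgrade()` passes `None` to `op.drop_constraint`, which Alembic cannot resolve at runtime. A minimal sketch of the usual fix, using illustrative constraint names that are not taken from the commit:

```python
# In upgrade(): name the constraint explicitly so the downgrade can find it.
# "markdown_file_dataset_id_fkey" is an illustrative name, not from the commit.
op.create_foreign_key(
    "markdown_file_dataset_id_fkey",
    "markdown_file",
    "markdown_dataset",
    ["dataset_id"],
    ["id"],
    source_schema="cognition",
    referent_schema="cognition",
    ondelete="CASCADE",
)

# In downgrade(): drop it by the same explicit name.
op.drop_constraint(
    "markdown_file_dataset_id_fkey",
    "markdown_file",
    schema="cognition",
    type_="foreignkey",
)
```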

api/transfer.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -19,6 +19,7 @@
     general,
     organization,
     tokenization,
+    project as refinery_project,
 )

 from submodules.model.cognition_objects import project as cognition_project
@@ -233,6 +234,33 @@ def put(self, request) -> PlainTextResponse:
         return PlainTextResponse("OK")


+class CognitionParseMarkdownFile(HTTPEndpoint):
+    def post(self, request) -> PlainTextResponse:
+        refinery_project_id = request.path_params["project_id"]
+        refinery_project_item = refinery_project.get(refinery_project_id)
+        if not refinery_project_item:
+            return PlainTextResponse("Bad project id", status_code=400)
+
+        dataset_id = request.path_params["dataset_id"]
+        file_id = request.path_params["file_id"]
+
+        # via thread to ensure the endpoint returns immediately
+
+        daemon.run(
+            task_queue_manager.add_task,
+            refinery_project_id,
+            TaskType.PARSE_MARKDOWN_FILE,
+            refinery_project_item.created_by,
+            {
+                "org_id": str(refinery_project_item.organization_id),
+                "dataset_id": dataset_id,
+                "file_id": file_id,
+            },
+        )
+
+        return PlainTextResponse("OK")
+
+
 class AssociationsImport(HTTPEndpoint):
     async def post(self, request) -> JSONResponse:
         project_id = request.path_params["project_id"]
```
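Together with the route registered in app.py below, the endpoint can be exercised with a plain POST. This is a hypothetical smoke test; the base URL and the three UUIDs are placeholders, not values from the commit:

```python
# Hypothetical smoke test against a locally running gateway.
import requests

BASE = "http://localhost:80"  # wherever the refinery gateway is exposed
project_id = "00000000-0000-0000-0000-000000000001"
dataset_id = "00000000-0000-0000-0000-000000000002"
file_id = "00000000-0000-0000-0000-000000000003"

resp = requests.post(
    f"{BASE}/project/{project_id}/cognition/datasets/{dataset_id}"
    f"/files/{file_id}/queue"
)
# The endpoint returns immediately; the parse itself runs via the task queue.
assert resp.status_code == 200 and resp.text == "OK"
```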

app.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -13,6 +13,7 @@
     UploadTaskInfo,
     CognitionImport,
     CognitionPrepareProject,
+    CognitionParseMarkdownFile,
 )
 from middleware.database_session import DatabaseSessionHandler
 from starlette.applications import Starlette
@@ -55,6 +56,10 @@
         "/project/{cognition_project_id:str}/cognition/continue/{task_id:str}/finalize",
         CognitionPrepareProject,
     ),
+    Route(
+        "/project/{project_id:str}/cognition/datasets/{dataset_id:str}/files/{file_id:str}/queue",
+        CognitionParseMarkdownFile,
+    ),
     Route("/project/{project_id:str}/import/task/{task_id:str}", UploadTaskInfo),
     Route("/project", ProjectCreationFromWorkflow),
     Route("/is_managed", IsManagedRest),
```
New file: task-queue handler for markdown files (imported as markdown_file in controller/task_queue/manager.py)

Lines changed: 53 additions & 0 deletions

```python
from typing import Any, Dict, Tuple, Callable
import os

import requests
from submodules.model.business_objects import (
    task_queue as task_queue_db_bo,
    general,
)
from submodules.model.cognition_objects import (
    markdown_file as markdown_file_db_bo,
)
from submodules.model.enums import CognitionMarkdownFileState

BASE_URI = os.getenv("COGNITION_GATEWAY")

TASK_DONE_STATES = [
    CognitionMarkdownFileState.FINISHED.value,
    CognitionMarkdownFileState.FAILED.value,
]


def get_task_functions() -> Tuple[Callable, Callable, int]:
    return __start_task, __check_finished, 1


def __start_task(task: Dict[str, Any]) -> bool:
    # check task still relevant
    task_db_obj = task_queue_db_bo.get(task["id"])
    if task_db_obj is None or task_db_obj.is_active:
        return False

    action = task["task_info"]
    org_id = action["org_id"]
    dataset_id = action["dataset_id"]
    file_id = action["file_id"]

    task_db_obj.is_active = True
    general.commit()
    requests.post(
        f"{BASE_URI}/api/v1/converters/internal/datasets/{dataset_id}/files/{file_id}/parse",
        json={"orgId": org_id},
    )
    return True


def __check_finished(task: Dict[str, Any]) -> bool:
    action = task["task_info"]
    org_id = action["org_id"]
    file_id = action["file_id"]
    markdown_file_entity = markdown_file_db_bo.get(org_id=org_id, md_file_id=file_id)
    if markdown_file_entity is None:
        return True
    return markdown_file_entity.state in TASK_DONE_STATES
```
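The handler implements the queue's start/poll contract: `__start_task` marks the task active and fires the parse request at the cognition gateway, and `__check_finished` reports completion once the markdown file reaches a terminal state. Below is a minimal sketch of how the `(start, check, interval)` triple could be driven; the real loop lives in the task-queue manager, which this diff does not show, and reading the trailing `1` as a poll interval in seconds is an assumption:

```python
# Sketch only: the actual task-queue manager is not part of this diff, and
# interpreting the third tuple element as seconds between polls is assumed.
import time


def run_task(task: dict) -> None:
    start, check_finished, interval = get_task_functions()
    if not start(task):
        return  # task is stale or already active
    while not check_finished(task):
        time.sleep(interval)
```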

controller/task_queue/manager.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -16,6 +16,7 @@
     tokenization as tokenization_handler,
     attribute_calculation as attribute_calculation_handler,
     task_queue as task_queue_handler,
+    markdown_file as markdown_file_handler,
 )
 from .util import if_task_queue_send_websocket

@@ -87,6 +88,8 @@ def get_task_function_by_type(task_type: str) -> Tuple[Callable, Callable, int]:
         return attribute_calculation_handler.get_task_functions()
     if task_type == enums.TaskType.TASK_QUEUE.value:
         return task_queue_handler.get_task_functions()
+    if task_type == enums.TaskType.PARSE_MARKDOWN_FILE.value:
+        return markdown_file_handler.get_task_functions()
     raise ValueError(f"Task type {task_type} not supported yet")
```
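Callers resolve the new handler through the same dispatch as the existing ones; for example (the enum value is taken from the diff above):

```python
# Resolves to markdown_file_handler.get_task_functions() after this change.
start, check_finished, interval = get_task_function_by_type(
    enums.TaskType.PARSE_MARKDOWN_FILE.value
)
```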

start

Lines changed: 1 addition & 0 deletions

```diff
@@ -72,6 +72,7 @@ docker run -d --rm \
 -e TOKENIZER=http://refinery-tokenizer:80 \
 -e DOC_OCK=http://refinery-doc-ock:80 \
 -e GATES=http://gates-gateway:80 \
+-e COGNITION_GATEWAY=http://cognition-gateway:80 \
 -e KRATOS_ADMIN_URL=http://kratos:4434 \
 -e TASK_QUEUE_SLOTS=1 \
 -e PRIORITY_TASK_QUEUE_SLOTS=1 \
```

This is the `COGNITION_GATEWAY` base URI that the new task-queue handler reads via `os.getenv` when posting parse requests.
