
Commit b79307c

review @sanderegg
1 parent 469f2db commit b79307c

12 files changed: 58 additions, 86 deletions
Lines changed: 7 additions & 10 deletions
@@ -1,8 +1,8 @@
 """add comp_run_snapshot_tasks table
 
-Revision ID: ae0f63bb3c86
+Revision ID: 0019fbd911b6
 Revises: 278daef7e99d
-Create Date: 2025-05-27 14:12:10.926590+00:00
+Create Date: 2025-05-28 08:51:35.563513+00:00
 
 
 """
@@ -11,7 +11,7 @@
 from sqlalchemy.dialects import postgresql
 
 # revision identifiers, used by Alembic.
-revision = "ae0f63bb3c86"
+revision = "0019fbd911b6"
 down_revision = "278daef7e99d"
 branch_labels = None
 depends_on = None
@@ -82,12 +82,6 @@ def upgrade():
         sa.Column(
             "hardware_info", postgresql.JSONB(astext_type=sa.Text()), nullable=True
         ),
-        sa.Column(
-            "submit",
-            sa.DateTime(timezone=True),
-            server_default=sa.text("'1900-01-01T00:00:00Z'::timestamptz"),
-            nullable=True,
-        ),
         sa.ForeignKeyConstraint(
             ["run_id"],
             ["comp_runs.run_id"],
@@ -98,7 +92,10 @@ def upgrade():
         sa.PrimaryKeyConstraint("snapshot_task_id"),
     )
     op.add_column(
-        "comp_runs", sa.Column("dag_adjacency_list", sa.JSON(), nullable=True)
+        "comp_runs",
+        sa.Column(
+            "dag_adjacency_list", postgresql.JSONB(astext_type=sa.Text()), nullable=True
+        ),
     )
     # ### end Alembic commands ###
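This migration moves comp_runs.dag_adjacency_list from sa.JSON to postgresql.JSONB and drops the deprecated submit column from the new snapshot table. For context, JSONB stores a decomposed binary form that supports server-side key/containment operators and GIN indexing, which plain JSON does not. A minimal sketch of what that enables, assuming a trimmed-down comp_runs table (illustrative, not the service's full definition):

# Minimal sketch, assuming a trimmed-down comp_runs table; illustrative only.
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB

metadata = sa.MetaData()
comp_runs = sa.Table(
    "comp_runs",
    metadata,
    sa.Column("run_id", sa.BigInteger, primary_key=True),
    # JSONB (unlike sa.JSON) supports server-side key/containment operators
    sa.Column("dag_adjacency_list", JSONB, nullable=True),
)

# e.g. select runs whose adjacency list has a given node key;
# "some-node-id" is a hypothetical node UUID string
stmt = sa.select(comp_runs.c.run_id).where(
    comp_runs.c.dag_adjacency_list.has_key("some-node-id")
)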

packages/postgres-database/src/simcore_postgres_database/models/comp_run_snapshot_tasks.py

Lines changed: 0 additions & 8 deletions
@@ -118,12 +118,4 @@
         nullable=True,
         doc="Harware information of this task",
     ),
-    # deprecated columns must be kept due to legacy services
-    # utc timestamps for submission/start/end
-    sa.Column(
-        "submit",
-        sa.DateTime(timezone=True),
-        server_default=sa.text("'1900-01-01T00:00:00Z'::timestamptz"),
-        doc="[DEPRECATED unused but kept for legacy services and must be filled with a default value of 1 January 1900]",
-    ),
 )

packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@
         doc="the run uses on demand clusters",
     ),
     sa.Column(
-        "dag_adjacency_list", sa.JSON, doc="Adjancey list for the pipeline's graph"
+        "dag_adjacency_list", JSONB, doc="Adjancey list for the pipeline's graph"
     ),
     sa.UniqueConstraint("project_uuid", "user_id", "iteration"),
     sa.Index("ix_comp_runs_user_id", "user_id"),

packages/postgres-database/src/simcore_postgres_database/utils_comp_run_snapshot_tasks.py

Lines changed: 3 additions & 2 deletions
@@ -1,3 +1,5 @@
+from typing import Any
+
 import sqlalchemy as sa
 from pydantic import PositiveInt
 from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine
@@ -28,7 +30,6 @@
     comp_run_snapshot_tasks.c.modified,
     comp_run_snapshot_tasks.c.pricing_info,
     comp_run_snapshot_tasks.c.hardware_info,
-    comp_run_snapshot_tasks.c.submit,
 )
 
 
@@ -38,7 +39,7 @@ async def update_for_run_id_and_node_id(
     *,
     run_id: PositiveInt,
     node_id: str,
-    data: dict,
+    data: dict[str, Any],
 ):
     async with pass_or_acquire_connection(engine, connection=conn) as _conn:
         result = await _conn.stream(
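Tightening data: dict to dict[str, Any] documents that the payload maps column names to values. A hypothetical caller sketch; the leading engine parameter is assumed from the pass_or_acquire_connection(engine, connection=conn) call shown in the hunk above, and the "state" key/value pair is illustrative:

# Hypothetical caller sketch; not part of this commit.
from typing import Any

from simcore_postgres_database.utils_comp_run_snapshot_tasks import (
    update_for_run_id_and_node_id,
)
from sqlalchemy.ext.asyncio import AsyncEngine

async def mark_snapshot_task_aborted(
    engine: AsyncEngine, run_id: int, node_id: str
) -> None:
    # keys are comp_run_snapshot_tasks column names (assumed)
    data: dict[str, Any] = {"state": "ABORTED"}
    await update_for_run_id_and_node_id(
        engine, run_id=run_id, node_id=node_id, data=data
    )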

services/director-v2/src/simcore_service_director_v2/api/routes/computations.py

Lines changed: 6 additions & 10 deletions
@@ -199,7 +199,6 @@ async def _try_start_pipeline(
     project: ProjectAtDB,
     users_repo: UsersRepository,
     projects_metadata_repo: ProjectsMetadataRepository,
-    tasks_to_run: list[CompTaskAtDB],
 ) -> None:
     if not minimal_dag.nodes():
         # 2 options here: either we have cycles in the graph or it's really done
@@ -242,7 +241,6 @@ async def _try_start_pipeline(
         )
         or {},
         use_on_demand_clusters=computation.use_on_demand_clusters,
-        tasks_to_run=tasks_to_run,
     )
 
 
@@ -342,13 +340,6 @@ async def create_computation(  # noqa: PLR0913 # pylint: disable=too-many-positi
         rabbitmq_rpc_client=rpc_client,
     )
 
-    # filter the tasks by the effective pipeline
-    filtered_tasks = [
-        t
-        for t in comp_tasks
-        if f"{t.node_id}" in set(minimal_computational_dag.nodes())
-    ]
-
     if computation.start_pipeline:
         await _try_start_pipeline(
             request.app,
@@ -359,9 +350,14 @@ async def create_computation(  # noqa: PLR0913 # pylint: disable=too-many-positi
             project=project,
             users_repo=users_repo,
             projects_metadata_repo=projects_metadata_repo,
-            tasks_to_run=filtered_tasks,
         )
 
+    # filter the tasks by the effective pipeline
+    filtered_tasks = [
+        t
+        for t in comp_tasks
+        if f"{t.node_id}" in set(minimal_computational_dag.nodes())
+    ]
     pipeline_state = utils.get_pipeline_state_from_task_states(filtered_tasks)
 
     # get run details if any
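The filtering comprehension, now placed after _try_start_pipeline, compares stringified task node IDs against the DAG's node labels, which networkx stores as strings. A self-contained sketch of the same idea with a hypothetical Task stand-in; note that hoisting set(...) out of the comprehension avoids rebuilding the set on every iteration:

# Self-contained sketch of the effective-pipeline filter; Task is a
# hypothetical stand-in for CompTaskAtDB.
from dataclasses import dataclass
from uuid import UUID, uuid4

import networkx as nx

@dataclass
class Task:
    node_id: UUID

dag = nx.DiGraph()
kept = uuid4()
dag.add_node(f"{kept}")  # DAG nodes are stored as strings

tasks = [Task(node_id=kept), Task(node_id=uuid4())]
dag_nodes = set(dag.nodes())  # build the set once instead of per task
filtered = [t for t in tasks if f"{t.node_id}" in dag_nodes]
assert [t.node_id for t in filtered] == [kept]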

services/director-v2/src/simcore_service_director_v2/models/comp_tasks.py

Lines changed: 3 additions & 4 deletions
@@ -150,10 +150,6 @@ class BaseCompTaskAtDB(BaseModel):
     pricing_info: dict | None
     hardware_info: HardwareInfo
 
-    submit: dt.datetime | None = Field(
-        default=None, deprecated=True, description="Required for legacy services"
-    )
-
     @field_validator("state", mode="before")
     @classmethod
     def _convert_state_from_state_type_enum_if_needed(cls, v):
@@ -183,6 +179,9 @@ def _backward_compatible_null_value(cls, v: HardwareInfo | None) -> HardwareInfo
 
 class CompTaskAtDB(BaseCompTaskAtDB):
     task_id: PositiveInt | None = None
+    submit: dt.datetime | None = Field(
+        default=None, deprecated=True, description="Required for legacy services"
+    )
 
     def to_db_model(self, **exclusion_rules) -> dict[str, Any]:
         # mode json is used to ensure the UUIDs are converted to strings
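Moving the deprecated submit field from BaseCompTaskAtDB down to CompTaskAtDB keeps it only on the model that maps to the legacy comp_tasks rows. For reference, a minimal sketch of how Field(deprecated=True) behaves, assuming Pydantic v2.7+ and a stand-in model:

# Minimal sketch with a stand-in model; assumes Pydantic >= 2.7, where
# accessing a Field(deprecated=True) attribute emits a DeprecationWarning.
import datetime as dt
import warnings

from pydantic import BaseModel, Field

class LegacyTask(BaseModel):  # hypothetical stand-in for CompTaskAtDB
    submit: dt.datetime | None = Field(
        default=None, deprecated=True, description="Required for legacy services"
    )

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = LegacyTask().submit
assert any(issubclass(w.category, DeprecationWarning) for w in caught)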

services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py

Lines changed: 16 additions & 3 deletions
@@ -1,6 +1,7 @@
 import logging
 from typing import Final
 
+import networkx as nx
 from fastapi import FastAPI
 from models_library.projects import ProjectID
 from models_library.users import UserID
@@ -22,6 +23,7 @@
 from ..db import get_db_engine
 from ..db.repositories.comp_pipelines import CompPipelinesRepository
 from ..db.repositories.comp_runs import CompRunsRepository
+from ..db.repositories.comp_tasks import CompTasksRepository
 from ..rabbitmq import get_rabbitmq_client
 from ._constants import (
     MAX_CONCURRENT_PIPELINE_SCHEDULING,
@@ -41,13 +43,13 @@ async def run_new_pipeline(
     project_id: ProjectID,
     run_metadata: RunMetadataDict,
     use_on_demand_clusters: bool,
-    tasks_to_run: list[CompTaskAtDB],
 ) -> None:
     """Sets a new pipeline to be scheduled on the computational resources."""
     # ensure the pipeline exists and is populated with something
     db_engine = get_db_engine(app)
     comp_pipeline_at_db = await _get_pipeline_at_db(project_id, db_engine)
     dag = comp_pipeline_at_db.get_graph()
+
     if not dag:
         _logger.warning(
             "project %s has no computational dag defined. not scheduled for a run.",
@@ -63,11 +65,11 @@ async def run_new_pipeline(
         dag_adjacency_list=comp_pipeline_at_db.dag_adjacency_list,
     )
 
+    tasks_to_run = await _get_pipeline_tasks_at_db(db_engine, project_id, dag)
     db_create_snaphot_tasks = [
         {
-            **task.to_db_model(exclude={"created", "modified", "submit"}),
+            **task.to_db_model(exclude={"created", "modified"}),
             "run_id": new_run.run_id,
-            # "submit": datetime.fromisoformat(task.submit)
         }
         for task in tasks_to_run
     ]
@@ -130,6 +132,17 @@ async def _get_pipeline_at_db(
     return pipeline_at_db
 
 
+async def _get_pipeline_tasks_at_db(
+    db_engine: AsyncEngine, project_id: ProjectID, pipeline_dag: nx.DiGraph
+) -> list[CompTaskAtDB]:
+    comp_tasks_repo = CompTasksRepository.instance(db_engine)
+    return [
+        t
+        for t in await comp_tasks_repo.list_computational_tasks(project_id)
+        if (f"{t.node_id}" in list(pipeline_dag.nodes()))
+    ]
+
+
 _LOST_TASKS_FACTOR: Final[int] = 10
 
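With this change, run_new_pipeline resolves the tasks itself via the new _get_pipeline_tasks_at_db helper and snapshots them with to_db_model(exclude={"created", "modified"}), instead of receiving tasks_to_run from callers. A sketch of that exclude-and-augment serialization pattern with a minimal Pydantic stand-in (MiniTask and its fields are illustrative, not the service's model):

# Sketch of the exclude-and-augment pattern used for the snapshot rows;
# MiniTask is a hypothetical stand-in for CompTaskAtDB.
from datetime import UTC, datetime
from typing import Any

from pydantic import BaseModel

class MiniTask(BaseModel):
    node_id: str
    state: str
    created: datetime
    modified: datetime

    def to_db_model(self, **exclusion_rules: Any) -> dict[str, Any]:
        # mode="json" stringifies datetimes/UUIDs for the DB layer
        return self.model_dump(mode="json", **exclusion_rules)

task = MiniTask(
    node_id="node-1",
    state="PUBLISHED",
    created=datetime.now(UTC),
    modified=datetime.now(UTC),
)
row = {**task.to_db_model(exclude={"created", "modified"}), "run_id": 1}
assert "created" not in row and row["run_id"] == 1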
services/director-v2/tests/unit/_helpers.py

Lines changed: 0 additions & 1 deletion
@@ -21,7 +21,6 @@ class PublishedProject:
     project: ProjectAtDB
     pipeline: CompPipelineAtDB
     tasks: list[CompTaskAtDB]
-    tasks_to_run: list[CompTaskAtDB]
 
 
 @dataclass(kw_only=True)

services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py

Lines changed: 0 additions & 4 deletions
@@ -156,7 +156,6 @@ async def test_schedule_all_pipelines(
         project_id=published_project.project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=False,
-        tasks_to_run=published_project.tasks_to_run,
     )
     # this directly schedule a new pipeline
     scheduler_rabbit_client_parser.assert_called_once_with(
@@ -258,7 +257,6 @@ async def test_schedule_all_pipelines_logs_error_if_it_find_old_pipelines(
         project_id=published_project.project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=False,
-        tasks_to_run=published_project.tasks_to_run,
     )
     # this directly schedule a new pipeline
     scheduler_rabbit_client_parser.assert_called_once_with(
@@ -342,7 +340,6 @@ async def test_empty_pipeline_is_not_scheduled(
         project_id=empty_project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=False,
-        tasks_to_run=[],
     )
     await assert_comp_runs_empty(sqlalchemy_async_engine)
     scheduler_rabbit_client_parser.assert_not_called()
@@ -358,7 +355,6 @@ async def test_empty_pipeline_is_not_scheduled(
         project_id=empty_project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=False,
-        tasks_to_run=[],
     )
     assert len(caplog.records) == 1
     assert "no computational dag defined" in caplog.records[0].message

services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py

Lines changed: 0 additions & 5 deletions
@@ -169,7 +169,6 @@ async def _assert_start_pipeline(
         project_id=published_project.project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=False,
-        tasks_to_run=published_project.tasks_to_run,
     )
 
     # check the database is correctly updated, the run is published
@@ -1125,7 +1124,6 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted(
         project_id=sleepers_project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=False,
-        tasks_to_run=[],
     )
     with_disabled_scheduler_publisher.assert_called_once()
     # we shall have a a new comp_runs row with the new pipeline job
@@ -1253,7 +1251,6 @@ async def test_handling_of_disconnected_scheduler_dask(
         project_id=published_project.project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=False,
-        tasks_to_run=published_project.tasks_to_run,
     )
 
     # since there is no cluster, there is no dask-scheduler,
@@ -1769,7 +1766,6 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits(
         project_id=published_project.project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=True,
-        tasks_to_run=published_project.tasks_to_run,
     )
 
     # we ask to use an on-demand cluster, therefore the tasks are published first
@@ -1874,7 +1870,6 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails(
         project_id=published_project.project.uuid,
         run_metadata=run_metadata,
         use_on_demand_clusters=True,
-        tasks_to_run=published_project.tasks_to_run,
     )
 
     # we ask to use an on-demand cluster, therefore the tasks are published first
