
Commit 40a8255

[DOP-29593] Add filter by Run.started_at and Run.ended_at

1 parent 14f1e5e commit 40a8255

12 files changed: +604 -196 lines

data_rentgen/db/repositories/run.py

Lines changed: 28 additions & 6 deletions
@@ -9,6 +9,7 @@
     CompoundSelect,
     Select,
     SQLColumnExpression,
+    and_,
     any_,
     bindparam,
     desc,
@@ -19,7 +20,7 @@
 from sqlalchemy.dialects.postgresql import insert
 from sqlalchemy.orm import selectinload
 
-from data_rentgen.db.models import Job, Run, RunStartReason, RunStatus
+from data_rentgen.db.models import Job, Run, RunStartReason, RunStatus, User
 from data_rentgen.db.repositories.base import Repository
 from data_rentgen.db.utils.search import make_tsquery, ts_match, ts_rank
 from data_rentgen.dto import PaginationDTO, RunDTO
@@ -84,7 +85,7 @@
 
 
 class RunRepository(Repository[Run]):
-    async def paginate(
+    async def paginate(  # noqa: PLR0912, C901
         self,
         page: int,
         page_size: int,
@@ -95,7 +96,13 @@ async def paginate(
         parent_run_id: UUID | None,
         search_query: str | None,
         job_type: Collection[str],
-        status: Collection[str],
+        job_location_id: int | None,
+        status: Collection[RunStatus],
+        started_by_user: str | None,
+        started_since: datetime | None,
+        started_until: datetime | None,
+        ended_since: datetime | None,
+        ended_until: datetime | None,
     ) -> PaginationDTO[Run]:
         # do not use `tuple_(Run.created_at, Run.id).in_(...),
         # as this is too complex filter for Postgres to make an optimal query plan
@@ -133,8 +140,15 @@ async def paginate(
         if parent_run_id:
             where.append(Run.parent_run_id == parent_run_id)
         if status:
-            serialize_status: Collection[RunStatus] = [RunStatus[status] for status in status]
-            where.append(Run.status == any_(serialize_status))  # type: ignore[arg-type]
+            where.append(Run.status == any_(status))  # type: ignore[arg-type]
+        if started_since:
+            where.append(Run.started_at >= started_since)
+        if started_until:
+            where.append(Run.started_at <= started_until)
+        if ended_since:
+            where.append(Run.ended_at >= ended_since)
+        if ended_until:
+            where.append(Run.ended_at <= ended_until)
 
         query: Select | CompoundSelect
         order_by: list[ColumnElement | SQLColumnExpression]
@@ -165,8 +179,16 @@ async def paginate(
             query = select(Run).where(*where)
             order_by = [Run.created_at.desc(), Run.id.desc()]
 
+        job_where = []
         if job_type:
-            query = query.join(Job, Run.job_id == Job.id).where(Job.type == any_(job_type))  # type: ignore[arg-type]
+            job_where.append(Job.type == any_(list(job_type)))  # type: ignore[arg-type]
+        if job_location_id is not None:
+            job_where.append(Job.location_id == job_location_id)
+        if job_where:
+            query = query.join(Job, and_(Run.job_id == Job.id, *job_where))
+
+        if started_by_user:
+            query = query.join(User, and_(Run.started_by_user_id == User.id, User.name == started_by_user))
 
         options = [selectinload(Run.started_by_user)]
         return await self._paginate_by_query(
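
The pattern in this hunk: every optional filter appends one predicate to a shared where list, while job-related predicates move into the JOIN's ON clause via and_() instead of a separate .where() call, so they are applied once even when several job filters are combined. A minimal, self-contained sketch of the same composition; ToyRun and build_query are hypothetical stand-ins, not the project's actual Run model:

# Hedged sketch: toy model and table are assumptions, only the pattern is real.
from datetime import datetime, timezone

from sqlalchemy import DateTime, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class ToyRun(Base):  # hypothetical stand-in for data_rentgen.db.models.Run
    __tablename__ = "toy_run"
    id: Mapped[int] = mapped_column(primary_key=True)
    started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    ended_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))


def build_query(started_since: datetime | None = None, started_until: datetime | None = None):
    where = []  # each optional filter contributes exactly one predicate
    if started_since:
        where.append(ToyRun.started_at >= started_since)
    if started_until:
        where.append(ToyRun.started_at <= started_until)
    # where(*where) AND-s the predicates together; an empty list means "no filter"
    return select(ToyRun).where(*where)


print(build_query(started_since=datetime(2024, 1, 1, tzinfo=timezone.utc)))

An empty where list degrades to an unfiltered SELECT, which is why no special branch is needed for the "no filters" case.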

data_rentgen/server/api/v1/router/run.py

Lines changed: 6 additions & 0 deletions
@@ -40,7 +40,13 @@ async def runs(
         parent_run_id=query_args.parent_run_id,
         search_query=query_args.search_query,
         job_type=query_args.job_type,
+        job_location_id=query_args.job_location_id,
         status=query_args.status,
+        started_by_user=query_args.started_by_user,
+        started_since=query_args.started_since,
+        started_until=query_args.started_until,
+        ended_since=query_args.ended_since,
+        ended_until=query_args.ended_until,
     )
     return PageResponseV1[RunDetailedResponseV1].from_pagination(pagination)

data_rentgen/server/schemas/v1/run.py

Lines changed: 49 additions & 2 deletions
@@ -43,10 +43,19 @@ def __str__(self) -> str:
 
 class RunStatusForQueryV1(StrEnum):
     UNKNOWN = "UNKNOWN"
+    """No data about status"""
+
     STARTED = "STARTED"
+    """Received START event"""
+
     SUCCEEDED = "SUCCEEDED"
+    """Finished successfully"""
+
     FAILED = "FAILED"
+    """Internal failure"""
+
     KILLED = "KILLED"
+    """Killed externally, e.g. by user request or in case of OOM"""
 
 
 class RunResponseV1(BaseModel):
@@ -150,16 +159,54 @@ class RunsQueryV1(PaginateQueryV1):
     job_type: list[str] = Field(
         default_factory=list,
         description="Filter runs by type of a Job",
-        examples=["SPARK_APPLICATION", "AIRFLOW_TASK"],
+        examples=[["SPARK_APPLICATION", "AIRFLOW_TASK"]],
+    )
+    job_location_id: int | None = Field(
+        default=None,
+        description="Filter runs by location of a Job",
+        examples=[123, 234],
+    )
+
+    status: list[RunStatusForQueryV1] = Field(
+        default_factory=list,
+        description="Filter runs by status",
+        examples=[["SUCCEEDED", "FAILED"]],
+    )
+
+    started_by_user: str | None = Field(
+        default=None,
+        description="User who started the Run",
+        examples=["someuser"],
     )
-    status: list[RunStatusForQueryV1] = Field(default_factory=list, description="Filter runs by status")
 
     search_query: str | None = Field(
         default=None,
         min_length=3,
         description="Search query",
     )
 
+    started_since: datetime | None = Field(
+        default=None,
+        description="Minimum value of Run 'started_at' field, in ISO 8601 format",
+        examples=["2008-09-15T15:53:00+05:00"],
+    )
+    started_until: datetime | None = Field(
+        default=None,
+        description="Maximum value of Run 'started_at' field, in ISO 8601 format",
+        examples=["2008-09-15T15:53:00+05:00"],
+    )
+
+    ended_since: datetime | None = Field(
+        default=None,
+        description="Minimum value of Run 'ended_at' field, in ISO 8601 format",
+        examples=["2008-09-15T15:53:00+05:00"],
+    )
+    ended_until: datetime | None = Field(
+        default=None,
+        description="Maximum value of Run 'ended_at' field, in ISO 8601 format",
+        examples=["2008-09-15T15:53:00+05:00"],
+    )
+
     model_config = ConfigDict(extra="forbid")
 
     @field_validator("until", mode="after")
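
Two details above are easy to miss: examples now wraps list values in an outer list, because the examples argument is itself a list of example values; and the datetime | None fields accept ISO 8601 strings with a timezone offset. A tiny hedged sketch of the parsing behavior; ToyRunsQuery is a stand-in, not the real RunsQueryV1:

from datetime import datetime

from pydantic import BaseModel, ConfigDict, Field


class ToyRunsQuery(BaseModel):  # hypothetical stand-in for RunsQueryV1
    started_since: datetime | None = Field(default=None)
    started_until: datetime | None = Field(default=None)

    model_config = ConfigDict(extra="forbid")  # unknown query params are rejected


q = ToyRunsQuery(started_since="2008-09-15T15:53:00+05:00")
print(q.started_since)  # timezone-aware datetime, offset preserved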

data_rentgen/server/services/run.py

Lines changed: 14 additions & 2 deletions
@@ -9,7 +9,7 @@
 from fastapi import Depends
 from sqlalchemy import Row
 
-from data_rentgen.db.models.run import Run
+from data_rentgen.db.models import Run, RunStatus
 from data_rentgen.dto.pagination import PaginationDTO
 from data_rentgen.services.uow import UnitOfWork
 
@@ -81,7 +81,13 @@ async def paginate(
         parent_run_id: UUID | None,
         search_query: str | None,
         job_type: Collection[str],
+        job_location_id: int | None,
         status: Collection[str],
+        started_by_user: str | None,
+        started_since: datetime | None,
+        started_until: datetime | None,
+        ended_since: datetime | None,
+        ended_until: datetime | None,
     ) -> RunServicePaginatedResult:
         pagination = await self._uow.run.paginate(
             page=page,
@@ -93,7 +99,13 @@ async def paginate(
             parent_run_id=parent_run_id,
             search_query=search_query,
             job_type=job_type,
-            status=status,
+            job_location_id=job_location_id,
+            status=[RunStatus[s] for s in status],
+            started_by_user=started_by_user,
+            started_since=started_since,
+            started_until=started_until,
+            ended_since=ended_since,
+            ended_until=ended_until,
         )
         run_ids = [item.id for item in pagination.items]
         input_stats = await self._uow.input.get_stats_by_run_ids(run_ids)
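
Note that the string-to-enum conversion moved from the repository up to the service: the HTTP layer still passes status names as strings, and an Enum["NAME"] lookup turns them into RunStatus members before the repository compares them against the column. A minimal sketch of that lookup; ToyRunStatus is hypothetical, not the real RunStatus:

from enum import Enum


class ToyRunStatus(str, Enum):  # hypothetical stand-in for RunStatus
    STARTED = "STARTED"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"


# Enum["NAME"] looks up members by name; unknown names raise KeyError,
# which is why the schema layer validates the values first.
statuses = [ToyRunStatus[s] for s in ["SUCCEEDED", "FAILED"]]
print(statuses)  # [ToyRunStatus.SUCCEEDED, ToyRunStatus.FAILED]
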
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+Add new filters for ``GET /v1/jobs`` endpoint.
+- location_id: ``int`` id to filter by specific location
+- job_type: ``list[str]`` filter by job's type
+
+Add new endpoint GET ``/v1/jobs/types`` - to get distinct job types from DataRentgen.
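
A hedged usage sketch for these filters; the base URL and the httpx client are assumptions, while the paths and parameter names come from the changelog entry above:

import httpx

BASE_URL = "http://localhost:8000"  # assumption: a locally running DataRentgen server

# Distinct job types known to DataRentgen.
job_types = httpx.get(f"{BASE_URL}/v1/jobs/types").json()

# Jobs filtered by location and type.
jobs = httpx.get(
    f"{BASE_URL}/v1/jobs",
    params={"location_id": 123, "job_type": ["SPARK_APPLICATION"]},
).json()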

docs/changelog/next_release/319.improvement.rst

Lines changed: 0 additions & 5 deletions
This file was deleted.
Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-Add new query parameters for ``/api/v1/runs`` endpoint:
+Add new query parameters for ``GET /v1/runs`` endpoint:
 
 - job_type: ``list[str]`` - filter by corresponding job type. For example ``SPARK_APLICATION``. You can use ``/api/v1/jobs/types`` to get all job types.
-- status:``list[RunStatus]`` - filter by runs statuses.
+- status: ``list[RunStatus]`` - filter by runs statuses.
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+Add new query parameters for ``GET /v1/runs`` endpoint:
+
+- started_since: ``datetime | None``
+- started_until: ``datetime | None``
+- ended_since: ``datetime | None``
+- ended_until: ``datetime | None``
+- job_location_id: ``int | None``
+- started_by_user: ``str | None``
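
A matching hedged sketch for the new run filters; again the base URL and the httpx client are assumptions, and the parameter names are those listed above:

import httpx

BASE_URL = "http://localhost:8000"  # assumption: a locally running DataRentgen server

# Runs started by a given user inside a time window.
runs = httpx.get(
    f"{BASE_URL}/v1/runs",
    params={
        "started_by_user": "someuser",
        "started_since": "2008-09-15T15:53:00+05:00",
        "ended_until": "2008-09-16T00:00:00+05:00",
        "job_location_id": 123,
    },
).json()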

tests/test_server/fixtures/factories/run.py

Lines changed: 88 additions & 42 deletions
@@ -12,6 +12,7 @@
 from tests.test_server.fixtures.factories.job import create_job
 from tests.test_server.fixtures.factories.job_type import create_job_type
 from tests.test_server.fixtures.factories.location import create_location
+from tests.test_server.fixtures.factories.user import create_user
 from tests.test_server.utils.delete import clean_db
 
 if TYPE_CHECKING:
@@ -188,50 +189,95 @@ async def runs_with_same_parent(
 @pytest_asyncio.fixture()
 async def runs_search(
     async_session_maker: Callable[[], AbstractAsyncContextManager[AsyncSession]],
-    user: User,
 ) -> AsyncGenerator[dict[str | None, Run], None]:
-    job_kwargs = [
-        {"name": "spark_application_name", "type": "SPARK_APPLICATION"},
-        {"name": "airflow_dag_name", "type": "AIRFLOW_DAG"},
-    ]
-    runs_kwargs = [
-        {"external_id": "application_1638922609021_0001", "status": RunStatus.KILLED},
-        {
-            "external_id": "application_1638922609021_0002",
-            "status": RunStatus.SUCCEEDED,
-        },
-        {"external_id": "extract_task_0001", "status": RunStatus.STARTED},
-        {"external_id": "extract_task_0002", "status": RunStatus.FAILED},
-    ]
-    started_at = datetime.now(tz=UTC)
+    created_at = datetime.now(tz=UTC)
     async with async_session_maker() as async_session:
-        jobs = []
-        for kwargs in job_kwargs:
-            location = await create_location(async_session)
-            job_type = await create_job_type(async_session, job_type_kwargs={"type": kwargs["type"]})
-            jobs.append(
-                await create_job(
-                    async_session,
-                    location_id=location.id,
-                    job_type_id=job_type.id,
-                    job_kwargs=kwargs,
-                ),
-            )
-        runs = [
-            await create_run(
-                async_session,
-                run_kwargs={
-                    "created_at": started_at + timedelta(seconds=0.1 * i),
-                    "job_id": job.id,
-                    "started_by_user_id": user.id,
-                    **kwargs,
-                },
-            )
-            for i, (job, kwargs) in enumerate(zip([job for job in jobs for _ in range(2)], runs_kwargs, strict=False))
-        ]
-
-        async_session.expunge_all()
-
+        spark_location = await create_location(async_session)
+        airflow_location = await create_location(async_session)
+
+        spark_user = await create_user(async_session)
+        airflow_user = await create_user(async_session)
+
+        spark_application_job_type = await create_job_type(async_session, job_type_kwargs={"type": "SPARK_APPLICATION"})
+        airflow_dag_job_type = await create_job_type(async_session, job_type_kwargs={"type": "AIRFLOW_DAG"})
+        airflow_task_job_type = await create_job_type(async_session, job_type_kwargs={"type": "AIRFLOW_TASK"})
+
+        spark_application = await create_job(
+            async_session,
+            location_id=spark_location.id,
+            job_type_id=spark_application_job_type.id,
+            job_kwargs={"name": "spark_application_name"},
+        )
+        airflow_dag = await create_job(
+            async_session,
+            location_id=airflow_location.id,
+            job_type_id=airflow_dag_job_type.id,
+            job_kwargs={"name": "airflow_dag_name"},
+        )
+        airflow_task = await create_job(
+            async_session,
+            location_id=airflow_location.id,
+            job_type_id=airflow_task_job_type.id,
+            job_kwargs={"name": "airflow_task_name"},
+        )
+
+        spark_app_run1 = await create_run(
+            async_session,
+            run_kwargs={
+                "job_id": spark_application.id,
+                "started_by_user_id": spark_user.id,
+                "external_id": "application_1638922609021_0001",
+                "status": RunStatus.KILLED,
+                "created_at": created_at + timedelta(seconds=0.1),
+                "started_at": created_at + timedelta(seconds=1),
+                "ended_at": created_at + timedelta(seconds=60),
+            },
+        )
+        spark_app_run2 = await create_run(
+            async_session,
+            run_kwargs={
+                "job_id": spark_application.id,
+                "started_by_user_id": spark_user.id,
+                "external_id": "application_1638922609021_0002",
+                "status": RunStatus.SUCCEEDED,
+                "created_at": created_at + timedelta(seconds=0.2),
+                "started_at": created_at + timedelta(seconds=2),
+                "ended_at": created_at + timedelta(seconds=120),
+            },
+        )
+
+        airflow_dag_run1 = await create_run(
+            async_session,
+            run_kwargs={
+                "job_id": airflow_dag.id,
+                "started_by_user_id": airflow_user.id,
+                "external_id": "dag_0001",
+                "status": RunStatus.STARTED,
+                "created_at": created_at + timedelta(seconds=0.3),
+                "started_at": created_at + timedelta(seconds=3),
+                "ended_at": None,
+            },
+        )
+        airflow_task_run1 = await create_run(
+            async_session,
+            run_kwargs={
+                "job_id": airflow_task.id,
+                "parent_run_id": airflow_dag_run1.id,
+                "started_by_user_id": airflow_user.id,
+                "external_id": "task_0001",
+                "status": RunStatus.FAILED,
+                "created_at": created_at + timedelta(seconds=0.4),
+                "started_at": created_at + timedelta(seconds=4),
+                "ended_at": created_at + timedelta(seconds=240),
+            },
+        )
+
+        runs = [
+            spark_app_run1,
+            spark_app_run2,
+            airflow_dag_run1,
+            airflow_task_run1,
+        ]
     yield {run.external_id: run for run in runs}
 
     async with async_session_maker() as async_session:
