get_run paginates tasks and iterations

gkiko10 · gkiko10 · commit 005dd7708b35 · 2024-11-08T15:28:25.000+01:00
diff --git a/databricks/sdk/mixins/jobs.py b/databricks/sdk/mixins/jobs.py
@@ -0,0 +1,49 @@
+from typing import Optional
+
+from databricks.sdk.service import jobs
+
+
+class JobsExt(jobs.JobsAPI):
+    """Extension of the generated Jobs API whose `get_run` merges all result pages into one `Run`."""
+
+    def get_run(self,
+                run_id: int,
+                *,
+                include_history: Optional[bool] = None,
+                include_resolved_values: Optional[bool] = None,
+                page_token: Optional[str] = None) -> jobs.Run:
+        """
+        This method fetches the details of a run identified by `run_id`. If the run has multiple pages of tasks or iterations,
+        it will paginate through all pages and aggregate the results.
+        :param run_id: int
+          The canonical identifier of the run for which to retrieve the metadata. This field is required.
+        :param include_history: bool (optional)
+          Whether to include the repair history in the response.
+        :param include_resolved_values: bool (optional)
+          Whether to include resolved parameter values in the response.
+        :param page_token: str (optional)
+          Page to start from: the value of a `next_page_token` or `prev_page_token` returned in an earlier
+          GetRun response. Every following page is fetched and merged automatically.
+        :returns: :class:`Run` with the tasks (or iterations) of all fetched pages merged into the first page's
+          object; its `next_page_token` and `prev_page_token` are cleared.
+        """
+        run = super().get_run(run_id,
+                              include_history=include_history,
+                              include_resolved_values=include_resolved_values,
+                              page_token=page_token)
+
+        # When querying a Job run, a page token is returned when there are more than 100 tasks. No iterations are defined for a Job run. Therefore, the next page in the response only includes the next page of tasks.
+        # When querying a ForEach task run, a page token is returned when there are more than 100 iterations. Only a single task is returned, corresponding to the ForEach task itself. Therefore, the client only reads the iterations from the next page and not the tasks.
+        # The decision is taken once, from the first page, and applied to every following page.
+        is_paginating_iterations = bool(run.iterations)
+
+        while run.next_page_token is not None:
+            next_run = super().get_run(run_id,
+                                       include_history=include_history,
+                                       include_resolved_values=include_resolved_values,
+                                       page_token=run.next_page_token)
+            # Both list fields are optional: a page that omits one yields None, which must be
+            # treated as "nothing to add" rather than passed to extend().
+            if is_paginating_iterations:
+                run.iterations.extend(next_run.iterations or [])
+            else:
+                if run.tasks is None:
+                    run.tasks = []
+                run.tasks.extend(next_run.tasks or [])
+            run.next_page_token = next_run.next_page_token
+
+        # The aggregated run is no longer a single page, so a "previous page" token is meaningless.
+        run.prev_page_token = None
+        return run
diff --git a/tests/test_jobs_mixin.py b/tests/test_jobs_mixin.py
@@ -0,0 +1,123 @@
+import json
+import re
+from typing import Pattern
+
+from databricks.sdk import WorkspaceClient
+
+
+def make_path_pattern(run_id: int, page_token: str) -> Pattern[str]:
+    """Build a regex matching the mocked `jobs/runs/get` URL for one run and page token.
+
+    The API version segment is matched as `2.<digit>`, so the pattern does not depend on the exact
+    minor version; host, path and query string are matched literally.
+
+    :param run_id: run id expected in the `run_id` query parameter.
+    :param page_token: token expected in the `page_token` query parameter.
+    :returns: compiled pattern for that URL.
+    """
+    base_url = re.escape("http://localhost/api/")
+    path_and_query = re.escape(f"/jobs/runs/get?page_token={page_token}&run_id={run_id}")
+    # The dot must be escaped: a bare `.` would accept any character between the version digits.
+    return re.compile(rf'{base_url}2\.\d{path_and_query}')
+
+
+def test_get_run_with_no_pagination(config, requests_mock):
+    """A mocked single-page response without page tokens comes back from `get_run` unchanged."""
+    only_page = {"tasks": [{"run_id": task_run_id} for task_run_id in (0, 1)]}
+    requests_mock.get(make_path_pattern(1337, "initialToken"), text=json.dumps(only_page))
+    client = WorkspaceClient(config=config)
+
+    fetched = client.jobs.get_run(1337, page_token="initialToken")
+
+    assert fetched.as_dict() == only_page
+
+
+def test_get_run_pagination_with_tasks(config, requests_mock):
+    """Tasks served over three mocked pages are concatenated in page order, without page tokens."""
+
+    def task_page(task_run_ids, next_token, prev_token):
+        # One mocked response page of a job run: a slice of tasks plus both page tokens.
+        return {
+            "tasks": [{"run_id": task_run_id} for task_run_id in task_run_ids],
+            "next_page_token": next_token,
+            "prev_page_token": prev_token,
+        }
+
+    # Keyed by the page token that must be requested to receive the page.
+    pages_by_token = {
+        "initialToken": task_page([0, 1], "tokenToSecondPage", "tokenToPreviousPage"),
+        "tokenToSecondPage": task_page([2, 3], "tokenToThirdPage", "initialToken"),
+        "tokenToThirdPage": task_page([4], None, "tokenToSecondPage"),
+    }
+    for token, page in pages_by_token.items():
+        requests_mock.get(make_path_pattern(1337, token), text=json.dumps(page))
+    client = WorkspaceClient(config=config)
+
+    merged = client.jobs.get_run(1337, page_token="initialToken")
+
+    assert merged.as_dict() == {"tasks": [{"run_id": task_run_id} for task_run_id in range(5)]}
+
+
+def test_get_run_pagination_with_iterations(config, requests_mock):
+    """Iterations served over three mocked pages are concatenated; the repeated task is kept once."""
+    # Every page repeats the same single task next to its slice of iterations.
+    repeated_task = [{"run_id": 1337}]
+    paging = [
+        # (token to request, iteration run ids on the page, next token, previous token)
+        ("initialToken", [0, 1], "tokenToSecondPage", "tokenToPreviousPage"),
+        ("tokenToSecondPage", [2, 3], "tokenToThirdPage", "initialToken"),
+        ("tokenToThirdPage", [4], None, "tokenToSecondPage"),
+    ]
+    for token, iteration_run_ids, next_token, prev_token in paging:
+        page = {
+            "tasks": repeated_task,
+            "iterations": [{"run_id": iteration_run_id} for iteration_run_id in iteration_run_ids],
+            "next_page_token": next_token,
+            "prev_page_token": prev_token,
+        }
+        requests_mock.get(make_path_pattern(1337, token), text=json.dumps(page))
+    client = WorkspaceClient(config=config)
+
+    merged = client.jobs.get_run(1337, page_token="initialToken")
+
+    assert merged.as_dict() == {
+        "tasks": repeated_task,
+        "iterations": [{"run_id": iteration_run_id} for iteration_run_id in range(5)],
+    }