
Commit 614386e

[Internal] Update Jobs GetJob API to support paginated responses (#869)
## What changes are proposed in this pull request?

Introduces logic in the extension for the Jobs GetJob call that paginates tasks and other arrays in the response. This change is necessary for SDK and API 2.2 compatibility: API 2.2 serves paginated responses as long as the next_page_token field is present in the response. The pagination logic is not exposed to the customer.

## How is this tested?

I enabled API 2.2 calls by modifying the URL string to /api/2.2/jobs/runs/get in databricks/sdk/service/jobs.py, then ran the unit tests from tests/test_jobs_mixin.py.
1 parent 1a1719a commit 614386e
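
From the caller's side the pagination is transparent. A minimal usage sketch (not part of the commit; it assumes WorkspaceClient auth is already configured via the environment or .databrickscfg, and uses an arbitrary job id):

    from databricks.sdk import WorkspaceClient

    w = WorkspaceClient()  # assumes credentials are configured outside this snippet

    # A single call: if the backend keeps returning next_page_token, the mixin fetches
    # the remaining pages and merges tasks, job_clusters, parameters, and environments
    # before returning the aggregated Job.
    job = w.jobs.get(job_id=1337)
    print(len(job.settings.tasks))  # tasks across all pages, not just the first page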

File tree

2 files changed: +160 -9 lines changed


databricks/sdk/mixins/jobs.py

Lines changed: 30 additions & 1 deletion
@@ -1,6 +1,7 @@
 from typing import Optional
 
 from databricks.sdk.service import jobs
+from databricks.sdk.service.jobs import Job
 
 
 class JobsExt(jobs.JobsAPI):
@@ -52,4 +53,32 @@ def get_run(self,
             run.repair_history.extend(next_run.repair_history)
             run.next_page_token = next_run.next_page_token
 
-        return run
+        return run
+
+    def get(self, job_id: int, *, page_token: Optional[str] = None) -> Job:
+        """Get a single job.
+
+        Retrieves the details for a single job. If the job has multiple pages of tasks, job_clusters, parameters or environments,
+        it will paginate through all pages and aggregate the results.
+
+        :param job_id: int
+          The canonical identifier of the job to retrieve information about. This field is required.
+        :param page_token: str (optional)
+          Use `next_page_token` returned from the previous GetJob to request the next page of the job's
+          sub-resources.
+
+        :returns: :class:`Job`
+        """
+        job = super().get(job_id, page_token=page_token)
+
+        # jobs/get response includes next_page_token as long as there are more pages to fetch.
+        while job.next_page_token is not None:
+            next_job = super().get(job_id, page_token=job.next_page_token)
+            # Each new page of jobs/get response includes the next page of the tasks, job_clusters, job_parameters, and environments.
+            job.settings.tasks.extend(next_job.settings.tasks)
+            job.settings.job_clusters.extend(next_job.settings.job_clusters)
+            job.settings.parameters.extend(next_job.settings.parameters)
+            job.settings.environments.extend(next_job.settings.environments)
+            job.next_page_token = next_job.next_page_token
+
+        return job
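
For intuition, the loop above folds a sequence of partial responses into a single Job. A hypothetical two-page exchange might look like this (the payloads are invented for illustration and are not real API output):

    # Hypothetical page 1: the first slice of tasks plus a token pointing at the next page.
    page1 = {"settings": {"tasks": [{"task_key": "a"}, {"task_key": "b"}]},
             "next_page_token": "page2"}

    # Hypothetical page 2: the remaining tasks and no token, which ends the while loop.
    page2 = {"settings": {"tasks": [{"task_key": "c"}]}}

    # After JobsExt.get() the aggregated job would contain tasks a, b, and c,
    # and job.next_page_token would be None.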

tests/test_jobs_mixin.py

Lines changed: 130 additions & 8 deletions
@@ -5,15 +5,21 @@
 from databricks.sdk import WorkspaceClient
 
 
-def make_path_pattern(run_id: int, page_token: str) -> Pattern[str]:
+def make_getrun_path_pattern(run_id: int, page_token: str) -> Pattern[str]:
     return re.compile(
         rf'{re.escape("http://localhost/api/")}2.\d{re.escape(f"/jobs/runs/get?page_token={page_token}&run_id={run_id}")}'
     )
 
 
+def make_getjob_path_pattern(job_id: int, page_token: str) -> Pattern[str]:
+    return re.compile(
+        rf'{re.escape("http://localhost/api/")}2.\d{re.escape(f"/jobs/get?job_id={job_id}&page_token={page_token}")}'
+    )
+
+
 def test_get_run_with_no_pagination(config, requests_mock):
     run1 = {"tasks": [{"run_id": 0}, {"run_id": 1}], }
-    requests_mock.get(make_path_pattern(1337, "initialToken"), text=json.dumps(run1))
+    requests_mock.get(make_getrun_path_pattern(1337, "initialToken"), text=json.dumps(run1))
     w = WorkspaceClient(config=config)
 
     run = w.jobs.get_run(1337, page_token="initialToken")
@@ -59,9 +65,9 @@ def test_get_run_pagination_with_tasks(config, requests_mock):
         "next_page_token": "tokenToThirdPage",
     }
     run3 = {"tasks": [{"run_id": 4}]}
-    requests_mock.get(make_path_pattern(1337, "initialToken"), text=json.dumps(run1))
-    requests_mock.get(make_path_pattern(1337, "tokenToSecondPage"), text=json.dumps(run2))
-    requests_mock.get(make_path_pattern(1337, "tokenToThirdPage"), text=json.dumps(run3))
+    requests_mock.get(make_getrun_path_pattern(1337, "initialToken"), text=json.dumps(run1))
+    requests_mock.get(make_getrun_path_pattern(1337, "tokenToSecondPage"), text=json.dumps(run2))
+    requests_mock.get(make_getrun_path_pattern(1337, "tokenToThirdPage"), text=json.dumps(run3))
     w = WorkspaceClient(config=config)
 
     run = w.jobs.get_run(1337, page_token="initialToken")
@@ -116,9 +122,9 @@ def test_get_run_pagination_with_iterations(config, requests_mock):
         "next_page_token": "tokenToThirdPage",
     }
     run3 = {"tasks": [{"run_id": 1337}], "iterations": [{"run_id": 4}], }
-    requests_mock.get(make_path_pattern(1337, "initialToken"), text=json.dumps(run1))
-    requests_mock.get(make_path_pattern(1337, "tokenToSecondPage"), text=json.dumps(run2))
-    requests_mock.get(make_path_pattern(1337, "tokenToThirdPage"), text=json.dumps(run3))
+    requests_mock.get(make_getrun_path_pattern(1337, "initialToken"), text=json.dumps(run1))
+    requests_mock.get(make_getrun_path_pattern(1337, "tokenToSecondPage"), text=json.dumps(run2))
+    requests_mock.get(make_getrun_path_pattern(1337, "tokenToThirdPage"), text=json.dumps(run3))
     w = WorkspaceClient(config=config)
 
     run = w.jobs.get_run(1337, page_token="initialToken")
@@ -139,3 +145,119 @@ def test_get_run_pagination_with_iterations(config, requests_mock):
             'run_id': 4
         }],
     }
+
+
+def test_get_job_with_no_pagination(config, requests_mock):
+    job1 = {"settings": {"tasks": [{"task_key": "taskKey1"}, {"task_key": "taskKey2"}], }}
+    requests_mock.get(make_getjob_path_pattern(1337, "initialToken"), text=json.dumps(job1))
+    w = WorkspaceClient(config=config)
+
+    job = w.jobs.get(1337, page_token="initialToken")
+
+    assert job.as_dict() == {"settings": {"tasks": [{"task_key": "taskKey1"}, {"task_key": "taskKey2"}], }}
+
+
+def test_get_job_pagination_with_tasks(config, requests_mock):
+    from databricks.sdk.service import compute, jobs
+    cluster_spec = compute.ClusterSpec(spark_version="11.3.x-scala2.12",
+                                       custom_tags={"ResourceClass": "SingleNode"},
+                                       num_workers=0,
+                                       node_type_id="Standard_DS3_v2",
+                                       )
+    cluster1 = jobs.JobCluster(job_cluster_key="cluster1", new_cluster=cluster_spec)
+    cluster2 = jobs.JobCluster(job_cluster_key="cluster2", new_cluster=cluster_spec)
+    cluster3 = jobs.JobCluster(job_cluster_key="cluster3", new_cluster=cluster_spec)
+    cluster4 = jobs.JobCluster(job_cluster_key="cluster4", new_cluster=cluster_spec)
+    job1 = {
+        "settings": {
+            "tasks": [{
+                "task_key": "taskKey1"
+            }, {
+                "task_key": "taskKey2"
+            }],
+            "job_clusters": [cluster1.as_dict(), cluster2.as_dict()],
+            "parameters": [{
+                "name": "param1",
+                "default": "default1"
+            }],
+            "environments": [{
+                "environment_key": "key1"
+            }, {
+                "environment_key": "key2"
+            }]
+        },
+        "next_page_token": "tokenToSecondPage"
+    }
+    job2 = {
+        "settings": {
+            "tasks": [{
+                "task_key": "taskKey3"
+            }, {
+                "task_key": "taskKey4"
+            }],
+            "job_clusters": [cluster3.as_dict(), cluster4.as_dict()],
+            "parameters": [{
+                "name": "param2",
+                "default": "default2"
+            }],
+            "environments": [{
+                "environment_key": "key3"
+            }]
+        },
+        "next_page_token": "tokenToThirdPage"
+    }
+    job3 = {
+        "settings": {
+            "tasks": [{
+                "task_key": "taskKey5"
+            }],
+            "parameters": [{
+                "name": "param3",
+                "default": "default3"
+            }]
+        },
+    }
+
+    requests_mock.get(make_getjob_path_pattern(1337, "initialToken"), text=json.dumps(job1))
+    requests_mock.get(make_getjob_path_pattern(1337, "tokenToSecondPage"), text=json.dumps(job2))
+    requests_mock.get(make_getjob_path_pattern(1337, "tokenToThirdPage"), text=json.dumps(job3))
+    w = WorkspaceClient(config=config)
+
+    job = w.jobs.get(1337, page_token="initialToken")
+
+    assert job.as_dict() == {
+        "settings": {
+            "tasks": [{
+                "task_key": "taskKey1"
+            }, {
+                "task_key": "taskKey2"
+            }, {
+                "task_key": "taskKey3"
+            }, {
+                "task_key": "taskKey4"
+            }, {
+                "task_key": "taskKey5"
+            }],
+            "job_clusters": [cluster1.as_dict(),
+                             cluster2.as_dict(),
+                             cluster3.as_dict(),
+                             cluster4.as_dict()],
+            "parameters": [{
+                "name": "param1",
+                "default": "default1"
+            }, {
+                "name": "param2",
+                "default": "default2"
+            }, {
+                "name": "param3",
+                "default": "default3"
+            }],
+            "environments": [{
+                "environment_key": "key1"
+            }, {
+                "environment_key": "key2"
+            }, {
+                "environment_key": "key3"
+            }]
+        }
+    }
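
The new cases can be run alongside the existing ones with pytest; for example (assuming pytest and the repository's test dependencies, such as requests-mock, are installed):

    import pytest

    # Select only the tests covering the new paginated jobs/get behavior.
    pytest.main(["tests/test_jobs_mixin.py", "-k", "get_job", "-v"])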
