Set upper bound for dj.data polling and fix intermittent no data errors (#1857)

shangyian · web-flow · commit 3774c627e440 · 2026-03-12T02:11:57.000-07:00
* Set an upper bound on the data polling interval
* Add a retry to fetch the final set of data after successful polling

* Add tests
diff --git a/datajunction-clients/python/datajunction/client.py b/datajunction-clients/python/datajunction/client.py
@@ -417,12 +417,21 @@ def _data(  # pylint: disable=too-many-arguments,too-many-locals
                     printed_links = True
                 progress_bar.title = f"Status: {job_state.value}"
 
-                # Update the polling interval
+                # Update the polling interval (cap at 10s to avoid long waits)
                 time.sleep(poll_interval)
-                poll_interval *= 2
+                poll_interval = min(poll_interval * 2, 10)
 
-            # Return results if the job has finished
+            # Return results if the job has finished. If the server returned
+            # FINISHED with empty results, re-poll up to 3 times (sleeping
+            # 1s, 2s, 4s) before returning whatever was last received.
             if job_state == models.QueryState.FINISHED:
+                if results and not results.get("results"):
+                    for attempt in range(3):
+                        time.sleep(2**attempt)
+                        response = self._session.get(path, params=params)
+                        results = response.json()
+                        if results.get("results"):
+                            break
                 return self.process_results(results)
             if job_state == models.QueryState.CANCELED:  # pragma: no cover
                 raise DJClientException("Query execution was canceled!")
diff --git a/datajunction-clients/python/tests/test_client.py b/datajunction-clients/python/tests/test_client.py
@@ -399,6 +399,76 @@ def test_data(self, client):
                 async_=True,
             )
 
+        # FINISHED with empty results triggers re-poll retry; data arrives on second poll
+        finished_empty = type(
+            "R",
+            (),
+            {
+                "json": lambda self: {
+                    "state": "FINISHED",
+                    "results": [],
+                    "errors": [],
+                    "links": [],
+                },
+                "status_code": 200,
+            },
+        )()
+        finished_with_data = type(
+            "R",
+            (),
+            {
+                "json": lambda self: {
+                    "state": "FINISHED",
+                    "results": [
+                        {
+                            "columns": [
+                                {
+                                    "name": "default_DOT_hard_hat_DOT_city",
+                                    "type": "str",
+                                    "semantic_type": "dimension",
+                                    "semantic_entity": "default.hard_hat.city",
+                                    "semantic_name": "default.hard_hat.city",
+                                    "node": "default.hard_hat",
+                                },
+                                {
+                                    "name": "default_DOT_avg_repair_price",
+                                    "type": "float",
+                                    "semantic_type": "metric",
+                                    "semantic_name": "default.avg_repair_price",
+                                    "node": "default.avg_repair_price",
+                                },
+                            ],
+                            "rows": [["Foo", 1.0], ["Bar", 2.0]],
+                        },
+                    ],
+                    "errors": [],
+                    "links": [],
+                },
+                "status_code": 200,
+            },
+        )()
+        original_get = client._session.get
+        call_count = [0]
+
+        def mock_get(path, params=None, **kwargs):
+            if "/data/" in str(path):
+                call_count[0] += 1
+                if call_count[0] == 1:
+                    return finished_empty
+                return finished_with_data
+            return original_get(path, params=params, **kwargs)
+
+        client._session.get = mock_get
+        result = client.data(
+            metrics=["default.avg_repair_price"],
+            dimensions=["default.hard_hat.city"],
+        )
+        client._session.get = original_get
+        assert list(result.columns) == [
+            "default.hard_hat.city",
+            "default.avg_repair_price",
+        ]
+
         # Error propagation
         # with pytest.raises(DJClientException) as exc_info:
         #     client.data(