Skip to content

Commit e047063

Browse files
committed
Merge branch 'boeker/handle-searchable-copies' of github.com:vespa-engine/pyvespa into boeker/handle-searchable-copies
2 parents 3c99fca + 4265c5d commit e047063

File tree

4 files changed

+166
-31
lines changed

4 files changed

+166
-31
lines changed

tests/unit/test_deployment.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,21 @@ def test_check_production_build_status_deploying(self, mock_request):
172172

173173
self.assertEqual(status, {"deployed": False, "status": "deploying"})
174174

175+
@patch("vespa.deployment.VespaCloud._request")
def test_wait_for_prod_deployment_raises_on_failed_job(self, mock_request):
    """A failed production job must abort polling with a RuntimeError."""
    # One zone succeeded while another failed: wait_for_prod_deployment
    # should stop immediately instead of polling until timeout.
    failing_status = {
        "deployed": False,
        "status": "deploying",
        "jobs": [
            {"jobName": "production-us-central-1", "runStatus": "success"},
            {"jobName": "production-us-east-3", "runStatus": "deploymentFailed"},
        ],
    }
    mock_request.return_value = failing_status

    with self.assertRaises(RuntimeError) as raised:
        self.vespa_cloud.wait_for_prod_deployment(456)
    # The error message must name the failing job and its run status.
    self.assertIn("production-us-east-3: deploymentFailed", str(raised.exception))
189+
175190
@patch("vespa.deployment.VespaCloud._try_get_access_token")
176191
def test_try_get_access_token(self, mock_get_token):
177192
mock_get_token.return_value = "fake_access_token"

tests/unit/test_evaluator.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3345,6 +3345,81 @@ def test_compute_recall(self):
33453345
delta=0.0001,
33463346
)
33473347

3348+
def test_compute_recall_id_field(self):
    """Recall is computed from the configured id_field inside hit fields."""

    def make_hits(count):
        # Hits whose document id is duplicated in a "fields" entry under "id",
        # exercising the id_field extraction path of _compute_recall.
        return [
            {"id": str(num), "fields": {"id": str(num)}}
            for num in range(1, count + 1)
        ]

    response_exact = self.SuccessfullMockVespaResponse(make_hits(5))
    # Identical exact/approximate result sets -> perfect recall.
    self.assertAlmostEqual(
        self.recall_evaluator._compute_recall(response_exact, response_exact),
        1.0,
        delta=0.0001,
    )

    # Approximate search misses one of the five exact hits -> recall 4/5.
    response_approx = self.SuccessfullMockVespaResponse(make_hits(4))
    self.assertAlmostEqual(
        self.recall_evaluator._compute_recall(response_exact, response_approx),
        0.8,
        delta=0.0001,
    )
3377+
3378+
class InternalIDResponse(MockVespaResponse):
    """Mock response whose plain hit ids are rewritten to Vespa-internal ids.

    Each plain string id is converted to the form
    index:[source]/[node-index]/[id], with the node index incrementing per
    converted hit, starting at first_node_id.
    """

    def __init__(
        self,
        hits,
        first_node_id=0,
        _total_count=None,
        _timing=None,
        _status_code=200,
    ):
        super().__init__(hits, _total_count, _timing, _status_code)
        # Node index to assign to the next hit that gets converted.
        self.next_node_num = first_node_id

    def add_namespace_to_hit_ids(self, hits_list) -> List[Dict[str, Any]]:
        """Rewrite plain string ids in place to internal "index:" ids."""
        converted = []
        for entry in hits_list:
            hit_id = entry.get("id")
            needs_rewrite = isinstance(hit_id, str) and not hit_id.startswith(
                "index:"
            )
            if needs_rewrite:
                # NOTE: mutates the caller's hit dict, matching the original.
                entry["id"] = f"index:cluster/{self.next_node_num}/{hit_id}"
                self.next_node_num += 1
            converted.append(entry)
        return converted

    def is_successful(self):
        return True
3402+
3403+
def test_compute_recall_internal_ids(self):
    """Recall matching falls back to the id part of internal "index:" ids."""
    exact_hits = [{"id": str(num)} for num in range(1, 6)]
    response_exact = self.InternalIDResponse(exact_hits, first_node_id=0)
    # Comparing a result set with itself must give full recall.
    self.assertAlmostEqual(
        self.recall_evaluator._compute_recall(response_exact, response_exact),
        1.0,
        delta=0.0001,
    )

    # A different starting node index (first_node_id=1) must not break
    # id matching; one missing hit out of five -> recall of 0.8.
    approx_hits = [{"id": str(num)} for num in range(1, 5)]
    response_approx = self.InternalIDResponse(approx_hits, first_node_id=1)
    self.assertAlmostEqual(
        self.recall_evaluator._compute_recall(response_exact, response_approx),
        0.8,
        delta=0.0001,
    )
3422+
33483423
def test_run(self):
33493424
class MockVespaApp:
33503425
def __init__(self, first_mock_responses, second_mock_responses):

vespa/deployment.py

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -925,32 +925,31 @@ def check_production_build_status(self, build_no: Optional[int]) -> dict:
925925
vespa_cloud = VespaCloud(...)
926926
build_no = vespa_cloud.deploy_to_prod()
927927
status = vespa_cloud.check_production_build_status(build_no)
928-
# This can yield one of three responses:
929-
# 1. If the revision (build_no), or higher, has successfully converged everywhere, and nothing older has then been deployed on top of that again. Nothing more will happen in this case.
930-
# {
931-
# "deployed": True,
932-
# "status": "done"
933-
# }
934-
935-
# 2. If the revision (build_no), or newer, has not yet converged, but the system is (most likely) still trying to deploy it. There is a point in polling again later when this is the response.
936-
# {
937-
# "deployed": False,
938-
# "status": "deploying"
939-
# }
940-
# 3. If the revision, or newer, has not yet converged everywhere, and it's never going to, because it was similar to the previous build, or marked obsolete by a user. There is no point in asking again for this revision.
941-
# {
942-
# "deployed": False,
943-
# "status": "done"
944-
# }
928+
# The response contains:
929+
# - "deployed" (bool): True if the build has converged everywhere.
930+
# - "status" (str): "deploying" or "done".
931+
# - "skipReason" (str, optional): Why the build was skipped, e.g. "no-changes" or "cancelled".
932+
# - "jobs" (list): Per-zone deployment details, each with "jobName" and "runStatus".
933+
#
934+
# Example responses:
935+
# 1. Successfully deployed everywhere:
936+
# {"deployed": True, "status": "done", "jobs": [{"jobName": "production-us-east-3", "runStatus": "success"}]}
937+
#
938+
# 2. Still deploying:
939+
# {"deployed": False, "status": "deploying", "jobs": [{"jobName": "production-us-east-3", "runStatus": "running"}]}
940+
#
941+
# 3. Skipped (no changes to deploy):
942+
# {"deployed": False, "status": "done", "skipReason": "no-changes", "jobs": []}
943+
#
944+
# 4. A job failed:
945+
# {"deployed": False, "status": "deploying", "jobs": [{"jobName": "production-us-east-3", "runStatus": "deploymentFailed"}]}
945946
```
946947
947948
Args:
948949
build_no (int): The build number to check.
949950
950951
Returns:
951-
dict: A dictionary with the aggregated status of all deployment jobs for the given build number. The dictionary contains:
952-
- "deployed" (bool): Whether the build has successfully converged.
953-
- "status" (str): The current status of the build ("done", "deploying").
952+
dict: The build status response from the API. See example responses above for the full shape.
954953
955954
Raises:
956955
RuntimeError: If there are issues with retrieving the status of the build.
@@ -993,23 +992,27 @@ def wait_for_prod_deployment(
993992
poll_interval (int, optional): Polling interval in seconds. Default is 5 seconds.
994993
995994
Returns:
996-
bool: True if the deployment is done and converged, False if the deployment has failed.
995+
bool: True if the build was deployed to all production zones, False if it completed
996+
without deploying (e.g. no changes).
997997
998998
Raises:
999+
RuntimeError: If any production job failed (e.g. deploymentFailed, installationFailed).
9991000
TimeoutError: If the deployment did not finish within `max_wait` seconds.
10001001
"""
10011002
start_time = time.time()
10021003
while time.time() - start_time < max_wait:
10031004
status = self.check_production_build_status(build_no)
1005+
failed_jobs = [
1006+
job for job in status.get("jobs", [])
1007+
if job["runStatus"] not in ("success", "running")
1008+
]
1009+
if failed_jobs:
1010+
failures = ", ".join(
1011+
f"{job['jobName']}: {job['runStatus']}" for job in failed_jobs
1012+
)
1013+
raise RuntimeError(f"Deployment failed: {failures}")
10041014
if status["status"] == "done":
10051015
return status["deployed"]
1006-
if "detailed-status" in status and status["detailed-status"] not in [
1007-
"success",
1008-
"running",
1009-
]:
1010-
raise RuntimeError(
1011-
f"The build failed with status code: {status['detailed-status']}"
1012-
)
10131016
time.sleep(poll_interval)
10141017
raise TimeoutError(f"Deployment did not finish within {max_wait} seconds. ")
10151018

vespa/evaluation/_base.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1885,6 +1885,7 @@ class VespaNNRecallEvaluator:
18851885
hits (int): Number of hits to use. Should match the parameter targetHits in the used ANN queries.
18861886
app (Vespa): An instance of the Vespa application.
18871887
query_limit (int): Maximum number of queries to determine the recall for. Defaults to 20.
1888+
id_field (str): Name of the field containing a unique id. Defaults to "id".
18881889
**kwargs (dict, optional): Additional HTTP request parameters. See: <https://docs.vespa.ai/en/reference/document-v1-api-reference.html#request-parameters>.
18891890
"""
18901891

@@ -1894,12 +1895,14 @@ def __init__(
18941895
hits: int,
18951896
app: Vespa,
18961897
query_limit: int = 20,
1898+
id_field: str = "id",
18971899
**kwargs,
18981900
):
18991901
self.queries = queries
19001902
self.hits = hits
19011903
self.app = app
19021904
self.query_limit = query_limit
1905+
self.id_field = id_field
19031906
self.parameters = kwargs
19041907

19051908
def _compute_recall(
@@ -1924,8 +1927,39 @@ def _compute_recall(
19241927
except KeyError:
19251928
results_approx = []
19261929

1927-
ids_exact = list(map(lambda x: x["id"], results_exact))
1928-
ids_approx = list(map(lambda x: x["id"], results_approx))
1930+
def extract_id(hit: dict, id_field: str) -> Tuple[str, str]:
1931+
"""Extract document ID from a Vespa hit."""
1932+
1933+
# id as specified by field
1934+
fields = hit.get("fields", {})
1935+
if id_field in fields:
1936+
return fields[id_field], "id_field"
1937+
1938+
# document id
1939+
id = hit.get("id", "")
1940+
if "::" in id:
1941+
return id, "document_id"
1942+
1943+
# internal id
1944+
if id.startswith(
1945+
"index:"
1946+
): # id is an internal id of the form index:[source]/[node-index]/[hex-gid], return hex-gid
1947+
return id.split("/", 2)[2], "internal_id"
1948+
1949+
raise ValueError(f"Could not extract a document id from hit: {hit}")
1950+
1951+
ids_exact = list(map(lambda x: extract_id(x, self.id_field)[0], results_exact))
1952+
ids_approx = list(
1953+
map(lambda x: extract_id(x, self.id_field)[0], results_approx)
1954+
)
1955+
1956+
id_types = set()
1957+
id_types.update(map(lambda x: extract_id(x, self.id_field)[1], results_exact))
1958+
id_types.update(map(lambda x: extract_id(x, self.id_field)[1], results_approx))
1959+
if len(id_types) > 1:
1960+
print(
1961+
f"Warning: Multiple id types obtained for hits: {id_types}. The recall computation will not be reliable. Please specify id_field correctly."
1962+
)
19291963

19301964
if len(ids_exact) != self.hits:
19311965
print(
@@ -2145,6 +2179,7 @@ class VespaNNParameterOptimizer:
21452179
benchmark_time_limit (int): Time in milliseconds to spend per bucket benchmark. Defaults to 5000.
21462180
recall_query_limit(int): Number of queries per bucket to compute the recall for. Defaults to 20.
21472181
max_concurrent(int): Number of queries to execute concurrently during benchmark/recall calculation. Defaults to 10.
2182+
id_field (str): Name of the field containing a unique id for recall computation. Defaults to "id".
21482183
"""
21492184

21502185
def __init__(
@@ -2157,6 +2192,7 @@ def __init__(
21572192
benchmark_time_limit: int = 5000,
21582193
recall_query_limit: int = 20,
21592194
max_concurrent: int = 10,
2195+
id_field: str = "id",
21602196
):
21612197
self.app = app
21622198
self.queries = queries
@@ -2170,6 +2206,7 @@ def __init__(
21702206
self.benchmark_time_limit = benchmark_time_limit
21712207
self.recall_query_limit = recall_query_limit
21722208
self.max_concurrent = max_concurrent
2209+
self.id_field = id_field
21732210

21742211
self.searchable_copies = None
21752212

@@ -2535,7 +2572,12 @@ def compute_average_recalls(self, **kwargs) -> BucketedMetricResults:
25352572
end="",
25362573
)
25372574
recall_evaluator = VespaNNRecallEvaluator(
2538-
bucket, self.hits, self.app, self.recall_query_limit, **kwargs
2575+
bucket,
2576+
self.hits,
2577+
self.app,
2578+
self.recall_query_limit,
2579+
self.id_field,
2580+
**kwargs,
25392581
)
25402582
recall_list = recall_evaluator.run()
25412583
results.append(recall_list)

0 commit comments

Comments
 (0)