[AVC] Fix parallel running of evals (Azure#14595)

tjprescott · web-flow · commit 2ca747dfb03d · 2026-03-19T13:21:03.000-07:00
* Fix parallel running of evals.

* Code review feedback.
diff --git a/packages/python-packages/apiview-copilot/evals/_runner.py b/packages/python-packages/apiview-copilot/evals/_runner.py
@@ -26,6 +26,7 @@
     load_recordings,
     save_recordings,
 )
+from src._credential import warm_up_credential
 from src._settings import SettingsManager
 
 DEFAULT_NUM_RUNS: int = 1
@@ -41,7 +42,7 @@ def __init__(self):
             "resource_group_name": self.settings.get("EVALS_RG"),
             "project_name": self.settings.get("EVALS_PROJECT_NAME"),
         }
-        self._credential_kwargs = self._create_credential_kwargs()
+        self.credential_kwargs = self._create_credential_kwargs()
         self._temp_files: list[Path] = []
         self._temp_files_lock = threading.Lock()
 
@@ -65,7 +66,7 @@ def _create_credential_kwargs(self) -> dict[str, Any]:
     def in_ci(self) -> bool:
         return bool(os.getenv("TF_BUILD"))
 
-    def _load_test_file(self, test_file: Path) -> dict:
+    def load_test_file(self, test_file: Path) -> dict:
         """Load test file - supports both JSON and YAML formats."""
         try:
             with test_file.open("r", encoding="utf-8") as f:
@@ -145,6 +146,10 @@ def __init__(self, *, num_runs: int = DEFAULT_NUM_RUNS, use_recording: bool = Fa
     def _ensure_context(self):
         if self._context is None:
             self._context = ExecutionContext()
+            # Pre-acquire a token so parallel workers find it cached
+            # instead of all racing to spawn az-cli subprocesses.
+            if not self._context.in_ci():
+                warm_up_credential()
 
     def run(self, discovery_result: DiscoveryResult) -> list[EvaluationResult]:
         """Execute all targets in the discovery result.
@@ -164,8 +169,10 @@ def run(self, discovery_result: DiscoveryResult) -> list[EvaluationResult]:
     def _run(self, discovery_result: DiscoveryResult) -> list[EvaluationResult]:
         """Run tests in parallel with progress tracking."""
         workflow_count = len(discovery_result.targets)
-        cpu_count = os.cpu_count() or 4
-        max_workers = min(cpu_count * 2, workflow_count)
+        # Limit concurrency to avoid overwhelming credential token
+        # acquisition (AzureCliCredential subprocess calls fail under
+        # heavy parallelism).
+        max_workers = min(4, workflow_count)
         results = []
         total_targets = len(discovery_result.targets)
 
@@ -219,7 +226,7 @@ def _execute_target(self, target: EvaluationTarget) -> EvaluationResult:
             test_file_paths = []
 
             for test_file in target.test_files:
-                test_case = self._context._load_test_file(test_file)
+                test_case = self._context.load_test_file(test_file)
                 test_file_to_case[test_file] = test_case
                 testcase_id = test_case.get("testcase")
                 if testcase_id:
@@ -300,7 +307,7 @@ def _run_azure_evaluation(self, testcases: list[dict], target: EvaluationTarget)
                 evaluator_config={"metrics": evaluator.evaluator_config},
                 target=evaluator.target_function,
                 fail_on_evaluator_errors=False,
-                **self._context._credential_kwargs,
+                **self._context.credential_kwargs,
             )
             results.append(result)
 
@@ -341,7 +348,7 @@ def show_results(self, results: list[EvaluationResult]):
                 passed_tests = []
                 partial_tests = []
                 raw = result.raw_results[0]
-                for filename, eval_result in raw.items():
+                for _, eval_result in raw.items():
                     for res in eval_result["rows"]:
                         testcase = res.get("inputs.testcase", "unknown")
                         score = res.get("outputs.metrics.score")
diff --git a/packages/python-packages/apiview-copilot/src/_credential.py b/packages/python-packages/apiview-copilot/src/_credential.py
@@ -6,7 +6,9 @@
 
 """Module for retrieving Azure credentials."""
 
+import logging
 import os
+import threading
 
 from azure.identity import (
     AzureCliCredential,
@@ -16,29 +18,60 @@
     ManagedIdentityCredential,
 )
 
+logger = logging.getLogger(__name__)
+
+_credential_cache = {"instance": None}
+_credential_lock = threading.Lock()
+
 
 def in_ci():
     """Check if the code is running in a CI environment."""
     return os.getenv("TF_BUILD", None) and "tests" in os.getenv("SYSTEM_DEFINITIONNAME", "")
 
 
 def get_credential():
-    """Get Azure credentials based on the environment."""
-    if in_ci():
-        # These are used by Azure Pipelines and should not be changed
-        service_connection_id = os.environ["AZURESUBSCRIPTION_SERVICE_CONNECTION_ID"]
-        client_id = os.environ["AZURESUBSCRIPTION_CLIENT_ID"]
-        tenant_id = os.environ["AZURESUBSCRIPTION_TENANT_ID"]
-        system_access_token = os.environ["SYSTEM_ACCESSTOKEN"]
-        return AzurePipelinesCredential(
-            service_connection_id=service_connection_id,
-            client_id=client_id,
-            tenant_id=tenant_id,
-            system_access_token=system_access_token,
-        )
-
-    return ChainedTokenCredential(
-        ManagedIdentityCredential(),
-        AzureCliCredential(),
-        AzureDeveloperCliCredential(),
-    )
+    """Get a shared Azure credential instance.
+
+    Returns a cached singleton so that concurrent threads reuse the same
+    credential instead of each spawning their own token-acquisition
+    subprocesses (which fails under high parallelism).
+    """
+    if _credential_cache["instance"] is not None:
+        return _credential_cache["instance"]
+
+    with _credential_lock:
+        if _credential_cache["instance"] is not None:
+            return _credential_cache["instance"]
+
+        if in_ci():
+            service_connection_id = os.environ["AZURESUBSCRIPTION_SERVICE_CONNECTION_ID"]
+            client_id = os.environ["AZURESUBSCRIPTION_CLIENT_ID"]
+            tenant_id = os.environ["AZURESUBSCRIPTION_TENANT_ID"]
+            system_access_token = os.environ["SYSTEM_ACCESSTOKEN"]
+            _credential_cache["instance"] = AzurePipelinesCredential(
+                service_connection_id=service_connection_id,
+                client_id=client_id,
+                tenant_id=tenant_id,
+                system_access_token=system_access_token,
+            )
+        else:
+            _credential_cache["instance"] = ChainedTokenCredential(
+                ManagedIdentityCredential(),
+                AzureCliCredential(),
+                AzureDeveloperCliCredential(),
+            )
+
+        return _credential_cache["instance"]
+
+
+def warm_up_credential():
+    """Pre-acquire a token so it is cached before parallel workers start.
+
+    This prevents a thundering-herd of concurrent subprocess calls to
+    ``az account get-access-token`` that fail under high parallelism.
+    """
+    credential = get_credential()
+    try:
+        credential.get_token("https://cognitiveservices.azure.com/.default")
+    except Exception as exc:
+        logger.warning("Credential warm-up failed: %s", exc)
diff --git a/packages/python-packages/apiview-copilot/src/_prompt_runner.py b/packages/python-packages/apiview-copilot/src/_prompt_runner.py
@@ -153,8 +153,8 @@ def _execute_prompt_template(
         file_path: Path to the .prompty file.
         inputs: Dictionary of input variables for template rendering.
         configuration: Optional configuration dict. If it contains an
-            ``api_key`` entry, an ``AzureKeyCredential`` is used instead
-            of ``DefaultAzureCredential``.
+            ``api_key`` entry, an ``AzureKeyCredential`` is used; otherwise,
+            the shared credential from ``get_credential()`` is used.
 
     Returns:
         The string response content from the model.
@@ -165,7 +165,7 @@ def _execute_prompt_template(
     from azure.ai.inference import ChatCompletionsClient
     from azure.ai.inference.models import SystemMessage, UserMessage
     from azure.core.credentials import AzureKeyCredential
-    from azure.identity import DefaultAzureCredential
+    from src._credential import get_credential
     from src._settings import SettingsManager
 
     config = _parse_prompty(file_path)
@@ -193,7 +193,8 @@ def _execute_prompt_template(
     # Format: {FOUNDRY_ENDPOINT}/models
     inference_endpoint = f"{foundry_endpoint.rstrip('/')}/models"
 
-    # Authenticate — prefer an explicit API key (used in CI), fall back to DefaultAzureCredential
+    # Authenticate — if an explicit API key is provided (e.g., in CI), use AzureKeyCredential;
+    # otherwise, fall back to the shared credential from get_credential().
     api_key = (configuration or {}).get("api_key")
     if api_key:
         credential = AzureKeyCredential(api_key)
@@ -202,7 +203,7 @@ def _execute_prompt_template(
             credential=credential,
         )
     else:
-        credential = DefaultAzureCredential()
+        credential = get_credential()
         # Specify the cognitive services scope for Azure AI
         client = ChatCompletionsClient(
             endpoint=inference_endpoint,