
Commit fa38e21

Evaluation: Disable local and remote tracing (Azure#38372)
* Disable tracing for target run
* fix black issue
* fix linting issue
* update tests
* disable local tracing
* update changelog
* fix black issue
1 parent bd4383f commit fa38e21

File tree

8 files changed, +29 -50 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 
 ### Other Changes
 - Refined error messages for serviced-based evaluators and simulators.
+- Tracing has been disabled due to Cosmos DB initialization issue.
 - Introduced environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
 - Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
 - For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
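A hedged sketch of where `randomize_order=True` might be passed when invoking `DirectAttackSimulator`. Apart from that flag, which the changelog entry names, the import path, constructor arguments, scenario value, and callback contract shown here are assumptions rather than content of this commit.

import asyncio

from azure.ai.evaluation.simulator import AdversarialScenario, DirectAttackSimulator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",        # placeholder values
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}


async def target_callback(messages, stream=False, session_state=None, context=None):
    # Stand-in target: a real callback would invoke the application under test.
    messages["messages"].append({"role": "assistant", "content": "placeholder response"})
    return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}


async def main():
    simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
    outputs = await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,
        target=target_callback,
        max_simulation_results=10,
        randomize_order=True,  # restores the randomized prompt-template order described in the changelog
    )
    print(outputs)


asyncio.run(main())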

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ class EvaluationRunProperties:
 
 PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
 PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+PF_DISABLE_TRACING = "PF_DISABLE_TRACING"
 
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
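For context, a minimal stdlib-only sketch (not SDK code) of how an environment-variable name constant such as `PF_DISABLE_TRACING` is typically consumed. The `tracing_disabled` helper is hypothetical; promptflow's own check may differ.

import os

PF_DISABLE_TRACING = "PF_DISABLE_TRACING"  # the constant added by this commit


def tracing_disabled() -> bool:
    # Hypothetical helper: treat the flag as a boolean-ish string toggle.
    return os.environ.get(PF_DISABLE_TRACING, "false").lower() == "true"


os.environ[PF_DISABLE_TRACING] = "true"
print(tracing_disabled())  # True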

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

Lines changed: 3 additions & 0 deletions
@@ -14,6 +14,7 @@
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+    PF_DISABLE_TRACING,
 )
 
 from ..._user_agent import USER_AGENT
@@ -49,6 +50,7 @@ def __enter__(self) -> None:
         if isinstance(self.client, ProxyClient):
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+            os.environ[PF_DISABLE_TRACING] = "true"
 
             if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
                 os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
@@ -76,6 +78,7 @@ def __exit__(
         if isinstance(self.client, ProxyClient):
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+            os.environ.pop(PF_DISABLE_TRACING, None)
 
             if self._is_batch_timeout_set_by_system:
                 os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
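A simplified, stdlib-only sketch of the enter/exit pattern `EvalRunContext` follows after this change: set process-wide flags (now including `PF_DISABLE_TRACING`) before the batch run and remove them afterwards. The `BatchRunEnv` class name is illustrative; only the environment-variable keys come from the diff.

import os
from types import TracebackType
from typing import Optional, Type

PF_DISABLE_TRACING = "PF_DISABLE_TRACING"
PF_FLOW_ENTRY_IN_TMP = "PF_FLOW_ENTRY_IN_TMP"


class BatchRunEnv:
    def __enter__(self) -> None:
        # Flags are set for the whole process while the batch run is active.
        os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
        os.environ[PF_DISABLE_TRACING] = "true"

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        # Remove the flags again so later code is unaffected.
        os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
        os.environ.pop(PF_DISABLE_TRACING, None)


with BatchRunEnv():
    assert os.environ[PF_DISABLE_TRACING] == "true"
assert PF_DISABLE_TRACING not in os.environ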

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py

Lines changed: 5 additions & 0 deletions
@@ -6,6 +6,7 @@
 from typing import Optional, Type
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._constants import PF_DISABLE_TRACING
 
 
 class TargetRunContext:
@@ -29,6 +30,8 @@ def __enter__(self) -> None:
         if not self._upload_snapshot:
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
 
+        os.environ[PF_DISABLE_TRACING] = "true"
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
@@ -39,3 +42,5 @@ def __exit__(
 
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+
+        os.environ.pop(PF_DISABLE_TRACING, None)
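An illustrative pytest-style check, not taken from the repo's test suite, of the contract the hunks above establish: `PF_DISABLE_TRACING` is set for the duration of `TargetRunContext` and removed on exit, independently of the snapshot-upload branch. The `upload_snapshot` keyword argument is an assumption about the constructor.

import os

from azure.ai.evaluation._constants import PF_DISABLE_TRACING
from azure.ai.evaluation._evaluate._batch_run.target_run_context import TargetRunContext


def test_target_run_context_toggles_tracing_flag():
    # Assumes the flag is not already set in the test environment.
    assert PF_DISABLE_TRACING not in os.environ
    with TargetRunContext(upload_snapshot=False):
        assert os.environ[PF_DISABLE_TRACING] == "true"
    assert PF_DISABLE_TRACING not in os.environ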

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_eval_run.py

Lines changed: 9 additions & 9 deletions
@@ -34,14 +34,15 @@
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
 except (ModuleNotFoundError, ImportError):
-    # If the above mentioned modules cannot be imported, we are running
-    # in local mode and MLClient in the constructor will be None, so
-    # we will not arrive to Azure-dependent code.
-
-    # We are logging the import failure only if debug logging level is set because:
-    # - If the project configuration was not provided this import is not needed.
-    # - If the project configuration was provided, the error will be raised by PFClient.
-    LOGGER.debug("promptflow.azure is not installed.")
+    raise EvaluationException(  # pylint: disable=raise-missing-from
+        message=(
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        ),
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.MISSING_PACKAGE,
+        blame=ErrorBlame.USER_ERROR,
+    )
 
 
 @dataclasses.dataclass
@@ -103,7 +104,6 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
     _SCOPE = "https://management.azure.com/.default"
 
     EVALUATION_ARTIFACT = "instance_results.jsonl"
-    EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"
 
     def __init__(
         self,
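The first hunk replaces a debug-level log with a hard failure when the optional Azure packages cannot be imported. A generic, stdlib-only sketch of that import-guard pattern follows; `RemoteTrackingError` is an illustrative stand-in for the SDK's `EvaluationException`, and the guarded import is reduced to a single module.

class RemoteTrackingError(Exception):
    pass


try:
    from azure.ai.ml import MLClient  # provided by the optional "remote" extra
except (ModuleNotFoundError, ImportError):
    # Fail fast with an actionable message instead of logging at debug level.
    raise RemoteTrackingError(
        "The required packages for remote tracking are missing.\n"
        'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
    ) from None

Because the guard sits at module level, a missing extra now surfaces as an exception when the module is imported rather than as a debug log entry.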

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 7 additions & 36 deletions
@@ -9,7 +9,7 @@
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
+from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
 
@@ -700,36 +700,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
-    # Target Run
-    try:
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-    # pylint: disable=raise-missing-from
-    except MissingAzurePackage:
-        msg = (
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        )
-
-        raise EvaluationException(  # pylint: disable=raise-missing-from
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_PACKAGE,
-            blame=ErrorBlame.USER_ERROR,
-        )
-
-    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-
-    # Handle the case where the customer manually run "pf config set trace.destination=none"
-    if trace_destination and trace_destination.lower() == "none":
-        trace_destination = None
-
+    pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None
 
     # Create default configuration for evaluators that directly maps
@@ -803,11 +774,7 @@ def eval_batch_run(
            # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
            # multiple evaluators. If the path is already absolute, abspath will return the original path.
            data = os.path.abspath(data)
-
-            # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-            # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-            # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-            per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+            per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
        else:
            data = input_data_df
            per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
@@ -849,6 +816,10 @@ def eval_batch_run(
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
+
+    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+    target_run = None
+    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
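After this change, `_evaluate` builds a plain `PFClient` and derives `trace_destination` from the project scope only when logging results. Below is a hedged sketch of what such a derivation could look like; the exact URI format produced by `_trace_destination_from_project_scope` is an assumption here, and only the "derive from `azure_ai_project`, else `None`" behavior comes from the diff.

from typing import Optional


def trace_destination_from_project_scope(azure_ai_project: Optional[dict]) -> Optional[str]:
    # Mirror the conditional in the last hunk: no project scope means no trace destination.
    if not azure_ai_project:
        return None
    # Assumed azureml workspace URI format.
    return (
        "azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}"
        "/providers/Microsoft.MachineLearningServices/workspaces/{project_name}"
    ).format(**azure_ai_project)


print(
    trace_destination_from_project_scope(
        {
            "subscription_id": "00000000-0000-0000-0000-000000000000",
            "resource_group_name": "my-rg",
            "project_name": "my-project",
        }
    )
)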

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py

Lines changed: 2 additions & 3 deletions
@@ -137,7 +137,7 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
+        artifact_name = EvalRun.EVALUATION_ARTIFACT
 
         with tempfile.TemporaryDirectory() as tmpdir:
             # storing multi_modal images if exists
@@ -164,9 +164,8 @@ def _log_metrics_and_instance_results(
         ev_run.write_properties_to_run_history(
             properties={
                 EvaluationRunProperties.RUN_TYPE: "eval_run",
-                EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
+                EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-                "isEvaluatorRun": "true",
             }
         )
 
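A stdlib-only sketch of the run-history properties payload after this change, mirroring the keys in the hunk above and the values asserted in the updated e2e tests; the literal property names stand in for the `EvaluationRunProperties` attributes.

import json

artifact_name = "instance_results.jsonl"  # EvalRun.EVALUATION_ARTIFACT

properties = {
    "runType": "eval_run",                              # EvaluationRunProperties.RUN_TYPE
    "_azureml.evaluation_run": "promptflow.BatchRun",   # EvaluationRunProperties.EVALUATION_RUN
    "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
}

print(json.dumps(properties, indent=2))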

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 1 addition & 2 deletions
@@ -631,7 +631,6 @@ def test_evaluate_track_in_cloud(
        remote_run = _get_run_from_run_history(run_id, azure_ml_client, project_scope)

        assert remote_run is not None
-        assert remote_run["runMetadata"]["properties"]["azureml.promptflow.local_to_cloud"] == "true"
        assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run"
        assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "promptflow.BatchRun"
        assert remote_run["runMetadata"]["displayName"] == evaluation_name
@@ -678,7 +677,7 @@ def test_evaluate_track_in_cloud_no_target(

        assert remote_run is not None
        assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run"
-        assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent"
+        assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "promptflow.BatchRun"
        assert remote_run["runMetadata"]["displayName"] == evaluation_name

    @pytest.mark.parametrize(
