Commit 5aa70de

needuv and kdestin authored
Address API Feedback on Type Hints (Azure#37826)
* remove dict from model_config and azure_ai_project * fix dict -> Dict * hide attributes on evaluators * Fix import issue * fix evaluate typehint * remove type hints from init and fix more __call__ typehints * add conversation schema * fix import error, type hints * fix docstring * address pylint * address pylint issues and mypy issues * fix import error * fix mypy issue * attempt to fix pylint issue * remove whitespace * fix type hint * remove type hint from class * style: Run isort * fix(typing): Rai based evaluators can return other than Dict[str, Union[str, float]] * fix(apiview): Match :rtype: with return type for apiview * run black * fix lint * rename EvaluateResult -> EvaluationResult * change data/output_path type hints from str -> Union[str, os.PathLike] * move experimental to common module * mark all RAI service based evaluators as experimental * expand evaluator return type hints to accomodate conversation return value * fix optional import in eval * fix pylint, run black * fix mypy * remove protected material eval * make eci experimental --------- Co-authored-by: kdestin <[email protected]>
1 parent: 697369e · commit: 5aa70de

38 files changed (+349 −262 lines)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -27,7 +27,10 @@
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
+    Conversation,
+    EvaluationResult,
     EvaluatorConfig,
+    Message,
     OpenAIModelConfiguration,
 )

@@ -57,4 +60,7 @@
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
     "EvaluatorConfig",
+    "Conversation",
+    "Message",
+    "EvaluationResult",
 ]
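
The newly exported names above are importable from the package root. A minimal sketch of using them, assuming azure-ai-evaluation is installed; at runtime these TypedDicts are plain dicts, and the field names follow the conversation docstrings elsewhere in this commit:

    from azure.ai.evaluation import Conversation, Message

    # Message/Conversation are TypedDicts, so instances are ordinary dicts.
    # Field names ("messages", "role", "content") follow the docstrings in this
    # commit; anything beyond that is an assumption.
    user_turn: Message = {"role": "user", "content": "What is the capital of France?"}
    assistant_turn: Message = {"role": "assistant", "content": "Paris."}
    conversation: Conversation = {"messages": [user_turn, assistant_turn]}
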
Lines changed: 4 additions & 2 deletions
@@ -27,11 +27,13 @@


 @overload
-def experimental(wrapped: Type[T]) -> Type[T]: ...
+def experimental(wrapped: Type[T]) -> Type[T]:
+    ...


 @overload
-def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ...
+def experimental(wrapped: Callable[P, T]) -> Callable[P, T]:
+    ...


 def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:
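
The change above only moves the `...` bodies onto their own lines; the overload pattern itself is standard typing. For context, here is a standalone sketch of the same pattern, not the SDK's actual implementation, showing how two typed stubs plus one runtime definition let the decorator preserve types for both classes and callables:

    from typing import Callable, Type, TypeVar, Union, overload

    from typing_extensions import ParamSpec

    T = TypeVar("T")
    P = ParamSpec("P")


    @overload
    def experimental(wrapped: Type[T]) -> Type[T]:
        ...


    @overload
    def experimental(wrapped: Callable[P, T]) -> Callable[P, T]:
        ...


    def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:
        # Hypothetical body: the real decorator marks the object as experimental
        # (e.g. emits a warning); here it simply returns the wrapped object.
        return wrapped
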

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@
 import re
 import time
 from ast import literal_eval
-from typing import Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Union, cast
 from urllib.parse import urlparse

 import jwt
@@ -413,7 +413,7 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str]

 async def evaluate_with_rai_service(
     query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
-) -> Dict[str, Union[str, float]]:
+) -> Dict[str, Any]:
     """ "Evaluate the content safety of the response using Responsible AI service

     :param query: The query to evaluate.
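
The widened Dict[str, Any] return matches the commit note that RAI-based evaluators can return more than Dict[str, Union[str, float]]. A hedged illustration of the kind of mixed payload that motivates it; the keys and values below are hypothetical, not the service's actual schema:

    from typing import Any, Dict

    # Hypothetical payload only; not the Responsible AI service's real schema.
    # A single result can mix numeric scores, string labels, and nested data,
    # which Dict[str, Union[str, float]] could not express.
    result: Dict[str, Any] = {
        "violence": "Very low",
        "violence_score": 0.0,
        "violence_reason": "No violent content detected.",
        "evaluation_per_turn": {"violence_score": [0.0, 1.0]},
    }
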

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

Lines changed: 4 additions & 1 deletion
@@ -1,13 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+
+# pylint: disable=protected-access
+
 import inspect
 import logging
 import math
 import os
+from collections import OrderedDict
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
-from collections import OrderedDict

 import pandas as pd
 from promptflow.client import PFClient

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 15 additions & 15 deletions
@@ -2,16 +2,16 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
-import json

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import MissingAzurePackage
 from promptflow.client import PFClient
 from promptflow.entities import Run
-from promptflow._sdk._errors import MissingAzurePackage

 from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -24,11 +24,10 @@
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
-    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
@@ -392,7 +391,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
@@ -404,7 +403,7 @@ def _apply_target_to_data(
     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
@@ -514,15 +513,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -547,7 +546,7 @@ def evaluate(
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype: dict
+    :rtype: ~azure.ai.evaluation.EvaluationResult

     :Example:

@@ -644,12 +643,12 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-) -> EvaluateResult:
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -683,7 +682,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
         )

-        raise EvaluationException(
+        raise EvaluationException(  # pylint: disable=raise-missing-from
             message=msg,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_PACKAGE,
@@ -818,7 +817,8 @@ def eval_batch_run(
             evaluation_name,
         )

-    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore

     if output_path:
         _write_output(output_path, result)
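
With data and output_path now accepting Union[str, os.PathLike] and the return annotated as EvaluationResult, a call site can pass pathlib.Path objects and index the result as a TypedDict. A minimal usage sketch; the dataset path, evaluator choice, and column expectations are placeholders, not part of this commit:

    import pathlib

    from azure.ai.evaluation import BleuScoreEvaluator, EvaluationResult, evaluate

    # Placeholder dataset: a JSONL file whose rows carry the columns the chosen
    # evaluator expects (here, "response" and "ground_truth" for BLEU).
    result: EvaluationResult = evaluate(
        data=pathlib.Path("data.jsonl"),            # os.PathLike is now accepted
        evaluators={"bleu": BleuScoreEvaluator()},
        output_path=pathlib.Path("results.json"),   # also os.PathLike
    )

    # EvaluationResult is a TypedDict, so results are addressed as dict keys.
    print(result["metrics"])     # aggregate metrics
    print(len(result["rows"]))   # per-row results
    print(result["studio_url"])  # populated only when azure_ai_project is set
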

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py

Lines changed: 5 additions & 5 deletions
@@ -16,10 +16,10 @@
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

 from ..._user_agent import USER_AGENT
-from .._utils import EvaluateResult, _trace_destination_from_project_scope
+from .._utils import _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)

@@ -97,17 +97,17 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
+def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P, EvaluateResult]
+    :rtype: Callable[P, EvaluationResult]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py

Lines changed: 1 addition & 7 deletions
@@ -7,7 +7,7 @@
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union
+from typing import Any, Dict, NamedTuple, Optional, Tuple, Union

 import pandas as pd
 from promptflow.client import PFClient
@@ -37,12 +37,6 @@ class AzureMLWorkspace(NamedTuple):
     workspace_name: str


-class EvaluateResult(TypedDict):
-    metrics: Dict[str, float]
-    studio_url: Optional[str]
-    rows: List[Dict]
-
-
 def is_none(value) -> bool:
     return value is None or str(value).lower() == "none"
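
The removed EvaluateResult TypedDict is replaced by the public EvaluationResult in _model_configurations (per the rename in the commit message). Presumably the new definition mirrors the removed one; a sketch under that assumption:

    from typing import Dict, List, Optional, TypedDict


    # Assumption: EvaluationResult mirrors the removed EvaluateResult fields.
    class EvaluationResult(TypedDict):
        metrics: Dict[str, float]
        studio_url: Optional[str]
        rows: List[Dict]
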

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The BLEU score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
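
A minimal call sketch matching the corrected Dict[str, float] return type; the result key is not shown in this diff, so treat it as illustrative:

    from azure.ai.evaluation import BleuScoreEvaluator

    bleu = BleuScoreEvaluator()
    score = bleu(
        response="Paris is the capital of France.",
        ground_truth="The capital of France is Paris.",
    )
    print(score)  # a Dict[str, float], e.g. {"bleu_score": 0.22} (key and value illustrative)
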

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py

Lines changed: 8 additions & 8 deletions
@@ -35,22 +35,22 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
         }
     """

-    PROMPTY_FILE = "coherence.prompty"
-    RESULT_KEY = "gpt_coherence"
+    _PROMPTY_FILE = "coherence.prompty"
+    _RESULT_KEY = "gpt_coherence"

     @override
-    def __init__(self, model_config: dict):
+    def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

     @override
     def __call__(
         self,
         *,
         query: Optional[str] = None,
         response: Optional[str] = None,
-        conversation: Optional[dict] = None,
+        conversation=None,
         **kwargs,
     ):
         """Evaluate coherence. Accepts either a query and response for a single evaluation,
@@ -64,8 +64,8 @@ def __call__(
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
             key "messages". Conversation turns are expected
             to be dictionaries with keys "content" and "role".
-        :paramtype conversation: Optional[Dict]
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The relevance score.
-        :rtype: Dict[str, float]
+        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
         """
         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
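
The widened return annotation reflects the two calling modes: a single query/response pair yields flat scores, while a conversation adds per-turn lists. A hedged usage sketch; the model configuration values are placeholders, and the result shapes follow the docstring rather than a spec:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

    # Placeholder configuration values.
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<resource>.openai.azure.com",
        api_key="<key>",
        azure_deployment="<deployment>",
    )
    coherence = CoherenceEvaluator(model_config)

    # Single-turn mode: flat scores, e.g. {"gpt_coherence": 4.0}.
    single = coherence(query="What is the capital of France?", response="Paris.")

    # Conversation mode: aggregate scores plus per-turn lists under "evaluation_per_turn".
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris."},
        ]
    }
    multi = coherence(conversation=conversation)
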

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 4 additions & 4 deletions
@@ -96,7 +96,7 @@ def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult
         :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
         :type kwargs: Dict
         :return: The evaluation result
-        :rtype: Dict
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         return async_run_allowing_running_loop(self._async_evaluator, **kwargs)

@@ -110,7 +110,7 @@ async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
         :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
         :type eval_input: Any
         :return: A single evaluation result
-        :rtype: Dict
+        :rtype: DoEvalResult[T_EvalValue]
         """

     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -254,7 +254,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
            values (including non-numerics) located in under the "evaluation_per_turn" key,
            which each sub-key being a metric and each sub-value being a the list of that metric's
            per-turn values.
-        :rtype: Dict
+        :rtype: AggregateResult[T_EvalValue]
         """

         aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
@@ -283,7 +283,7 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
         :keyword kwargs: The inputs to evaluate.
         :type kwargs: Dict
         :return: The evaluation result.
-        :rtype: Dict
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
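
For reference, the aggregate shape described in the docstring above (aggregated metrics at the top level, per-turn values under the "evaluation_per_turn" key) looks roughly like this; metric names and values are hypothetical:

    from typing import Dict, List, Union

    # Hypothetical values; the type matches the `aggregated` variable in the diff above.
    aggregated: Dict[str, Union[float, Dict[str, List[float]]]] = {
        "gpt_coherence": 3.5,                                  # aggregated metric
        "evaluation_per_turn": {"gpt_coherence": [3.0, 4.0]},  # per-turn values
    }
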
