Skip to content

Commit f3d9302

Browse files
authored
New package azure-ai-evaluation (#37031)
* Add new package azure-ai-evaluation * add manifest * fix manifest and typo in setup file * remove some references to promptflow-evals * address comments on ci * add extra files to manifest * clean up some __init__ * remove dependency upper bounds * add note about pf evals * add a pyproject * fix pylint * cspell for package * remove a readme * add required sections for readme * fix readme issues * add skip variables to CI * sync with latest code on main * slim out dependencies and add shared_requirements.txt * remove skip for analyze dependencies * fix package
1 parent 05c6eca commit f3d9302

File tree

83 files changed

+8236
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+8236
-0
lines changed

.vscode/cspell.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
"sdk/core/azure-servicemanagement-legacy/**",
6262
"sdk/core/corehttp/**",
6363
"sdk/digitaltwins/azure-digitaltwins-core/**",
64+
"sdk/evaluation/azure-ai-evaluation/tests/**",
6465
"sdk/eventhub/azure-eventhub-checkpointstoretable/**",
6566
"sdk/eventhub/azure-eventhub-checkpointstoreblob-aio/**",
6667
"sdk/eventhub/azure-eventhub/**",
@@ -1309,6 +1310,25 @@
13091310
"dtype"
13101311
]
13111312
},
1313+
{
1314+
"filename": "sdk/evaluation/azure-ai-evaluation/**",
1315+
"words": [
1316+
"raisvc",
1317+
"otel",
1318+
"otlp",
1319+
"aggr",
1320+
"mlflow",
1321+
"azureml",
1322+
"dcid",
1323+
"prompty",
1324+
"wsid",
1325+
"tkey",
1326+
"tparam",
1327+
"tqdm",
1328+
"ncols",
1329+
"datas"
1330+
]
1331+
},
13121332
{
13131333
"filename": "sdk/ai/azure-ai-generative/**",
13141334
"words": [
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Release History
2+
3+
## 1.0.0b1 (Unreleased)
4+
5+
### Features Added
6+
7+
- First preview
8+
- This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
recursive-include tests *.py
2+
include *.md
3+
include azure/__init__.py
4+
include azure/ai/__init__.py
5+
include azure/ai/evaluation/py.typed
6+
recursive-include azure/ai/evaluation *.prompty
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Azure AI Evaluation client library for Python
2+
3+
## Getting started
4+
5+
### Install the package
6+
7+
Install the Azure AI Evaluation library for Python with:
8+
9+
```bash
10+
pip install azure-ai-evaluation
11+
pip install azure-identity
12+
```
13+
14+
## Key concepts
15+
16+
Evaluators are custom or prebuilt classes or functions that are designed to measure the quality of the outputs from language models.
17+
18+
## Examples
19+
20+
Users can create evaluator runs on the local machine as shown in the example below:
21+
22+
```python
23+
import os
24+
from pprint import pprint
25+
26+
from promptflow.core import AzureOpenAIModelConfiguration
27+
28+
from azure.ai.evaluation.evaluate import evaluate
29+
from azure.ai.evaluation.evaluators import RelevanceEvaluator
30+
from azure.ai.evaluation.evaluators.content_safety import ViolenceEvaluator
31+
32+
33+
def answer_length(answer, **kwargs):
    """Example code-based evaluator: report the character count of *answer*."""
    length = len(answer)
    return {"value": length}
35+
36+
37+
if __name__ == "__main__":
    # Built-in evaluators
    # Initialize Azure OpenAI connection from environment variables.
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_key=os.environ.get("AZURE_OPENAI_KEY"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    )

    # Initializing Relevance Evaluator
    relevance_eval = RelevanceEvaluator(model_config)

    # Running Relevance Evaluator on a single input row
    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight.",
        question="Which tent is the most waterproof?",
    )

    pprint(relevance_score)
    # {'gpt_relevance': 5.0}

    # Content Safety Evaluator

    # Initialize Project Scope — replace the placeholders with your own
    # Azure subscription, resource group, and AI project names.
    project_scope = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group-name>",
        "project_name": "<project-name>",
    }

    violence_eval = ViolenceEvaluator(project_scope)
    violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
    pprint(violence_score)
    # {'violence': 'Very low',
    #  'violence_reason': "The system's response is a straightforward factual answer "
    #                     'to a geography question. There is no violent content or '
    #                     'language present.',
    #  'violence_score': 0}

    # Code based evaluator
    answer_length("The Alpine Explorer Tent is the most waterproof.")
    # {'value': 48}

    # Using multiple evaluators together using the `evaluate` API

    result = evaluate(
        data="evaluate_test_data.jsonl",
        evaluators={
            "answer_length": answer_length,
            "violence": violence_eval,
        },
    )

    pprint(result)
94+
```
95+
96+
## Troubleshooting
97+
98+
## Next steps
99+
100+
## Contributing
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

# pkgutil-style namespace package shim: extend this package's __path__ so
# that other distributions shipping modules under the same "azure"
# namespace can be installed and imported alongside this one.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)  # type: ignore
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

# pkgutil-style namespace package shim: extend this package's __path__ so
# that other distributions shipping modules under the same "azure.ai"
# namespace can be installed and imported alongside this one.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)  # type: ignore
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# ---------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# ---------------------------------------------------------
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

# To minimize relative imports in our evaluators, the scope of this package also includes anything
# that would have otherwise been a relative import scoped to single evaluator directories.

from . import constants
from .rai_service import evaluate_with_rai_service
from .utils import get_harm_severity_level

# Public surface of this shared helper package: the constants module,
# the RAI-service evaluation entry point, and the harm-severity helper.
__all__ = [
    "get_harm_severity_level",
    "evaluate_with_rai_service",
    "constants",
]
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# ---------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# ---------------------------------------------------------
4+
from enum import Enum
5+
6+
7+
class CommonConstants:
    """Constants shared across the evaluation package."""

    # Default timeout, in seconds, applied to outgoing HTTP requests.
    DEFAULT_HTTP_TIMEOUT = 60
11+
12+
13+
class RAIService:
    """Settings used when calling the Responsible AI (RAI) service."""

    # Overall time budget, in seconds, for a service operation.
    TIMEOUT = 1800
    # Pause, in seconds, between successive polls of the service.
    SLEEP_TIME = 2
    # Severity scores at or above this value are considered harmful.
    HARM_SEVERITY_THRESHOLD = 4
19+
20+
21+
class HarmSeverityLevel(Enum):
    """Human-readable harm severity buckets, listed from least to most severe."""

    # Member definition order is preserved by Enum iteration, so keep
    # these ordered least → most severe.
    VeryLow = "Very low"
    Low = "Low"
    Medium = "Medium"
    High = "High"
28+
29+
30+
class Tasks:
    """Annotation task identifiers accepted by the RAI service."""

    # Content-harm annotation (hate, violence, self-harm, sexual).
    CONTENT_HARM = "content harm"
    # Protected-material detection.
    PROTECTED_MATERIAL = "protected material"
35+
36+
37+
class _InternalAnnotationTasks:
38+
"""Defines types of annotation tasks that are supported for internal use. Such tasks are
39+
experimental and subject to potential change or migration to the main Evaluation Metrics
40+
enum over time."""
41+
42+
ECI = "eci"
43+
44+
45+
class EvaluationMetrics:
    """Metric names used both when requesting evaluations from the RAI
    service and when presenting results back to the user."""

    HATE_FAIRNESS = "hate_fairness"
    HATE_UNFAIRNESS = "hate_unfairness"
    VIOLENCE = "violence"
    SELF_HARM = "self_harm"
    SEXUAL = "sexual"
    PROTECTED_MATERIAL = "protected_material"
55+
56+
57+
class _InternalEvaluationMetrics:
58+
"""Evaluation metrics that are not publicly supported.
59+
These metrics are experimental and subject to potential change or migration to the main
60+
enum over time.
61+
"""
62+
63+
ECI = "eci"

0 commit comments

Comments
 (0)