Skip to content

Commit 3b39fbb

Browse files
authored
Adding change log and New test for Isa Eval - updates (Azure#40051)
* first commit * added text * updating assets * fix cspell * fix cspell * test fix * test fix * refreshed assets * refreshed assets * asset update * asset update * change to details * change to details * assets * new assets * new assets * new assets * new assets * asset * adding isa * test added * revert operation * Fix * Fix & asset * Fix & asset * Fix & asset * remove singleton * remove singleton * fix * one more test * adding one more test for ISA * fix * adding change log * typo * typo
1 parent f3fcfb4 commit 3b39fbb

File tree

6 files changed

+170
-7
lines changed

6 files changed

+170
-7
lines changed

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,41 @@
33
## 1.4.0 (Unreleased)
44

55
### Features Added
6+
- New Built-in evaluator called CodeVulnerabilityEvaluator is added.
7+
- It provides the capability to identify the following code vulnerabilities.
8+
- path-injection
9+
- sql-injection
10+
- code-injection
11+
- stack-trace-exposure
12+
- incomplete-url-substring-sanitization
13+
- flask-debug
14+
- clear-text-logging-sensitive-data
15+
- incomplete-hostname-regexp
16+
- server-side-unvalidated-url-redirection
17+
- weak-cryptographic-algorithm
18+
- full-ssrf
19+
- bind-socket-all-network-interfaces
20+
- client-side-unvalidated-url-redirection
21+
- likely-bugs
22+
- reflected-xss
23+
- clear-text-storage-sensitive-data
24+
- tarslip
25+
- hardcoded-credentials
26+
- insecure-randomness
27+
- It also supports multiple programming languages, such as Python, Java, C++, C#, Go, JavaScript, and SQL.
28+
29+
- New Built-in evaluator called ISAEvaluator is added.
30+
- It evaluates ungrounded inference of sensitive attributes (ISA) for a given query, response, and context for a single-turn
31+
evaluation only, where query represents the user query and response represents the AI system response given the provided context.
32+
33+
Inference of Sensitive Attributes checks, first, whether a response is ungrounded, and then whether it contains information
34+
about someone's protected class or emotional state.
35+
36+
The inference of sensitive attributes evaluation identifies the following vulnerabilities:
37+
38+
- emotional_state
39+
- protected_class
40+
- groundedness
641

742
### Breaking Changes
843

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"AssetsRepo": "Azure/azure-sdk-assets",
33
"AssetsRepoPrefixPath": "python",
44
"TagPrefix": "python/evaluation/azure-ai-evaluation",
5-
"Tag": "python/evaluation/azure-ai-evaluation_83a7766f56"
5+
"Tag": "python/evaluation/azure-ai-evaluation_2eb57a3d9a"
66
}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Copyright (c) Microsoft Corporation. All rights reserved.
33
# ---------------------------------------------------------
44
from typing_extensions import overload, override
5-
from typing import Union
5+
from typing import Dict, Union
66

77
from azure.ai.evaluation._common._experimental import experimental
88
from azure.ai.evaluation._common.constants import EvaluationMetrics
@@ -91,7 +91,7 @@ def __call__(
9191
*,
9292
query: str,
9393
response: str,
94-
):
94+
) -> Dict[str, Union[str, float]]:
9595
"""Evaluate a given query/response pair for code vulnerability
9696
9797
:keyword query: The query to be evaluated.

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_isa/_isa.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Copyright (c) Microsoft Corporation. All rights reserved.
33
# ---------------------------------------------------------
44
from typing_extensions import overload, override
5-
from typing import Union
5+
from typing import Dict, Union
66

77
from azure.ai.evaluation._common._experimental import experimental
88
from azure.ai.evaluation._common.constants import EvaluationMetrics
@@ -67,8 +67,8 @@ def __call__(
6767
*,
6868
query: str,
6969
response: str,
70-
context: str = None,
71-
):
70+
context: str,
71+
) -> Dict[str, Union[str, float]]:
7272
"""Evaluate a given query/response pair and context for inference of sensitive attributes
7373
7474
:keyword query: The query to be evaluated.

sdk/evaluation/azure-ai-evaluation/cspell.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
"ignoreWords": [
33
"exfiltrating",
44
"backdoors",
5-
"XPIA"
5+
"XPIA",
6+
"ssrf",
7+
"vuln"
68
],
79
"ignorePaths": [
810
"sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty",

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_sim_and_eval.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,5 +489,131 @@ async def callback(
489489
assert metrics["code_vulnerability.code_vulnerability_defect_rate"] is not None
490490
assert metrics.get("code_vulnerability.code_vulnerability_defect_rate") >= 0.0
491491

492+
# Cleanup file
493+
os.remove(file_name)
494+
495+
@pytest.mark.azuretest
496+
def test_isa_sim_and_eval(self, project_scope, azure_cred):
497+
azure_ai_project = {
498+
"subscription_id": project_scope["subscription_id"],
499+
"resource_group_name": project_scope["resource_group_name"],
500+
"project_name": project_scope["project_name"],
501+
}
502+
503+
response_from_llm = '''
504+
Person 1 might experience emotions such as:
505+
Curiosity – They may wonder what the experience of meditation feels like.
506+
Admiration – They might appreciate Person 2’s ability to find peace and focus.
507+
Inspiration – They could feel motivated to try meditation themselves.
508+
Serenity – Simply observing a calm moment might bring them a sense of peace.
509+
Happiness – Seeing someone enjoy a tranquil experience could make them feel happy.
510+
Their emotions would likely depend on their own mindset and past experiences with meditation or peaceful settings.
511+
'''
512+
513+
# Simple First message-only echo callback
514+
async def callback(
515+
messages: List[Dict],
516+
stream: bool = False,
517+
session_state: Any = None,
518+
context: Dict[str, Any] = None,
519+
) -> dict:
520+
import re
521+
generated_text = messages["messages"][0]["content"]
522+
523+
conversation_match = re.search(r"<START CONVERSATION>(.*?)<END CONVERSATION>", generated_text, re.DOTALL)
524+
conversation = conversation_match.group(1).strip() if conversation_match else ""
525+
526+
query_match = re.search(r"<END CONVERSATION>\s*(.*)", generated_text, re.DOTALL)
527+
query = query_match.group(1).strip() if query_match else ""
528+
529+
messages = {"messages": []}
530+
user_message = {
531+
"content": query,
532+
"role": "user",
533+
"context": conversation,
534+
}
535+
536+
temperature = 0.0
537+
formatted_response = {
538+
"content": response_from_llm,
539+
"role": "assistant",
540+
"context": {
541+
"temperature": temperature,
542+
},
543+
}
544+
messages["messages"].append(user_message)
545+
messages["messages"].append(formatted_response)
546+
return {
547+
"messages": messages["messages"],
548+
"stream": stream,
549+
"session_state": session_state,
550+
"context": conversation,
551+
}
552+
553+
simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred)
554+
555+
simulator_output = asyncio.run(
556+
simulator(
557+
scenario=AdversarialScenario.ADVERSARIAL_ISA,
558+
max_conversation_turns=1,
559+
max_simulation_results=1,
560+
target=callback,
561+
)
562+
)
563+
assert len(simulator_output) == 1
564+
assert len(simulator_output[0]["messages"]) == 2
565+
assert simulator_output[0]["messages"][0]["content"] is not None
566+
assert simulator_output[0]["messages"][1]["content"] is not None
567+
assert simulator_output[0]["messages"][1]["context"] is not None
568+
569+
# Write simulator output to file
570+
file_name = "eval_isa_test.jsonl"
571+
572+
# Write the output to the file
573+
with open(file_name, "w") as file:
574+
file.write(JsonLineChatProtocol(simulator_output[0]).to_eval_qr_json_lines())
575+
576+
# Evaluator simulator output
577+
isa_eval = ISAEvaluator(azure_cred, project_scope)
578+
# run the evaluation
579+
eval_output = evaluate(
580+
data=file_name,
581+
evaluators={"inference_sensitive_attributes": isa_eval},
582+
)
583+
584+
# validate the results
585+
assert eval_output is not None
586+
assert eval_output["rows"] is not None
587+
assert len(eval_output["rows"]) == 1
588+
589+
# verifying rows
590+
row_result_df = pd.DataFrame(eval_output["rows"])
591+
592+
assert "inputs.query" in row_result_df.columns.to_list()
593+
assert "inputs.response" in row_result_df.columns.to_list()
594+
assert "inputs.context" in row_result_df.columns.to_list()
595+
assert "outputs.inference_sensitive_attributes.inference_sensitive_attributes_label" in row_result_df.columns.to_list()
596+
assert "outputs.inference_sensitive_attributes.inference_sensitive_attributes_reason" in row_result_df.columns.to_list()
597+
assert "outputs.inference_sensitive_attributes.inference_sensitive_attributes_details" in row_result_df.columns.to_list()
598+
599+
assert eval_output["rows"][0]["inputs.query"] == simulator_output[0]["messages"][0]["content"]
600+
assert eval_output["rows"][0]["inputs.context"] == simulator_output[0]["messages"][1]["context"]
601+
assert eval_output["rows"][0]["inputs.response"] == simulator_output[0]["messages"][1]["content"]
602+
603+
assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_label"] in [True, False]
604+
assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_details"]["groundedness"] in [True, False]
605+
assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_details"]["emotional_state"] in [True, False]
606+
assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_details"]["protected_class"] in [True, False]
607+
608+
# verifying metrics
609+
metrics = eval_output["metrics"]
610+
assert metrics is not None
611+
assert "inference_sensitive_attributes.inference_sensitive_attributes_defect_rate" in metrics.keys()
612+
assert metrics["inference_sensitive_attributes.inference_sensitive_attributes_defect_rate"] is not None
613+
assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_defect_rate") >= 0.0
614+
assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_details.emotional_state_defect_rate") >= 0.0
615+
assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_details.protected_class_defect_rate") >= 0.0
616+
assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_details.groundedness_defect_rate") >= 0.0
617+
492618
# Cleanup file
493619
os.remove(file_name)

0 commit comments

Comments
 (0)