Commit ae6ff78

aprilk-ms and Copilot authored

Bump version for builtin evaluators after azure_ai_project removal (#4897)

Bump versions for all 11 evaluators that had azure_ai_project removed:

- code_vulnerability: 2 -> 3
- groundedness_pro: 6 -> 7
- hate_unfairness: 2 -> 3
- indirect_attack: 2 -> 3
- prohibited_actions: 4 -> 5
- protected_material: 2 -> 3
- self_harm: 2 -> 3
- sensitive_data_leakage: 4 -> 5
- sexual: 2 -> 3
- ungrounded_attributes: 2 -> 3
- violence: 2 -> 3

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

1 parent 052a46a commit ae6ff78
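The commit applies one mechanical change per file: incrementing the integer `version:` field in each evaluator's `spec.yaml`. A minimal sketch of how such a bump could be scripted (the regex approach, function names, and repo-root layout are assumptions for illustration, not the repository's actual tooling):

```python
import re
from pathlib import Path

# Evaluators whose spec.yaml version is bumped in this commit.
EVALUATORS = [
    "code_vulnerability", "groundedness_pro", "hate_unfairness",
    "indirect_attack", "prohibited_actions", "protected_material",
    "self_harm", "sensitive_data_leakage", "sexual",
    "ungrounded_attributes", "violence",
]

def bump_version(spec_text: str) -> str:
    """Increment the integer `version:` field in a spec.yaml body."""
    def repl(m: re.Match) -> str:
        return f"{m.group(1)}{int(m.group(2)) + 1}"
    # [ \t]*$ (not \s*$) so the trailing newline is preserved.
    return re.sub(r"^(version:\s*)(\d+)[ \t]*$", repl, spec_text,
                  count=1, flags=re.M)

def bump_all(root: Path) -> None:
    # Hypothetical layout matching the paths shown in this diff.
    for name in EVALUATORS:
        spec = root / "assets" / "evaluators" / "builtin" / name / "spec.yaml"
        spec.write_text(bump_version(spec.read_text()))
```

Keeping the edit to a single regex substitution leaves the rest of each YAML file byte-for-byte unchanged, which is why every hunk below is exactly one addition and one deletion.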

File tree

11 files changed (+11 / -11 lines)

assets/evaluators/builtin/code_vulnerability/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.code_vulnerability"
-version: 2
+version: 3
 displayName: "Code-Vulnerability-Evaluator"
 description: "Assesses whether generated code contains potential security flaws. Lower scores indicate safer, more secure code. Use this metric in code generation, security auditing, and software quality assurance."
 evaluatorType: "builtin"

assets/evaluators/builtin/groundedness_pro/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.groundedness_pro"
-version: 6
+version: 7
 displayName: "Service-Groundedness-Evaluator"
 description: "Checks if the response is properly grounded in provided sources. A true result means it aligns with reliable context. It’s best used for retrieval-augmented generation question and answering (RAG QA). Use the groundedness pro metric when you need to verify that AI-generated responses align with and are validated by the provided context."
 evaluatorType: "builtin"

assets/evaluators/builtin/hate_unfairness/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.hate_unfairness"
-version: 2
+version: 3
 displayName: "Hate-and-Unfairness-Evaluator"
 description: "Detects hateful, biased, or unfair language in the response. Lower scores indicate more neutral and inclusive content."
 evaluatorType: "builtin"

assets/evaluators/builtin/indirect_attack/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.indirect_attack"
-version: 2
+version: 3
 displayName: "Indirect-Attack-Evaluator"
 description: "Identifies subtle or indirect attacks toward individuals or groups. Lower scores indicate more respectful and non‑aggressive tone."
 evaluatorType: "builtin"

assets/evaluators/builtin/prohibited_actions/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.prohibited_actions"
-version: 4
+version: 5
 displayName: "Prohibited-Actions-Evaluator"
 description: "Flags content that suggests or enables actions violating safety guidelines. Helps prevent risky or disallowed behaviors."
 evaluatorType: "builtin"

assets/evaluators/builtin/protected_material/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.protected_material"
-version: 2
+version: 3
 displayName: "Protected-Material-Evaluator"
 description: "Detects use of copyrighted or proprietary content in responses. Ensures generated text stays within acceptable use boundaries."
 evaluatorType: "builtin"

assets/evaluators/builtin/self_harm/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.self_harm"
-version: 2
+version: 3
 displayName: "Self-Harm-Related-Content-Evaluator"
 description: "Detects language that promotes or discusses self‑harm. Lower scores indicate safer, more supportive content."
 evaluatorType: "builtin"

assets/evaluators/builtin/sensitive_data_leakage/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.sensitive_data_leakage"
-version: 4
+version: 5
 displayName: "Sensitive-Data-Leakage-Evaluator"
 description: "Tests whether an AI system leaks sensitive or private data (e.g., financial, medical, or PII) when exposed to direct or obfuscated adversarial queries. Use it to detect and classify leakage risk levels—ranging from benign direct queries to high-severity outputs containing realistic sensitive information."
 evaluatorType: "builtin"

assets/evaluators/builtin/sexual/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.sexual"
-version: 2
+version: 3
 displayName: "Sexual-Content-Evaluator"
 description: "Detects sexual or explicit content in responses. Lower scores indicate safer and more appropriate language."
 evaluatorType: "builtin"

assets/evaluators/builtin/ungrounded_attributes/spec.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.ungrounded_attributes"
-version: 2
+version: 3
 displayName: "Ungrounded-Attributes-Evaluator"
 description: "Identifies details added by the model that are not supported by provided data. Helps catch hallucinated or made‑up information. This evaluator is useful for evaluating summarization, reporting, and generative ai systems where factual grounding is critical."
 evaluatorType: "builtin"
