
Commit 4c47dae

[evaluation] Evaluator param name change - question/answer -> query/response (#37462)
* question -> query
* answer -> response
* questions, answers -> queries and responses
* remove unneeded flow.tools.json files
* remove another flow.tools.json file
1 parent 9978071 commit 4c47dae
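In caller terms, the rename is a straight keyword-argument swap on the built-in evaluators. A minimal sketch against the updated API, assuming a prompt-based evaluator such as RelevanceEvaluator; the AzureOpenAIModelConfiguration field values are placeholders and its field names are assumed rather than taken from this commit:

from promptflow.core import AzureOpenAIModelConfiguration
from azure.ai.evaluation import RelevanceEvaluator

# Placeholder model configuration; substitute real endpoint/deployment/key values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-endpoint>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

relevance_eval = RelevanceEvaluator(model_config)

# Previously: relevance_eval(question=..., answer=..., context=...)
relevance_score = relevance_eval(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From our product list, the Alpine Explorer Tent is the most waterproof.",
)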

File tree: 64 files changed (+576 additions, -575 deletions)


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
 - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
 - The parameter name `project_scope` in content safety evaluators have been renamed to `azure_ai_project` for consistency with evaluate API and simulators.
+- Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
 
 
 ### Features Added
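Taken together with the earlier entries, a hedged sketch of what these breaking changes look like from caller code; the project identifiers are placeholders and the dictionary field names are assumed from the SDK's Azure AI project scope shape, not confirmed by this commit:

from azure.ai.evaluation import ViolenceEvaluator  # everything now lives in the root namespace

# Formerly passed as `project_scope`; now named `azure_ai_project` (field names assumed).
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

violence_eval = ViolenceEvaluator(azure_ai_project)
# Formerly: violence_eval(question=..., answer=...)
violence_score = violence_eval(query="What is the capital of France?", response="Paris.")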

sdk/evaluation/azure-ai-evaluation/README.md

Lines changed: 8 additions & 8 deletions
@@ -28,8 +28,8 @@ from promptflow.core import AzureOpenAIModelConfiguration
 from azure.ai.evaluation import evaluate, RelevanceEvaluator, ViolenceEvaluator
 
 
-def answer_length(answer, **kwargs):
-    return {"value": len(answer)}
+def response_length(response, **kwargs):
+    return {"value": len(response)}
 
 
 if __name__ == "__main__":
@@ -46,11 +46,11 @@ if __name__ == "__main__":
 
     # Running Relevance Evaluator on single input row
     relevance_score = relevance_eval(
-        answer="The Alpine Explorer Tent is the most waterproof.",
+        response="The Alpine Explorer Tent is the most waterproof.",
         context="From the our product list,"
         " the alpine explorer tent is the most waterproof."
         " The Adventure Dining Table has higher weight.",
-        question="Which tent is the most waterproof?",
+        query="Which tent is the most waterproof?",
     )
 
     pprint(relevance_score)
@@ -66,24 +66,24 @@ if __name__ == "__main__":
    }
 
     violence_eval = ViolenceEvaluator(azure_ai_project)
-    violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
+    violence_score = violence_eval(query="What is the capital of France?", response="Paris.")
     pprint(violence_score)
     # {'violence': 'Very low',
-    #  'violence_reason': "The system's response is a straightforward factual answer "
+    #  'violence_reason': "The system's response is a straightforward factual response "
     #                     'to a geography question. There is no violent content or '
     #                     'language present.',
     #  'violence_score': 0}
 
     # Code based evaluator
-    answer_length("The Alpine Explorer Tent is the most waterproof.")
+    response_length("The Alpine Explorer Tent is the most waterproof.")
     # {'value': 48}
 
     # Using multiple evaluators together using `Evaluate` API
 
     result = evaluate(
         data="evaluate_test_data.jsonl",
         evaluators={
-            "answer_length": answer_length,
+            "response_length": response_length,
             "violence": violence_eval,
         },
     )
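The `evaluate` call in this sample reads rows from evaluate_test_data.jsonl. A small sketch of what such rows might look like after the rename, under the assumption that data columns are matched to evaluator parameters by name unless remapped via evaluator_config; the rows themselves are illustrative, not the shipped sample data:

import json

# Illustrative rows only; the real test data file ships with the sample.
rows = [
    {"query": "Which tent is the most waterproof?", "response": "The Alpine Explorer Tent is the most waterproof."},
    {"query": "What is the capital of France?", "response": "Paris."},
]
with open("evaluate_test_data.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")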

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 13 additions & 13 deletions
@@ -119,13 +119,13 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )
 
 
-async def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param question: The question to evaluate.
-    :type question: str
-    :param answer: The answer to evaluate.
-    :type answer: str
+    :param query: The query to evaluate.
+    :type query: str
+    :param response: The response to evaluate.
+    :type response: str
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
@@ -135,7 +135,7 @@ async def submit_request(question: str, answer: str, metric: str, rai_svc_url: s
     :return: The operation ID.
     :rtype: str
     """
-    user_text = f"<Human>{question}</><System>{answer}</>"
+    user_text = f"<Human>{query}</><System>{response}</>"
     normalized_user_text = user_text.replace("'", '\\"')
     payload = generate_payload(normalized_user_text, metric)
 
@@ -410,14 +410,14 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
 
 
 async def evaluate_with_rai_service(
-    question: str, answer: str, metric_name: str, project_scope: dict, credential: TokenCredential
+    query: str, response: str, metric_name: str, project_scope: dict, credential: TokenCredential
 ):
-    """ "Evaluate the content safety of the answer using Responsible AI service
+    """ "Evaluate the content safety of the response using Responsible AI service
 
-    :param question: The question to evaluate.
-    :type question: str
-    :param answer: The answer to evaluate.
-    :type answer: str
+    :param query: The query to evaluate.
+    :type query: str
+    :param response: The response to evaluate.
+    :type response: str
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -439,7 +439,7 @@ async def evaluate_with_rai_service(
     await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
 
     # Submit annotation request and fetch result
-    operation_id = await submit_request(question, answer, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
     annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
     result = parse_response(annotation_response, metric_name)
 
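For reference, a standalone sketch of the annotation text that submit_request now builds from the renamed parameters; it mirrors the f-string and quote escaping shown in the hunk above, with illustrative values:

query = "What is the capital of France?"
response = "Paris."

# Same construction as the body of submit_request after the rename.
user_text = f"<Human>{query}</><System>{response}</>"
normalized_user_text = user_text.replace("'", '\\"')

print(user_text)
# <Human>What is the capital of France?</><System>Paris.</>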

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 4 additions & 4 deletions
@@ -418,13 +418,13 @@ def evaluate(
            },
            evaluator_config={
                "coherence": {
-                    "answer": "${data.answer}",
-                    "question": "${data.question}"
+                    "response": "${data.response}",
+                    "query": "${data.query}"
                },
                "relevance": {
-                    "answer": "${data.answer}",
+                    "response": "${data.response}",
                    "context": "${data.context}",
-                    "question": "${data.question}"
+                    "query": "${data.query}"
                }
            }
        )
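Because existing datasets may still carry question/answer columns, the same ${data.<column>} binding shown in this docstring can map old column names onto the new parameter names. A sketch under that assumption, with a hypothetical dataset file and a model_config assumed to be built as in the README example:

from azure.ai.evaluation import evaluate, CoherenceEvaluator

coherence_eval = CoherenceEvaluator(model_config)  # model_config assumed from the README example

result = evaluate(
    data="legacy_columns.jsonl",  # hypothetical file whose rows still use "question"/"answer"
    evaluators={"coherence": coherence_eval},
    evaluator_config={
        "coherence": {
            "query": "${data.question}",   # bind the evaluator's new `query` parameter to the old column
            "response": "${data.answer}",  # bind `response` to the old `answer` column
        },
    },
)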

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py

Lines changed: 8 additions & 8 deletions
@@ -11,9 +11,9 @@ class _AsyncBleuScoreEvaluator:
     def __init__(self):
         pass
 
-    async def __call__(self, *, answer: str, ground_truth: str, **kwargs):
+    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
         reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(answer)
+        hypothesis_tokens = nltk_tokenize(response)
 
         # NIST Smoothing
         smoothing_function = SmoothingFunction().method4
@@ -39,7 +39,7 @@ class BleuScoreEvaluator:
 
         eval_fn = BleuScoreEvaluator()
         result = eval_fn(
-            answer="Tokyo is the capital of Japan.",
+            response="Tokyo is the capital of Japan.",
             ground_truth="The capital of Japan is Tokyo.")
 
     **Output format**
@@ -54,19 +54,19 @@ class BleuScoreEvaluator:
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
 
-    def __call__(self, *, answer: str, ground_truth: str, **kwargs):
+    def __call__(self, *, response: str, ground_truth: str, **kwargs):
         """
-        Evaluate the BLEU score between the answer and the ground truth.
+        Evaluate the BLEU score between the response and the ground truth.
 
-        :keyword answer: The answer to be evaluated.
-        :paramtype answer: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The BLEU score.
         :rtype: dict
         """
         return async_run_allowing_running_loop(
-            self._async_evaluator, answer=answer, ground_truth=ground_truth, **kwargs
+            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
         )
 
     def _to_async(self):
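A minimal sketch of the NLTK call pattern this evaluator wraps, assuming sentence_bleu is applied to the tokenized pair with the method4 (NIST) smoothing shown above; nltk.word_tokenize stands in here for the SDK's internal nltk_tokenize helper:

from nltk.tokenize import word_tokenize          # requires the nltk "punkt" tokenizer data
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

ground_truth = "The capital of Japan is Tokyo."
response = "Tokyo is the capital of Japan."

reference_tokens = word_tokenize(ground_truth)   # stand-in for the SDK's nltk_tokenize helper
hypothesis_tokens = word_tokenize(response)

smoothing_function = SmoothingFunction().method4  # NIST smoothing, as in the evaluator
score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
print(score)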

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_chat/_chat.py

Lines changed: 13 additions & 13 deletions
@@ -107,9 +107,9 @@ def __call__(self, *, conversation, **kwargs):
         """
         self._validate_conversation(conversation)
 
-        # Extract questions, answers and contexts from conversation
-        questions = []
-        answers = []
+        # Extract queries, responses and contexts from conversation
+        queries = []
+        responses = []
         contexts = []
 
         if self._eval_last_turn:
@@ -121,16 +121,16 @@ def __call__(self, *, conversation, **kwargs):
         for each_turn in conversation_slice:
             role = each_turn["role"]
             if role == "user":
-                questions.append(each_turn["content"])
+                queries.append(each_turn["content"])
             elif role == "assistant":
-                answers.append(each_turn["content"])
+                responses.append(each_turn["content"])
                 if "context" in each_turn and "citations" in each_turn["context"]:
                     citations = json.dumps(each_turn["context"]["citations"])
                     contexts.append(citations)
 
         # Select evaluators to be used for evaluation
         compute_rag_based_metrics = True
-        if len(answers) != len(contexts):
+        if len(responses) != len(contexts):
             safe_message = (
                 "Skipping rag based metrics as we need citations or "
                 "retrieved_documents in context key of every assistant's turn"
@@ -145,15 +145,15 @@ def __call__(self, *, conversation, **kwargs):
 
         # Evaluate each turn
         per_turn_results = []
-        for turn_num in range(len(questions)):
+        for turn_num in range(len(queries)):
             current_turn_result = {}
 
             if self._parallel:
                 # Parallel execution
                 with ThreadPoolExecutor() as executor:
                     future_to_evaluator = {
                         executor.submit(
-                            self._evaluate_turn, turn_num, questions, answers, contexts, evaluator
+                            self._evaluate_turn, turn_num, queries, responses, contexts, evaluator
                         ): evaluator
                         for evaluator in selected_evaluators
                     }
@@ -165,7 +165,7 @@ def __call__(self, *, conversation, **kwargs):
                 # Sequential execution
                 for evaluator in selected_evaluators:
                     async_evaluator = evaluator._to_async()
-                    result = self._evaluate_turn(turn_num, questions, answers, contexts, async_evaluator)
+                    result = self._evaluate_turn(turn_num, queries, responses, contexts, async_evaluator)
                     current_turn_result.update(result)
 
             per_turn_results.append(current_turn_result)
@@ -191,13 +191,13 @@ def __call__(self, *, conversation, **kwargs):
 
         return aggregated
 
-    def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
+    def _evaluate_turn(self, turn_num, queries, responses, contexts, evaluator):
         try:
-            question = questions[turn_num] if turn_num < len(questions) else ""
-            answer = answers[turn_num] if turn_num < len(answers) else ""
+            query = queries[turn_num] if turn_num < len(queries) else ""
+            response = responses[turn_num] if turn_num < len(responses) else ""
             context = contexts[turn_num] if turn_num < len(contexts) else ""
 
-            score = evaluator(question=question, answer=answer, context=context)
+            score = evaluator(query=query, response=response, context=context)
 
             return score
         except Exception as e:  # pylint: disable=broad-exception-caught
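For context, a sketch of the conversation shape this method walks after the rename: user turns feed `queries`, assistant turns feed `responses`, and any context.citations on an assistant turn is JSON-dumped into `contexts`. The citation fields themselves are illustrative, not prescribed by this commit:

conversation = [
    {"role": "user", "content": "Which tent is the most waterproof?"},  # appended to queries
    {
        "role": "assistant",
        "content": "The Alpine Explorer Tent is the most waterproof.",  # appended to responses
        # Illustrative citation payload; json.dumps of this list is appended to contexts.
        "context": {"citations": [{"id": "product-info-1", "content": "Alpine Explorer Tent spec sheet."}]},
    },
]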

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py

Lines changed: 10 additions & 10 deletions
@@ -48,34 +48,34 @@ def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIMode
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
     async def __call__(self, *, conversation, **kwargs):
-        # Extract questions, answers and contexts from conversation
-        questions = []
-        answers = []
+        # Extract queries, responses and contexts from conversation
+        queries = []
+        responses = []
         contexts = []
 
         for each_turn in conversation:
             role = each_turn["role"]
             if role == "user":
-                questions.append(each_turn["content"])
+                queries.append(each_turn["content"])
             elif role == "assistant":
-                answers.append(each_turn["content"])
+                responses.append(each_turn["content"])
                 if "context" in each_turn and "citations" in each_turn["context"]:
                     citations = json.dumps(each_turn["context"]["citations"])
                     contexts.append(citations)
 
         # Evaluate each turn
         per_turn_scores = []
         history = []
-        for turn_num, question in enumerate(questions):
+        for turn_num, query in enumerate(queries):
             try:
-                question = question if turn_num < len(questions) else ""
-                answer = answers[turn_num] if turn_num < len(answers) else ""
+                query = query if turn_num < len(queries) else ""
+                answer = responses[turn_num] if turn_num < len(responses) else ""
                 context = contexts[turn_num] if turn_num < len(contexts) else ""
 
-                history.append({"user": question, "assistant": answer})
+                history.append({"user": query, "assistant": answer})
 
                 llm_output = await self._flow(
-                    query=question, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+                    query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
                 )
                 score = np.nan
                 if llm_output:
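A small sketch of the running `history` list this evaluator accumulates before each prompty call under the renamed variables (values illustrative); each entry pairs a turn's query with its response, and the flow is then invoked with query=, history=, documents= as shown above:

history = [
    {
        "user": "Which tent is the most waterproof?",                     # the turn's query
        "assistant": "The Alpine Explorer Tent is the most waterproof.",  # the turn's response
    },
]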

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py

Lines changed: 14 additions & 14 deletions
@@ -43,16 +43,16 @@ def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIMode
         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
-    async def __call__(self, *, question: str, answer: str, **kwargs):
+    async def __call__(self, *, query: str, response: str, **kwargs):
         # Validate input parameters
-        question = str(question or "")
-        answer = str(answer or "")
+        query = str(query or "")
+        response = str(response or "")
 
-        if not (question.strip() and answer.strip()):
-            raise ValueError("Both 'question' and 'answer' must be non-empty strings.")
+        if not (query.strip() and response.strip()):
+            raise ValueError("Both 'query' and 'response' must be non-empty strings.")
 
         # Run the evaluation flow
-        llm_output = await self._flow(question=question, answer=answer, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
+        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
 
         score = np.nan
         if llm_output:
@@ -77,8 +77,8 @@ class CoherenceEvaluator:
 
         eval_fn = CoherenceEvaluator(model_config)
         result = eval_fn(
-            question="What is the capital of Japan?",
-            answer="The capital of Japan is Tokyo.")
+            query="What is the capital of Japan?",
+            response="The capital of Japan is Tokyo.")
 
     **Output format**
 
@@ -92,18 +92,18 @@
     def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]):
         self._async_evaluator = _AsyncCoherenceEvaluator(model_config)
 
-    def __call__(self, *, question: str, answer: str, **kwargs):
+    def __call__(self, *, query: str, response: str, **kwargs):
         """
         Evaluate coherence.
 
-        :keyword question: The question to be evaluated.
-        :paramtype question: str
-        :keyword answer: The answer to be evaluated.
-        :paramtype answer: str
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
         :return: The coherence score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)
+        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
 
     def _to_async(self):
         return self._async_evaluator

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty

Lines changed: 4 additions & 4 deletions
@@ -18,9 +18,9 @@ model:
    type: text
 
 inputs:
-  question:
+  query:
     type: string
-  answer:
+  response:
     type: string
 
 ---
@@ -57,6 +57,6 @@ question: What can you tell me about climate change and its effects on the envir
 answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
 stars: 5
 
-question: {{question}}
-answer: {{answer}}
+question: {{query}}
+answer: {{response}}
 stars:
