@@ -21,17 +21,17 @@ class AnswerAccuracy(MetricWithLLM, SingleTurnMetric):
     This metric averages two distinct judge prompts to evaluate.
 
     Top 10, Zero-shot LLM-as-a-Judge Leaderboard:
-    1)- mistralai/mixtral-8x22b-instruct-v0.1
-    2)- mistralai/mixtral-8x7b-instruct-v0.1
-    3)- meta/llama-3.1-70b-instruct
-    4)- meta/llama-3.3-70b-instruct
-    5)- meta/llama-3.1-405b-instruct
-    6)- mistralai/mistral-nemo-12b-instruct
-    7)- nvidia/llama-3.1-nemotron-70b-instruct
-    8)- meta/llama-3.1-8b-instruct
-    9)- google/gemma-2-2b-it
-    10)- nvidia/nemotron-mini-4b-instruct
-    The top-1 LB model has high correlation with human judges (~0.90).
+    1)- nvidia/Llama-3_3-Nemotron-Super-49B-v1
+    2)- mistralai/mixtral-8x22b-instruct-v0.1
+    3)- mistralai/mixtral-8x7b-instruct-v0.1
+    4)- meta/llama-3.1-70b-instruct
+    5)- meta/llama-3.3-70b-instruct
+    6)- meta/llama-3.1-405b-instruct
+    7)- mistralai/mistral-nemo-12b-instruct
+    8)- nvidia/llama-3.1-nemotron-70b-instruct
+    9)- meta/llama-3.1-8b-instruct
+    10)- google/gemma-2-2b-it
+    The top-1 LB model has high correlation with human judges (~0.92).
 
     Attributes
     ----------
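
For orientation, here is a minimal sketch of how this metric is invoked. It is a
sketch only, assuming the usual ragas single-turn API; judge_llm and the sample
values are illustrative placeholders:

    # Sketch: score one sample with AnswerAccuracy (averages two judge prompts).
    from ragas import SingleTurnSample
    from ragas.metrics import AnswerAccuracy

    sample = SingleTurnSample(
        user_input="When was Einstein born?",
        response="Albert Einstein was born in 1879.",
        reference="Einstein was born on 14 March 1879.",
    )
    scorer = AnswerAccuracy(llm=judge_llm)  # judge_llm: any ragas-wrapped judge LLM (placeholder)
    score = await scorer.single_turn_ascore(sample)  # float in [0, 1]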
@@ -252,7 +252,7 @@ async def _single_turn_ascore(
             formatted_prompt = StringPromptValue(
                 text=self.template_relevance1.format(
                     query=sample.user_input,
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                 )
             )
             req = self.llm.agenerate_text(
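
The change above drops a hard 7000-character cap on the context passed to the
relevance judge; the same cap is removed in the three hunks that follow. A tiny
before/after illustration (the contexts are hypothetical):

    contexts = ["chunk one ...", "chunk two ..."]  # stand-in for sample.retrieved_contexts
    before = "\n".join(contexts)[:7000]  # old: prompt context truncated at 7000 characters
    after = "\n".join(contexts)          # new: the full joined context reaches the judge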
@@ -271,7 +271,7 @@ async def _single_turn_ascore(
             formatted_prompt = StringPromptValue(
                 text=self.template_relevance2.format(
                     query=sample.user_input,
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                 )
             )
             req = self.llm.agenerate_text(
@@ -385,7 +385,7 @@ async def _single_turn_ascore(
         for retry in range(self.retry):
             formatted_prompt = StringPromptValue(
                 text=self.template_groundedness1.format(
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                     response=sample.response,
                 )
             )
@@ -404,7 +404,7 @@ async def _single_turn_ascore(
         for retry in range(self.retry):
             formatted_prompt = StringPromptValue(
                 text=self.template_groundedness2.format(
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                     response=sample.response,
                 )
             )
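
After this change both groundedness prompts likewise receive the untruncated
context. A hedged sketch of the resulting prompt construction; GROUNDEDNESS_TEMPLATE
is an invented stand-in for self.template_groundedness1, not the real template text:

    GROUNDEDNESS_TEMPLATE = (
        "Context:\n{context}\n\nResponse:\n{response}\n\nIs the response grounded?"
    )
    prompt_text = GROUNDEDNESS_TEMPLATE.format(
        context="\n".join(["Paris is the capital of France."]),  # full context, no [:7000] cap
        response="Paris is France's capital.",
    )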