@@ -21,17 +21,17 @@ class AnswerAccuracy(MetricWithLLM, SingleTurnMetric):
     This metric averages two distinct judge prompts to evaluate.
 
     Top 10, Zero-shot LLM-as-a-Judge Leaderboard:
-    1)- mistralai/mixtral-8x22b-instruct-v0.1
-    2)- mistralai/mixtral-8x7b-instruct-v0.1
-    3)- meta/llama-3.1-70b-instruct
-    4)- meta/llama-3.3-70b-instruct
-    5)- meta/llama-3.1-405b-instruct
-    6)- mistralai/mistral-nemo-12b-instruct
-    7)- nvidia/llama-3.1-nemotron-70b-instruct
-    8)- meta/llama-3.1-8b-instruct
-    9)- google/gemma-2-2b-it
-    10)- nvidia/nemotron-mini-4b-instruct
-    The top-1 LB model has high correlation with human judges (~0.90).
+    1)- nvidia/Llama-3_3-Nemotron-Super-49B-v1
+    2)- mistralai/mixtral-8x22b-instruct-v0.1
+    3)- mistralai/mixtral-8x7b-instruct-v0.1
+    4)- meta/llama-3.1-70b-instruct
+    5)- meta/llama-3.3-70b-instruct
+    6)- meta/llama-3.1-405b-instruct
+    7)- mistralai/mistral-nemo-12b-instruct
+    8)- nvidia/llama-3.1-nemotron-70b-instruct
+    9)- meta/llama-3.1-8b-instruct
+    10)- google/gemma-2-2b-it
+    The top-1 LB model has high correlation with human judges (~0.92).
 
     Attributes
     ----------
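
For orientation, here is a minimal sketch of how this metric is invoked. It is a
sketch only, assuming the usual ragas single-turn API; judge_llm and the sample
values are illustrative placeholders:

    # Sketch: score one sample with AnswerAccuracy (averages two judge prompts).
    from ragas import SingleTurnSample
    from ragas.metrics import AnswerAccuracy

    sample = SingleTurnSample(
        user_input="When was Einstein born?",
        response="Albert Einstein was born in 1879.",
        reference="Einstein was born on 14 March 1879.",
    )
    scorer = AnswerAccuracy(llm=judge_llm)  # judge_llm: any ragas-wrapped judge LLM (placeholder)
    score = await scorer.single_turn_ascore(sample)  # float in [0, 1]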
@@ -252,7 +252,7 @@ async def _single_turn_ascore(
             formatted_prompt = StringPromptValue(
                 text=self.template_relevance1.format(
                     query=sample.user_input,
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                 )
             )
             req = self.llm.agenerate_text(
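
The change above drops a hard 7000-character cap on the context passed to the
relevance judge; the same cap is removed in the three hunks that follow. A tiny
before/after illustration (the contexts are hypothetical):

    contexts = ["chunk one ...", "chunk two ..."]  # stand-in for sample.retrieved_contexts
    before = "\n".join(contexts)[:7000]  # old: prompt context truncated at 7000 characters
    after = "\n".join(contexts)          # new: the full joined context reaches the judge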
@@ -271,7 +271,7 @@ async def _single_turn_ascore(
             formatted_prompt = StringPromptValue(
                 text=self.template_relevance2.format(
                     query=sample.user_input,
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                 )
             )
             req = self.llm.agenerate_text(
@@ -385,7 +385,7 @@ async def _single_turn_ascore(
         for retry in range(self.retry):
             formatted_prompt = StringPromptValue(
                 text=self.template_groundedness1.format(
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                     response=sample.response,
                 )
             )
@@ -404,7 +404,7 @@ async def _single_turn_ascore(
         for retry in range(self.retry):
             formatted_prompt = StringPromptValue(
                 text=self.template_groundedness2.format(
-                    context="\n".join(sample.retrieved_contexts)[:7000],
+                    context="\n".join(sample.retrieved_contexts),
                     response=sample.response,
                 )
             )
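
After this change both groundedness prompts likewise receive the untruncated
context. A hedged sketch of the resulting prompt construction; GROUNDEDNESS_TEMPLATE
is an invented stand-in for self.template_groundedness1, not the real template text:

    GROUNDEDNESS_TEMPLATE = (
        "Context:\n{context}\n\nResponse:\n{response}\n\nIs the response grounded?"
    )
    prompt_text = GROUNDEDNESS_TEMPLATE.format(
        context="\n".join(["Paris is the capital of France."]),  # full context, no [:7000] cap
        response="Paris is France's capital.",
    )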