33import typing as t
44from typing import List
55
6- import numpy as np
76from pydantic import BaseModel
87
98from ragas .metrics .collections .base import BaseMetric
@@ -120,29 +119,29 @@ async def ascore(
120119 if not response .strip () or not context_str .strip ():
121120 return MetricResult (value = 0.0 )
122121
123- # Get ratings from both judges
122+ # Get ratings from both judges (already on 0.0-1.0 scale from legacy parsing)
124123 judge1_rating = await self ._get_judge_rating (
125124 response_groundedness_judge1_prompt (response , context_str )
126125 )
127126 judge2_rating = await self ._get_judge_rating (
128127 response_groundedness_judge2_prompt (response , context_str )
129128 )
130129
131- # Average the scores (convert from 0,1,2 scale to 0 .0-1.0)
132- score = self ._average_scores (judge1_rating / 2.0 , judge2_rating / 2.0 )
130+ # Average the scores (already on 0 .0-1.0 scale like legacy )
131+ score = self ._average_scores (judge1_rating , judge2_rating )
133132
134133 return MetricResult (value = float (score ))
135134
136135 async def _get_judge_rating (self , prompt : str ) -> float :
137- """Get rating from judge with retry logic ."""
136+ """Get rating from judge using structured output with legacy-compatible processing ."""
138137 for retry in range (self .max_retries ):
139138 try :
140139 result = await self .llm .agenerate (prompt , GroundednessRating )
141140 rating = result .rating
142141
143- # Validate rating is in expected range
142+ # Validate rating is in expected range and convert to 0.0-1.0 scale
144143 if rating in [0 , 1 , 2 ]:
145- return float ( rating )
144+ return rating / 2.0 # Convert to legacy 0.0-1.0 scale
146145 else :
147146 if retry < self .max_retries - 1 :
148147 continue # Retry if invalid rating
@@ -158,12 +157,9 @@ async def _get_judge_rating(self, prompt: str) -> float:
158157 return float ("nan" )
159158
def _average_scores(self, score1: float, score2: float) -> float:
    """Combine two judge scores into a single score.

    When both scores are non-negative numbers they are averaged.
    Otherwise the larger of the non-NaN scores is returned, so a single
    failed judge (NaN) never discards the other judge's valid rating,
    whichever argument position the NaN is in.

    Args:
        score1: Score from the first judge, or NaN if it produced none.
        score2: Score from the second judge, or NaN if it produced none.

    Returns:
        The combined score, or NaN when both inputs are NaN.
    """
    # Function-scope import: the file-level numpy import is being removed
    # and no other NaN helper is known to be in scope.
    import math

    # Any comparison with NaN is False, so a NaN score skips this branch.
    if score1 >= 0 and score2 >= 0:
        return (score1 + score2) / 2.0

    # A bare max(score1, score2) is order-dependent with NaN:
    # max(nan, x) returns nan while max(x, nan) returns x.
    # Dropping NaN values first makes the fallback symmetric.
    usable = [s for s in (score1, score2) if not math.isnan(s)]
    if usable:
        return max(usable)
    return float("nan")
0 commit comments