diff --git a/continuous_eval/metrics/generation/text/llm_based.py b/continuous_eval/metrics/generation/text/llm_based.py
index 539206b..020e376 100644
--- a/continuous_eval/metrics/generation/text/llm_based.py
+++ b/continuous_eval/metrics/generation/text/llm_based.py
@@ -32,11 +32,12 @@ def __call__(self, answer: str, retrieved_context: List[str], question: str, **k
             answer (str): the generated answer
             retrieved_context (List[str]): the retrieved contexts
             question (str): the question
-        """ """"""
+        """ """"
         if self.classify_by_statement:
             # Context coverage uses the same prompt as faithfulness because it calculates how what proportion statements in the answer can be attributed to the context.
             # The difference is that faithfulness uses the generated answer, while context coverage uses ground truth answer (to evaluate context).
-            context_coverage = LLMBasedContextCoverage(use_few_shot=self.use_few_shot)
+            model = self.model if self.model is not None else None
+            context_coverage = LLMBasedContextCoverage(model=model, use_few_shot=self.use_few_shot)
             results = context_coverage(question, retrieved_context, answer)
             score = results["LLM_based_context_coverage"]
             reasoning = results["LLM_based_context_statements"]
@@ -228,6 +229,7 @@ def __call__(self, answer: str, ground_truth_answers: Union[List[str], str], **k
 Response: 2.5
 The generated answer is more brief and doesn't have the formality and empathetic tone in the reference answer.
+"""
         else:
             few_shot_prompt = ""
@@ -257,4 +259,4 @@ def __call__(self, answer: str, ground_truth_answers: Union[List[str], str], **k
         return {
             "LLM_based_style_consistency": normalized_score,
             "LLM_based_style_consistency_reasoning": reasoning,
-        }
+        }
\ No newline at end of file
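
Usage sketch (not part of the diff; a minimal illustration of the intent): with the patch, an evaluation model configured on the faithfulness metric is forwarded to the internal LLMBasedContextCoverage call instead of that call falling back to the library's default LLM. The constructor arguments below are assumptions inferred from the attributes visible in the diff (self.model, self.classify_by_statement, self.use_few_shot), and eval_llm is a hypothetical placeholder for whatever LLM object the library expects.

from continuous_eval.metrics.generation.text.llm_based import LLMBasedFaithfulness

# Hypothetical LLM handle; substitute a real model/interface supported by the library.
eval_llm = None

# Assumes the metric constructor accepts model, use_few_shot, and classify_by_statement.
metric = LLMBasedFaithfulness(
    model=eval_llm,
    use_few_shot=True,
    classify_by_statement=True,  # selects the LLMBasedContextCoverage path patched above
)

result = metric(
    answer="Paris is the capital of France.",
    retrieved_context=["France's capital city is Paris."],
    question="What is the capital of France?",
)
# With the patch, LLMBasedContextCoverage is constructed with model=eval_llm,
# so statement classification runs on the same LLM as the outer metric.
print(result)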