@@ -14,14 +14,35 @@ class SemanticRecallPrecision(dspy.Signature):
1414 precision : float = dspy .OutputField (desc = "fraction (out of 1.0) of system response covered by the ground truth" )
1515
1616
# Signature selected by SemanticF1 when it is constructed with decompositional=True.
# NOTE(review): DSPy is understood to use the class docstring as the LM instructions and each
# field's `desc` as prompt text, so both are reproduced verbatim here -- confirm before rewording.
class DecompositionalSemanticRecallPrecision(dspy.Signature):
    """
    Compare a system's response to the ground truth to compute recall and precision of key ideas.
    You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.
    """

    # Inputs: the question, the reference answer, and the answer being judged.
    question: str = dspy.InputField()
    ground_truth: str = dspy.InputField()
    system_response: str = dspy.InputField()
    # Outputs are declared in the order the docstring describes: the two key-idea
    # enumerations, then the overlap discussion, then the numeric scores.
    ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
    system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
    discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
    # Both scores are described to the LM as fractions out of 1.0.
    recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
    precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
31+
32+
def f1_score(precision, recall):
    """Return the harmonic mean (F1) of ``precision`` and ``recall``.

    Both inputs are first clamped to the closed interval [0.0, 1.0]. A NaN
    input is treated as 0.0 so that an unusable score can never inflate the
    result.

    Args:
        precision: numeric precision estimate; values outside [0, 1] are clamped.
        recall: numeric recall estimate; values outside [0, 1] are clamped.

    Returns:
        ``2 * p * r / (p + r)`` for the clamped values, or 0.0 when both are 0.
    """

    def _clamp_unit(value):
        # NaN is the only value that is not equal to itself. Without this guard
        # ``max(0.0, min(1.0, nan))`` evaluates to 1.0, because every comparison
        # against NaN is False and ``min`` then keeps its first argument.
        if value != value:
            return 0.0
        return max(0.0, min(1.0, value))

    precision, recall = _clamp_unit(precision), _clamp_unit(recall)
    total = precision + recall
    return 0.0 if total == 0 else 2 * (precision * recall) / total
1936
2037
2138class SemanticF1 (dspy .Module ):
def __init__(self, threshold=0.66, decompositional=False):
    """Configure the semantic-F1 metric.

    Args:
        threshold: cut-off that ``forward`` compares the score against
            (``score >= threshold``) when it is called with a trace.
        decompositional: when True, judge with
            ``DecompositionalSemanticRecallPrecision``; otherwise use
            ``SemanticRecallPrecision``.
    """
    # Let dspy.Module set up its own base state before sub-modules are attached.
    super().__init__()
    self.threshold = threshold

    # Pick the signature once; both variants are wrapped in the same ChainOfThought predictor.
    signature = DecompositionalSemanticRecallPrecision if decompositional else SemanticRecallPrecision
    self.module = dspy.ChainOfThought(signature)
2546
2647 def forward (self , example , pred , trace = None ):
2748 scores = self .module (question = example .question , ground_truth = example .response , system_response = pred .response )
@@ -30,42 +51,92 @@ def forward(self, example, pred, trace=None):
3051 return score if trace is None else score >= self .threshold
3152
3253
33- """
34- Soon-to-be deprecated Signatures & Modules Below.
35- """
54+
55+ ###########
56+
57+
# Signature wrapped by CompleteAndGrounded.completeness_module.
# NOTE(review): DSPy is understood to use the class docstring as the LM instructions and each
# field's `desc` as prompt text, so both are reproduced verbatim here -- confirm before rewording.
class DecompositionalSemanticRecall(dspy.Signature):
    """
    Estimate the completeness of a system's responses, against the ground truth.
    You will first enumerate key ideas in each response, discuss their overlap, and then report completeness.
    """

    # Inputs: the question, the reference answer, and the answer being judged.
    question: str = dspy.InputField()
    ground_truth: str = dspy.InputField()
    system_response: str = dspy.InputField()
    # Outputs are declared in the order the docstring describes: key-idea
    # enumerations, then the overlap discussion, then the final score.
    ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
    system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
    discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
    # Described to the LM as a fraction out of 1.0.
    completeness: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
71+
72+
73+
# Signature wrapped by CompleteAndGrounded.groundedness_module, which passes
# `pred.context` as `retrieved_context`.
# NOTE(review): DSPy is understood to use the class docstring as the LM instructions and each
# field's `desc` as prompt text, so both are reproduced verbatim here -- confirm before rewording.
class DecompositionalGroundedness(dspy.Signature):
    """
    Estimate the groundedness of a system's responses, against real retrieved documents written by people.
    You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then
    discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense.
    """

    # Inputs: the question, the retrieved context, and the answer being judged.
    question: str = dspy.InputField()
    retrieved_context: str = dspy.InputField()
    system_response: str = dspy.InputField()
    # Outputs are declared in the order the docstring describes: the claim
    # enumeration, then the support discussion, then the final score.
    system_response_claims: str = dspy.OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
    discussion: str = dspy.OutputField(desc="discussion of how supported the claims are by the retrieved context")
    # Described to the LM as a fraction out of 1.0.
    groundedness: float = dspy.OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")
87+
88+
class CompleteAndGrounded(dspy.Module):
    """Metric module combining completeness and groundedness into one F1 score.

    Completeness is judged against ``example.response`` with
    ``DecompositionalSemanticRecall``; groundedness is judged against
    ``pred.context`` with ``DecompositionalGroundedness``. Each judgement is
    made by its own ``dspy.ChainOfThought`` predictor.
    """

    def __init__(self, threshold=0.66):
        """Create the two judging predictors.

        Args:
            threshold: cut-off that ``forward`` compares the score against
                (``score >= threshold``) when it is called with a trace.
        """
        # Let dspy.Module set up its own base state before sub-modules are attached.
        super().__init__()
        self.threshold = threshold
        self.completeness_module = dspy.ChainOfThought(DecompositionalSemanticRecall)
        self.groundedness_module = dspy.ChainOfThought(DecompositionalGroundedness)

    def forward(self, example, pred, trace=None):
        """Score ``pred`` against ``example``.

        Args:
            example: object exposing ``question`` and ``response`` (used as the ground truth).
            pred: object exposing ``response`` and ``context`` (used as the retrieved context).
            trace: when None the numeric score is returned; otherwise a boolean.

        Returns:
            The F1 score when ``trace`` is None, else ``score >= self.threshold``.
        """
        completeness = self.completeness_module(
            question=example.question,
            ground_truth=example.response,
            system_response=pred.response,
        )
        groundedness = self.groundedness_module(
            question=example.question,
            retrieved_context=pred.context,
            system_response=pred.response,
        )
        # Groundedness is passed in the precision slot and completeness in the recall slot.
        score = f1_score(groundedness.groundedness, completeness.completeness)

        return score if trace is None else score >= self.threshold
101+
102+
103+
# NOTE: The soon-to-be-deprecated Signatures & Modules below are currently commented out (disabled).
36107
37108
38- class AnswerCorrectnessSignature (dspy .Signature ):
39- """Verify that the predicted answer matches the gold answer."""
109+ # class AnswerCorrectnessSignature(dspy.Signature):
110+ # """Verify that the predicted answer matches the gold answer."""
40111
41- question = dspy .InputField ()
42- gold_answer = dspy .InputField (desc = "correct answer for question" )
43- predicted_answer = dspy .InputField (desc = "predicted answer for question" )
44- is_correct = dspy .OutputField (desc = "True or False" )
112+ # question = dspy.InputField()
113+ # gold_answer = dspy.InputField(desc="correct answer for question")
114+ # predicted_answer = dspy.InputField(desc="predicted answer for question")
115+ # is_correct = dspy.OutputField(desc="True or False")
45116
46117
47- class AnswerCorrectness (dspy .Module ):
48- def __init__ (self ):
49- super ().__init__ ()
50- self .evaluate_correctness = dspy .ChainOfThought (AnswerCorrectnessSignature )
118+ # class AnswerCorrectness(dspy.Module):
119+ # def __init__(self):
120+ # super().__init__()
121+ # self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature)
51122
52- def forward (self , question , gold_answer , predicted_answer ):
53- return self .evaluate_correctness (question = question , gold_answer = gold_answer , predicted_answer = predicted_answer )
123+ # def forward(self, question, gold_answer, predicted_answer):
124+ # return self.evaluate_correctness(question=question, gold_answer=gold_answer, predicted_answer=predicted_answer)
54125
55126
56- class AnswerFaithfulnessSignature (dspy .Signature ):
57- """Verify that the predicted answer is based on the provided context."""
127+ # class AnswerFaithfulnessSignature(dspy.Signature):
128+ # """Verify that the predicted answer is based on the provided context."""
58129
59- context = dspy .InputField (desc = "relevant facts for producing answer" )
60- question = dspy .InputField ()
61- answer = dspy .InputField (desc = "often between 1 and 5 words" )
62- is_faithful = dspy .OutputField (desc = "True or False" )
130+ # context = dspy.InputField(desc="relevant facts for producing answer")
131+ # question = dspy.InputField()
132+ # answer = dspy.InputField(desc="often between 1 and 5 words")
133+ # is_faithful = dspy.OutputField(desc="True or False")
63134
64135
65- class AnswerFaithfulness (dspy .Module ):
66- def __init__ (self ):
67- super ().__init__ ()
68- self .evaluate_faithfulness = dspy .ChainOfThought (AnswerFaithfulnessSignature )
136+ # class AnswerFaithfulness(dspy.Module):
137+ # def __init__(self):
138+ # super().__init__()
139+ # self.evaluate_faithfulness = dspy.ChainOfThought(AnswerFaithfulnessSignature)
69140
70- def forward (self , context , question , answer ):
71- return self .evaluate_faithfulness (context = context , question = question , answer = answer )
141+ # def forward(self, context, question, answer):
142+ # return self.evaluate_faithfulness(context=context, question=question, answer=answer)
0 commit comments