stanfordnlp
diff --git a/dspy/utils/dummies.py b/dspy/utils/dummies.py
Lines changed: 7 additions & 4 deletions
diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
Lines changed: 5 additions & 11 deletions
@@ -7,7 +7,7 @@
 
 from dsp.modules import LM as DSPLM
 from dsp.utils.utils import dotdict
-from dspy.adapters.chat_adapter import field_header_pattern
+from dspy.adapters.chat_adapter import field_header_pattern, format_fields
 from dspy.clients.lm import LM
 
 
@@ -98,7 +98,7 @@ def get_convo(self, index) -> str:
 
 
 class DummyLM(LM):
-    def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False):
+    def __init__(self, answers: Union[list[dict[str, str]], dict[str, dict[str, str]]], follow_examples: bool = False):
         super().__init__("dummy", "chat", 0.0, 1000, True)
         self.answers = answers
         if isinstance(answers, list):
@@ -133,10 +133,13 @@ def __call__(self, prompt=None, messages=None, **kwargs):
                 outputs.append(self._use_example(messages))
             elif isinstance(self.answers, dict):
                 outputs.append(
-                    next((v for k, v in self.answers.items() if k in messages[-1]["content"]), "No more responses")
+                    next(
+                        (format_fields(v) for k, v in self.answers.items() if k in messages[-1]["content"]),
+                        "No more responses",
+                    )
                 )
             else:
-                outputs.append(next(self.answers, "No more responses"))
+                outputs.append(format_fields(next(self.answers, {"answer": "No more responses"})))
 
             # Logging, with removed api key & where `cost` is None on cache hit.
             kwargs = {k: v for k, v in kwargs.items() if not k.startswith("api_")}
 
@@ -37,8 +37,8 @@ def test_evaluate_call():
     dspy.settings.configure(
         lm=DummyLM(
             {
-                "What is 1+1?": "[[ ## answer ## ]]\n2",
-                "What is 2+2?": "[[ ## answer ## ]]\n4",
+                "What is 1+1?": {"answer": "2"},
+                "What is 2+2?": {"answer": "4"},
             }
         )
     )
@@ -55,9 +55,7 @@ def test_evaluate_call():
 
 
 def test_multithread_evaluate_call():
-    dspy.settings.configure(
-        lm=DummyLM({"What is 1+1?": "[[ ## answer ## ]]\n2", "What is 2+2?": "[[ ## answer ## ]]\n4"})
-    )
+    dspy.settings.configure(lm=DummyLM({"What is 1+1?": {"answer": "2"}, "What is 2+2?": {"answer": "4"}}))
     devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")]
     program = Predict("question -> answer")
     assert program(question="What is 1+1?").answer == "2"
@@ -80,9 +78,7 @@ def __call__(self, *args, **kwargs):
             time.sleep(1)
             return super().__call__(*args, **kwargs)
 
-    dspy.settings.configure(
-        lm=SlowLM({"What is 1+1?": "[[ ## answer ## ]]\n2", "What is 2+2?": "[[ ## answer ## ]]\n4"})
-    )
+    dspy.settings.configure(lm=SlowLM({"What is 1+1?": {"answer": "2"}, "What is 2+2?": {"answer": "4"}}))
 
     devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")]
     program = Predict("question -> answer")
@@ -112,9 +108,7 @@ def sleep_then_interrupt():
 
 
 def test_evaluate_call_bad():
-    dspy.settings.configure(
-        lm=DummyLM({"What is 1+1?": "[[ ## answer ## ]]\n0", "What is 2+2?": "[[ ## answer ## ]]\n0"})
-    )
+    dspy.settings.configure(lm=DummyLM({"What is 1+1?": {"answer": "0"}, "What is 2+2?": {"answer": "0"}}))
     devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")]
     program = Predict("question -> answer")
     ev = Evaluate(