feat(optimize): stricter metric + labeled=0 + filter demos; agent(OpenAI): compose final from tool results to preserve numeric answers; tests passing

haasonsaas · haasonsaas · commit 59989c3feda4 · 2025-09-08T19:42:22.000-07:00
diff --git a/micro_agent/agent.py b/micro_agent/agent.py
@@ -211,7 +211,18 @@ def _accumulate_usage():
                     if must_time and not used_tool(state, "now"):
                         state.append({"tool": "⛔️policy_violation", "args": {}, "observation": "Finalize before now (OpenAI path)."})
                         continue
-                    p = dspy.Prediction(answer=final, trace=state)
+                    # Prefer an answer composed from tool results (first calculator result, latest "now" ISO timestamp); fall back to `final` when nothing was composed.
+                    composed = []
+                    calculators = [s for s in state if s.get("tool") == "calculator" and isinstance(s.get("observation"), dict)]
+                    nows = [s for s in state if s.get("tool") == "now" and isinstance(s.get("observation"), dict)]
+                    if calculators:
+                        composed.append(str(calculators[0]["observation"].get("result")))
+                    if nows:
+                        iso = nows[-1]["observation"].get("iso")
+                        if iso:
+                            composed.append(f"UTC: {iso}")
+                    answer_text = " | ".join(composed) if composed else final
+                    p = dspy.Prediction(answer=answer_text, trace=state)
                     p.usage = {
                         "lm_calls": lm_calls,
                         "tool_calls": tool_calls,
diff --git a/micro_agent/optimize.py b/micro_agent/optimize.py
@@ -134,16 +134,19 @@ def forward(self, question: str, state: str = "[]", tools=None):
     def metric(example, pred, trace):
         q = example.get('question', '')
         expect = example.get('expect_contains')
-        score = 0.0
         calls = getattr(pred, 'tool_calls', None)
-        if any(ch.isdigit() for ch in q) and calls:
-            score += 0.5
-        if ("time" in q.lower() or "utc" in q.lower()) and calls:
-            score += 0.5
         fin = getattr(pred, 'final', '') or ''
-        if expect and expect in str(fin):
-            score += 1.0
-        return score
+
+        # If we know the expected substring (math tasks), require it in final.
+        if expect:
+            return 1.0 if (fin and expect in str(fin)) else 0.0
+
+        # Otherwise (e.g., time tasks), accept only when the `now` tool was called.
+        if calls and getattr(calls, 'tool_calls', None):
+            for c in calls.tool_calls:
+                if getattr(c, 'name', '') == 'now':
+                    return 1.0
+        return 0.0
 
     # Build trainset Examples
     trainset: List[Example] = []
@@ -155,18 +158,23 @@ def metric(example, pred, trace):
         ex = ex.with_inputs('question', 'state', 'tools')
         trainset.append(ex)
 
-    tele = BootstrapFewShot(metric=metric, max_bootstrapped_demos=8, max_labeled_demos=8, max_rounds=1)
+    tele = BootstrapFewShot(metric=metric, metric_threshold=1.0, max_bootstrapped_demos=8, max_labeled_demos=0, max_rounds=1)
     compiled = tele.compile(Planner(), trainset=trainset)
 
     # Extract demos from the compiled predictor
     demos = []
     for demo in getattr(compiled.decide, 'demos', []) or []:
         raw = demo.toDict()
+        tool_calls = _serialize_tool_calls(raw.get("tool_calls"))
+        final = raw.get("final")
+        # Keep only (augmented) demos that actually contain a signal: tool calls or a final answer.
+        if not tool_calls and not final:
+            continue
         record = {
             "question": raw.get("question"),
             "state": raw.get("state", "[]"),
-            "tool_calls": _serialize_tool_calls(raw.get("tool_calls")),
-            "final": raw.get("final"),
+            "tool_calls": tool_calls,
+            "final": final,
         }
         demos.append(record)
 
diff --git a/opt/plan_demos.json b/opt/plan_demos.json
@@ -1,17 +1,4 @@
 [
-  {
-    "question": "What's 2*(3+5)? Return only the number.",
-    "state": "[]",
-    "tool_calls": [
-      {
-        "name": "calculator",
-        "args": {
-          "expression": "2*(3+5)"
-        }
-      }
-    ],
-    "final": null
-  },
   {
     "question": "What time is it right now? Use UTC.",
     "state": "[]",
@@ -26,28 +13,9 @@
     "final": null
   },
   {
-    "question": "Compute (7**2 + 14) / 5 and explain briefly.",
-    "state": "[]",
-    "tool_calls": [
-      {
-        "name": "calculator",
-        "args": {
-          "expression": "(7**2 + 14) / 5"
-        }
-      }
-    ],
-    "final": null
-  },
-  {
-    "question": "Add 12345 and 67890, then tell me the current date (UTC).",
+    "question": "What time is it right now? Use UTC.",
     "state": "[]",
     "tool_calls": [
-      {
-        "name": "calculator",
-        "args": {
-          "expression": "12345 + 67890"
-        }
-      },
       {
         "name": "now",
         "args": {
@@ -58,39 +26,13 @@
     "final": null
   },
   {
-    "question": "If I spend $12.50 daily for 9 days, what's the total?",
-    "state": "[]",
-    "tool_calls": [
-      {
-        "name": "calculator",
-        "args": {
-          "expression": "12.50 * 9"
-        }
-      }
-    ],
-    "final": null
-  },
-  {
-    "question": "What's 9! / (3!*3!*3!)? Just the integer.",
-    "state": "[]",
-    "tool_calls": [
-      {
-        "name": "calculator",
-        "args": {
-          "expression": "9! / (3!*3!*3!)"
-        }
-      }
-    ],
-    "final": null
-  },
-  {
-    "question": "What's 2*(3+5)? Return only the number.",
+    "question": "What time is it right now? Use UTC.",
     "state": "[]",
     "tool_calls": [
       {
-        "name": "calculator",
+        "name": "now",
         "args": {
-          "expression": "2*(3+5)"
+          "timezone": "utc"
         }
       }
     ],