
Commit 572460b

test(evals): add 12 more math/time tasks; smoke-tested with mock provider
Parent: d0ce814

4 files changed: +128 −9 lines

evals/tasks.yaml

Lines changed: 37 additions & 0 deletions
```diff
@@ -16,3 +16,40 @@
 
 - question: "What's 9! / (3!*3!*3!)? Just the integer."
   expect_contains: "1680"
+
+# Additional math/time tasks
+- question: "What's 2^10? Return only the number."
+  expect_contains: "1024"
+
+- question: "Compute 7/2 and return a decimal."
+  expect_contains: "3.5"
+
+- question: "Compute 7//2 (floor division). Return only the integer."
+  expect_contains: "3"
+
+- question: "What is 100 % 7? Return only the integer."
+  expect_contains: "2"
+
+- question: "Calculate 6! and return only the number."
+  expect_contains: "720"
+
+- question: "Compute (12.5 * 4). Return only the number."
+  expect_contains: "50"
+
+- question: "Compute ((7+3)*5 - 12)/4 and return a decimal."
+  expect_contains: "9.5"
+
+- question: "Compute (3+5)*2^3 and return only the number."
+  expect_contains: "64"
+
+- question: "What is (12 - 30)? Return only the number."
+  expect_contains: "-18"
+
+- question: "Add 100 and 250, then tell me the current date in UTC."
+  expect_contains: "350"
+
+- question: "Give me the current timestamp in UTC (ISO)."
+  expect_key: "iso"
+
+- question: "Tell me today's date (UTC) in ISO format."
+  expect_key: "iso"
```
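
As a quick cross-check, the new `expect_contains` values all agree with plain Python arithmetic (reading the tasks' `^` as exponentiation, which the `2^10` → `1024` expectation implies about `safe_eval_math`):

```python
# Sanity check of the expected answers above; pure Python, with ** standing
# in for the ^ used in the task prompts.
import math

assert 2 ** 10 == 1024
assert 7 / 2 == 3.5 and 7 // 2 == 3 and 100 % 7 == 2
assert math.factorial(6) == 720 and 12.5 * 4 == 50
assert ((7 + 3) * 5 - 12) / 4 == 9.5 and (3 + 5) * 2 ** 3 == 64
assert 12 - 30 == -18 and 100 + 250 == 350
```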

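The diff doesn't show the harness side, but a minimal checker sketch suggests how these fields might be scored; `passes` is hypothetical, and the reading of `expect_key` (a key present in a JSON answer) is an assumption:

```python
# Hypothetical scorer for the YAML tasks above -- not part of this commit.
# Assumes expect_contains is a substring match on the answer and expect_key
# looks for a key in a JSON payload (falling back to a substring match).
import json

def passes(task: dict, answer: str) -> bool:
    if "expect_contains" in task:
        return task["expect_contains"] in answer
    if "expect_key" in task:
        try:
            return task["expect_key"] in json.loads(answer)
        except (TypeError, ValueError):
            return task["expect_key"] in answer
    return False
```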
micro_agent/agent.py

Lines changed: 48 additions & 6 deletions
```diff
@@ -5,6 +5,7 @@
 from .signatures import PlanOrAct, Finalize, PlanWithTools
 from .tools import TOOLS, run_tool, safe_eval_math, to_dspy_tools
 from .runtime import parse_decision_text
+from .costs import estimate_tokens, estimate_cost_usd
 
 class MicroAgent(dspy.Module):
     """
@@ -151,7 +152,7 @@ def used_tool(state, name: str) -> bool:
 
         state: List[Dict[str, Any]] = []
 
-        def _accumulate_usage():
+        def _accumulate_usage(input_text: str = "", output_text: str = ""):
            # Pull new usage entries from dspy.settings.trace
             try:
                 for _, _, out in dspy.settings.trace[-1:]:
@@ -164,6 +165,22 @@ def _accumulate_usage():
                     total_out_tokens += int(usage.get("output_tokens", 0) or 0)
             except Exception:
                 pass
+            # Heuristic fallback: estimate tokens from input/output texts and compute cost via env prices
+            try:
+                if input_text:
+                    it = estimate_tokens(input_text, getattr(self.lm, "model", ""))
+                else:
+                    it = 0
+                if output_text:
+                    ot = estimate_tokens(output_text, getattr(self.lm, "model", ""))
+                else:
+                    ot = 0
+                if it or ot:
+                    total_in_tokens += it
+                    total_out_tokens += ot
+                    total_cost += estimate_cost_usd(it, ot, getattr(self.lm, "model", ""), self._provider or "")
+            except Exception:
+                pass
 
         # Path A: OpenAI-native tool calling using DSPy signatures/adapters.
         if self._use_tool_calls:
@@ -184,6 +201,17 @@ def _accumulate_usage():
                         total_out_tokens += int(usage.get('output_tokens', 0) or 0)
                 except Exception:
                     pass
+                # Heuristic fallback: estimate using a reconstructed prompt & result
+                try:
+                    approx_prompt = self._decision_prompt(
+                        question=question,
+                        state_json=json.dumps(state, ensure_ascii=False),
+                        tools_json=json.dumps(self._tool_list, ensure_ascii=False),
+                    )
+                    approx_out = getattr(pred, 'final', None) or (str(getattr(pred, 'tool_calls', '')))
+                    _accumulate_usage(approx_prompt, approx_out)
+                except Exception:
+                    pass
 
                 # If tool calls are proposed, execute them.
                 calls = getattr(pred, 'tool_calls', None)
@@ -302,6 +330,11 @@ def _accumulate_usage():
         # Path B: Ollama-friendly loop via raw LM completions and robust JSON parsing.
         for _ in range(self.max_steps):
             lm_calls += 1
+            prompt_text = self._decision_prompt(
+                question=question,
+                state_json=json.dumps(state, ensure_ascii=False),
+                tools_json=json.dumps(self._tool_list, ensure_ascii=False),
+            )
             raw = self.lm(
                 prompt=self._decision_prompt(
                     question=question,
@@ -310,13 +343,18 @@ def _accumulate_usage():
                 )
             )
             decision_text = raw[0] if isinstance(raw, list) else (raw if isinstance(raw, str) else str(raw))
-            _accumulate_usage()
+            _accumulate_usage(prompt_text, decision_text)
 
             # Extract and parse JSON; if malformed, try a flexible parser and one self-correction retry.
             try:
                 decision = parse_decision_text(decision_text)
             except Exception:
                 lm_calls += 1
+                prompt_text = self._decision_prompt(
+                    question=question,
+                    state_json=json.dumps(state, ensure_ascii=False),
+                    tools_json=json.dumps(self._tool_list, ensure_ascii=False),
+                )
                 raw = self.lm(
                     prompt=self._decision_prompt(
                         question=question,
@@ -325,7 +363,7 @@ def _accumulate_usage():
                     )
                 )
                 decision_text = raw[0] if isinstance(raw, list) else (raw if isinstance(raw, str) else str(raw))
-                _accumulate_usage()
+                _accumulate_usage(prompt_text, decision_text)
                 try:
                     decision = parse_decision_text(decision_text)
                 except Exception:
@@ -413,14 +451,18 @@ def _accumulate_usage():
             ans = " | ".join(parts) if parts else ""
         else:
             lm_calls += 1
+            finalize_prompt = (
+                "Given the question and the trace of tool observations, write the final answer.\n\n"
+                f"Question: {question}\n\nTrace: {json.dumps(state, ensure_ascii=False)}\n\n"
+                "Answer succinctly."
+            )
             raw = self.lm(
                 prompt=(
-                    "Given the question and the trace of tool observations, write the final answer.\n\n"
-                    f"Question: {question}\n\nTrace: {json.dumps(state, ensure_ascii=False)}\n\n"
-                    "Answer succinctly."
+                    finalize_prompt
                 )
             )
             ans = raw[0] if isinstance(raw, list) else (raw if isinstance(raw, str) else str(raw))
+            _accumulate_usage(finalize_prompt, ans)
         p = dspy.Prediction(answer=ans, trace=state)
         p.usage = {
             "lm_calls": lm_calls,
```

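A hypothetical call site for the instrumented agent; of the usage keys, only `lm_calls` is visible in this hunk, so the token and cost fields below are guesses based on the `total_in_tokens` / `total_out_tokens` / `total_cost` accumulators:

```python
# Hypothetical call site -- constructor arguments and usage keys other than
# "lm_calls" are assumptions, not confirmed by the diff.
from micro_agent.agent import MicroAgent

agent = MicroAgent()
pred = agent(question="Compute 7//2 (floor division). Return only the integer.")
print(pred.answer)             # should contain "3"
print(pred.usage["lm_calls"])  # confirmed by the diff
print(pred.usage)              # presumably also token counts and estimated cost
```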
micro_agent/config.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -31,7 +31,7 @@ def configure_lm():
     def _try(name, fn):
         try:
             lm = fn()
-            dspy.settings.configure(lm=lm)
+            dspy.settings.configure(lm=lm, track_usage=True)
             return True
         except Exception as e:
             tried.append((name, repr(e)))
@@ -79,9 +79,9 @@ def __call__(self, *, prompt: str, **kwargs):
 
     # Allow explicit mock via env
     if provider == "mock":
-        dspy.settings.configure(lm=_MockLM())
+        dspy.settings.configure(lm=_MockLM(), track_usage=True)
         return
 
     # If we got here, all backends failed: use mock and include details in a warning
-    dspy.settings.configure(lm=_MockLM())
+    dspy.settings.configure(lm=_MockLM(), track_usage=True)
     return
```

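For reference, DSPy's `track_usage` flag makes per-call token usage retrievable from predictions; a small sketch under that assumption (the model string is illustrative, not from this diff):

```python
# Sketch of what track_usage=True enables in DSPy (recent releases expose
# Prediction.get_lm_usage()); the model name here is just an example.
import dspy

dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini"), track_usage=True)
pred = dspy.Predict("question -> answer")(question="What is 100 % 7?")
print(pred.get_lm_usage())  # per-model prompt/completion token counts
```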
micro_agent/costs.py

Lines changed: 40 additions & 0 deletions
```diff
@@ -0,0 +1,40 @@
+from __future__ import annotations
+import os
+from typing import Tuple
+
+def _try_tiktoken(model: str):
+    try:
+        import tiktoken
+        # Use a generic encoding if specific not found
+        try:
+            enc = tiktoken.encoding_for_model(model)
+        except Exception:
+            enc = tiktoken.get_encoding("o200k_base")
+        return enc
+    except Exception:
+        return None
+
+def estimate_tokens(text: str, model: str = "gpt-4o-mini") -> int:
+    if not text:
+        return 0
+    enc = _try_tiktoken(model)
+    if enc is None:
+        # Fallback heuristic: ~4 chars per token
+        return max(1, len(text) // 4)
+    try:
+        return len(enc.encode(text))
+    except Exception:
+        return max(1, len(text) // 4)
+
+def get_prices_per_1k(model: str, provider: str) -> Tuple[float, float]:
+    # Allow env overrides; default to 0 to avoid misleading values.
+    in_price = float(os.getenv("OPENAI_INPUT_PRICE_PER_1K", "0") or 0)
+    out_price = float(os.getenv("OPENAI_OUTPUT_PRICE_PER_1K", "0") or 0)
+    if provider != "openai":
+        return 0.0, 0.0
+    return in_price, out_price
+
+def estimate_cost_usd(input_tokens: int, output_tokens: int, model: str, provider: str) -> float:
+    in_price_1k, out_price_1k = get_prices_per_1k(model, provider)
+    return (input_tokens / 1000.0) * in_price_1k + (output_tokens / 1000.0) * out_price_1k
+
```

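To exercise the new module end to end, a short usage sketch; the env var names and function signatures come from the diff above, while the prices are made-up examples (the defaults are 0 precisely so that no misleading cost is reported):

```python
# Example use of the new cost helpers; the prices are illustrative, not
# real quotes -- get_prices_per_1k reads them from these env vars.
import os
from micro_agent.costs import estimate_tokens, estimate_cost_usd

os.environ["OPENAI_INPUT_PRICE_PER_1K"] = "0.00015"   # example value
os.environ["OPENAI_OUTPUT_PRICE_PER_1K"] = "0.0006"   # example value

it = estimate_tokens("What's 2^10? Return only the number.", "gpt-4o-mini")
ot = estimate_tokens("1024", "gpt-4o-mini")
print(it, ot, estimate_cost_usd(it, ot, "gpt-4o-mini", "openai"))
```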