Harden math/time detection and tracing

haasonsaas · haasonsaas · commit f343fb145139 · 2025-12-21T22:37:22.000-08:00
diff --git a/micro_agent/agent.py b/micro_agent/agent.py
@@ -156,17 +156,24 @@ def forward(self, question: str):
 
         def needs_math(q: str) -> bool:
             ql = q.lower()
-            if re.search(r"[0-9].*[+\-*/]", q):
+            if re.search(r"[0-9].*[+\-*/%]", q):
                 return True
-            if any(w in ql for w in ["add", "sum", "multiply", "divide", "compute", "calculate", "total", "power", "factorial", "!", "**", "^"]):
+            if re.search(r"\b\d+(?:\.\d+)?\s*(?:x|times|multiplied by)\s*\d+(?:\.\d+)?\b", ql):
+                return True
+            if re.search(r"\b\d+(?:\.\d+)?\s*(?:plus|minus|add|added to|subtract|subtracted by|divide|divided by|over)\s*\d+(?:\.\d+)?\b", ql):
+                return True
+            if re.search(r"\d", ql) and any(w in ql for w in [
+                "add", "sum", "plus", "minus", "subtract", "multiply", "divide",
+                "total", "power", "factorial", "compute", "calculate"
+            ]):
                 return True
             return False
 
         def needs_time(q: str) -> bool:
             ql = q.lower()
             if "current time" in ql or "current date" in ql:
                 return True
-            return re.search(r"\b(time|times|date|dates|utc|now|today|tomorrow|yesterday|timestamp|datetime)\b", ql) is not None
+            return re.search(r"\b(time|date|utc|now|today|tomorrow|yesterday|timestamp|datetime)\b", ql) is not None
 
         def used_tool(state, name: str) -> bool:
             return any(step.get("tool") == name for step in state)
@@ -178,17 +185,25 @@ def used_tool(state, name: str) -> bool:
 
         def _accumulate_usage(input_text: str = "", output_text: str = ""):
             # Pull new usage entries from dspy.settings.trace
+            in_tok = 0
+            out_tok = 0
+            cost = 0.0
             try:
                 for _, _, out in dspy.settings.trace[-1:]:
                     usage = getattr(out, "usage", None) or {}
                     nonlocal total_cost, total_in_tokens, total_out_tokens
                     c = getattr(out, "cost", None)
                     if c is not None:
-                        total_cost += float(c or 0)
-                    total_in_tokens += int(usage.get("input_tokens", 0) or 0)
-                    total_out_tokens += int(usage.get("output_tokens", 0) or 0)
+                        cost += float(c or 0)
+                    in_tok += int(usage.get("input_tokens", 0) or 0)
+                    out_tok += int(usage.get("output_tokens", 0) or 0)
             except Exception:
                 pass
+            if in_tok or out_tok or cost:
+                total_cost += cost
+                total_in_tokens += in_tok
+                total_out_tokens += out_tok
+                return
             # Heuristic fallback: estimate tokens from input/output texts and compute cost via env prices
             try:
                 if input_text:
@@ -206,6 +221,38 @@ def _accumulate_usage(input_text: str = "", output_text: str = ""):
             except Exception:
                 pass
 
+        def _infer_expression(q: str) -> str:
+            ql = q.lower()
+            # Handle "divide X by Y" and "subtract X from Y"
+            m = re.search(r"\bdivide\s+(\d+(?:\.\d+)?)\s+by\s+(\d+(?:\.\d+)?)\b", ql)
+            if m:
+                return f"{m.group(1)}/{m.group(2)}"
+            m = re.search(r"\bsubtract\s+(\d+(?:\.\d+)?)\s+from\s+(\d+(?:\.\d+)?)\b", ql)
+            if m:
+                return f"{m.group(2)}-{m.group(1)}"
+            # Binary worded ops
+            m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:x|times|multiplied by)\s*(\d+(?:\.\d+)?)\b", ql)
+            if m:
+                return f"{m.group(1)}*{m.group(2)}"
+            m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:plus|add|added to)\s*(\d+(?:\.\d+)?)\b", ql)
+            if m:
+                return f"{m.group(1)}+{m.group(2)}"
+            m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:minus|subtract|subtracted by)\s*(\d+(?:\.\d+)?)\b", ql)
+            if m:
+                return f"{m.group(1)}-{m.group(2)}"
+            m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:divide|divided by|over)\s*(\d+(?:\.\d+)?)\b", ql)
+            if m:
+                return f"{m.group(1)}/{m.group(2)}"
+            # Multi-number add/sum
+            if "add" in ql or "sum" in ql:
+                nums = [n for n in re.findall(r"\b\d+\b", q)]
+                if len(nums) >= 2:
+                    return "+".join(nums)
+            # Fallback: longest math-like substring
+            candidates = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", q)
+            candidates = [c.strip() for c in candidates if any(op in c for op in ["+","-","*","/","%","^","(",")","!"])]
+            return max(candidates, key=len) if candidates else ""
+
         # Path A: OpenAI-native tool calling using DSPy signatures/adapters.
         if self._use_tool_calls:
             dspy_tools = to_dspy_tools()
@@ -275,6 +322,13 @@ def _accumulate_usage(input_text: str = "", output_text: str = ""):
                 # Check finalization.
                 final = getattr(pred, 'final', None)
                 if final:
+                    if executed_any:
+                        state.append({
+                            "tool": "⛔️policy_violation",
+                            "args": {"reason": "tool_and_final"},
+                            "observation": "Model returned a final answer alongside a tool call.",
+                        })
+                        continue
                     if had_policy_violation or had_validation_error:
                         continue
                     if must_math and not used_tool(state, "calculator"):
@@ -319,29 +373,15 @@ def _accumulate_usage(input_text: str = "", output_text: str = ""):
             if calculators:
                 parts.append(str(calculators[0]["observation"].get("result")))
             elif must_math:
-                # Last-chance math: infer a simple expression from the question.
-                import re as _re
-                ql = question.lower()
-                if "add" in ql or "sum" in ql:
-                    nums = [int(n) for n in _re.findall(r"\b\d+\b", question)]
-                    if len(nums) >= 2:
-                        res = sum(nums)
+                expr = _infer_expression(question)
+                if expr:
+                    try:
+                        res = safe_eval_math(expr)
                         parts.append(str(res))
-                        # also record as a calculator step for trace parity
-                        state.append({"tool": "calculator", "args": {"expression": "+".join(map(str, nums))}, "observation": {"result": res}})
+                        state.append({"tool": "calculator", "args": {"expression": expr}, "observation": {"result": res}})
                         tool_calls += 1
-                if not parts:
-                    candidates = _re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", question)
-                    candidates = [c.strip() for c in candidates if any(op in c for op in ["+","-","*","/","%","^","(",")","!"])]
-                    expr = max(candidates, key=len) if candidates else ""
-                    if expr:
-                        try:
-                            res = safe_eval_math(expr)
-                            parts.append(str(res))
-                            state.append({"tool": "calculator", "args": {"expression": expr}, "observation": {"result": res}})
-                            tool_calls += 1
-                        except Exception:
-                            pass
+                    except Exception:
+                        pass
             if nows:
                 iso = nows[-1]["observation"].get("iso")
                 if iso:
@@ -413,6 +453,13 @@ def _accumulate_usage(input_text: str = "", output_text: str = ""):
                     continue
 
             if "final" in decision:
+                if "tool" in decision:
+                    state.append({
+                        "tool": "⛔️policy_violation",
+                        "args": {"reason": "tool_and_final"},
+                        "observation": "Decision contained both tool and final.",
+                    })
+                    continue
                 # Enforce tool usage policy: if required tools not yet used, keep planning.
                 if must_math and not used_tool(state, "calculator"):
                     state.append({"tool": "⛔️policy_violation", "args": {}, "observation": "Finalize attempted before calculator."})
@@ -430,7 +477,14 @@ def _accumulate_usage(input_text: str = "", output_text: str = ""):
                     iso = nows[-1]["observation"].get("iso")
                     if iso:
                         composed_parts.append(f"UTC: {iso}")
-                final_text = " | ".join(composed_parts) if composed_parts else decision["final"].get("answer", "")
+                if composed_parts:
+                    final_text = " | ".join(composed_parts)
+                else:
+                    final_payload = decision.get("final")
+                    if isinstance(final_payload, dict):
+                        final_text = final_payload.get("answer", "")
+                    else:
+                        final_text = str(final_payload) if final_payload is not None else ""
                 p = dspy.Prediction(answer=final_text, trace=state)
                 p.usage = {
                     "lm_calls": lm_calls,
@@ -478,24 +532,12 @@ def _accumulate_usage(input_text: str = "", output_text: str = ""):
                 if calc_results:
                     parts.append(str(calc_results[0]))
             if must_math and not parts:
-                # Last-chance math: try to infer a simple expression from the question.
-                ql = question.lower()
-                # If looks like 'add X and Y', sum integers.
-                import re
-                if "add" in ql or "sum" in ql:
-                    nums = [int(n) for n in re.findall(r"\b\d+\b", question)]
-                    if len(nums) >= 2:
-                        parts.append(str(sum(nums)))
-                if not parts:
-                    # Extract longest math-like substring and evaluate.
-                    candidates = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", question)
-                    candidates = [c.strip() for c in candidates if any(op in c for op in ["+","-","*","/","%","^","(",")","!"])]
-                    expr = max(candidates, key=len) if candidates else ""
-                    if expr:
-                        try:
-                            parts.append(str(safe_eval_math(expr)))
-                        except Exception:
-                            pass
+                expr = _infer_expression(question)
+                if expr:
+                    try:
+                        parts.append(str(safe_eval_math(expr)))
+                    except Exception:
+                        pass
             if nows:
                 iso = nows[-1]["observation"].get("iso")
                 if iso:
diff --git a/micro_agent/config.py b/micro_agent/config.py
@@ -35,16 +35,35 @@ def __call__(self, *, prompt: str, **kwargs):
                 question = qmatch.group(1).strip() if qmatch else prompt
                 ql = question.lower()
                 # heuristic: suggest calculator/now/final
-                if re.search(r"[0-9].*[+\-*/]", question) or any(w in ql for w in [
-                    "add","sum","multiply","divide","compute","calculate","total","power","factorial","!","**","^"
-                ]):
-                    # crude expression extraction
-                    cands = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", question)
-                    cands = [c.strip() for c in cands if c.strip()]
-                    expr = max(cands, key=len) if cands else "2+2"
+                if (re.search(r"[0-9].*[+\-*/%]", question) or
+                    re.search(r"\b\d+(?:\.\d+)?\s*(?:x|times|multiplied by|plus|minus|add|added to|subtract|subtracted by|divide|divided by|over)\s*\d+(?:\.\d+)?\b", ql) or
+                    (re.search(r"\d", ql) and any(w in ql for w in [
+                        "add","sum","plus","minus","subtract","multiply","divide","total","power","factorial","compute","calculate","!","**","^"
+                    ]))):
+                    expr = None
+                    m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:x|times|multiplied by)\s*(\d+(?:\.\d+)?)\b", ql)
+                    if m:
+                        expr = f"{m.group(1)}*{m.group(2)}"
+                    if expr is None:
+                        m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:plus|add|added to)\s*(\d+(?:\.\d+)?)\b", ql)
+                        if m:
+                            expr = f"{m.group(1)}+{m.group(2)}"
+                    if expr is None:
+                        m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:minus|subtract|subtracted by)\s*(\d+(?:\.\d+)?)\b", ql)
+                        if m:
+                            expr = f"{m.group(1)}-{m.group(2)}"
+                    if expr is None:
+                        m = re.search(r"\b(\d+(?:\.\d+)?)\s*(?:divide|divided by|over)\s*(\d+(?:\.\d+)?)\b", ql)
+                        if m:
+                            expr = f"{m.group(1)}/{m.group(2)}"
+                    # crude expression extraction fallback
+                    if expr is None:
+                        cands = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", question)
+                        cands = [c.strip() for c in cands if c.strip()]
+                        expr = max(cands, key=len) if cands else "2+2"
                     return _json.dumps({"tool": {"name": "calculator", "args": {"expression": expr}}})
                 if ("current time" in ql or "current date" in ql or
-                    re.search(r"\b(time|times|date|dates|utc|now|today|tomorrow|yesterday|timestamp|datetime)\b", ql)):
+                    re.search(r"\b(time|date|utc|now|today|tomorrow|yesterday|timestamp|datetime)\b", ql)):
                     return _json.dumps({"tool": {"name": "now", "args": {"timezone": "utc"}}})
                 return _json.dumps({"final": {"answer": "ok"}})
         dspy.settings.configure(lm=_MockLM(), track_usage=True)
diff --git a/micro_agent/runtime.py b/micro_agent/runtime.py
@@ -39,9 +39,10 @@ def dump_trace(trace_id: str, question: str, steps: List[Step], answer: str, *,
         rec["usage"] = usage
     if cost_usd is not None:
         rec["cost_usd"] = float(cost_usd)
+    os.makedirs(TRACES_DIR, exist_ok=True)
     path = os.path.join(TRACES_DIR, f"{trace_id}.jsonl")
     with open(path, "a", encoding="utf-8") as f:
-        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+        f.write(json.dumps(rec, ensure_ascii=False, default=str) + "\n")
     return path
 
 def extract_json_block(text: str) -> str:
@@ -90,7 +91,10 @@ def parse_decision_text(text: str) -> Dict[str, Any]:
     block = extract_json_block(text)
     # 1) strict json
     try:
-        return json.loads(block)
+        obj = json.loads(block)
+        if isinstance(obj, dict):
+            return obj
+        raise ValueError("Decision JSON is not an object")
     except Exception:
         pass
     # 2) json-repair (if available)
@@ -99,7 +103,10 @@ def parse_decision_text(text: str) -> Dict[str, Any]:
             repaired = json_repair.repair(block)
             if isinstance(repaired, dict):
                 return repaired
-            return json.loads(repaired)
+            obj = json.loads(repaired)
+            if isinstance(obj, dict):
+                return obj
+            raise ValueError("Decision JSON is not an object")
         except Exception:
             pass
     # 3) python literal (handles single quotes)
diff --git a/micro_agent/tools.py b/micro_agent/tools.py
@@ -40,13 +40,21 @@ def _eval_expr(node):
         result = ALLOWED_OPS[type(node.op)](lv, rv)
         if isinstance(result, complex):
             raise ValueError("complex results are not supported")
+        if isinstance(result, float) and not math.isfinite(result):
+            raise ValueError("number not finite")
+        if isinstance(result, (int, float)) and abs(result) > MAX_ABS_NUMBER:
+            raise ValueError("number too large")
         return result
     if isinstance(node, ast.UnaryOp) and type(node.op) in ALLOWED_OPS:
         v = _eval_expr(node.operand)
         if isinstance(v, (int, float)) and abs(v) > MAX_ABS_NUMBER: raise ValueError("number too large")
         result = ALLOWED_OPS[type(node.op)](v)
         if isinstance(result, complex):
             raise ValueError("complex results are not supported")
+        if isinstance(result, float) and not math.isfinite(result):
+            raise ValueError("number not finite")
+        if isinstance(result, (int, float)) and abs(result) > MAX_ABS_NUMBER:
+            raise ValueError("number too large")
         return result
     if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id in ALLOWED_CALLS:
         if len(node.args) != 1:
@@ -61,7 +69,10 @@ def _eval_expr(node):
             raise ValueError("factorial requires a non-negative integer")
         if arg_int > MAX_FACTORIAL_N:
             raise ValueError("factorial too large")
-        return ALLOWED_CALLS[node.func.id](arg_int)
+        result = ALLOWED_CALLS[node.func.id](arg_int)
+        if isinstance(result, (int, float)) and abs(result) > MAX_ABS_NUMBER:
+            raise ValueError("number too large")
+        return result
     if isinstance(node, ast.Expression): return _eval_expr(node.body)
     raise ValueError("Disallowed expression")
 
diff --git a/tests/test_regressions.py b/tests/test_regressions.py
@@ -1,8 +1,11 @@
+import datetime
+import json
 import pytest
 
 from micro_agent.config import configure_lm
 from micro_agent.agent import MicroAgent
 from micro_agent.tools import safe_eval_math
+from micro_agent import runtime
 
 
 def test_no_false_time_trigger_on_update(monkeypatch):
@@ -21,3 +24,28 @@ def test_factorial_rejects_non_integer():
 def test_complex_results_rejected():
     with pytest.raises(ValueError):
         safe_eval_math("(-1)^(0.5)")
+
+
+def test_times_is_math_not_time(monkeypatch):
+    monkeypatch.setenv("LLM_PROVIDER", "mock")
+    configure_lm()
+    agent = MicroAgent(max_steps=3)
+    pred = agent("What is 3 times 4?")
+    assert "12" in pred.answer
+    assert any(step.get("tool") == "calculator" for step in (pred.trace or []))
+    assert not any(step.get("tool") == "now" for step in (pred.trace or []))
+
+
+def test_dump_trace_serializes_non_json(tmp_path, monkeypatch):
+    monkeypatch.setattr(runtime, "TRACES_DIR", str(tmp_path))
+    trace_id = runtime.new_trace_id()
+    steps = [{"tool": "now", "args": {}, "observation": {"when": datetime.datetime(2020, 1, 1)}}]
+    path = runtime.dump_trace(trace_id, "q", steps, "a")
+    with open(path, "r", encoding="utf-8") as f:
+        rec = json.loads(f.readline())
+    assert rec["steps"][0]["observation"]["when"].startswith("2020-01-01")
+
+
+def test_result_magnitude_limit():
+    with pytest.raises(ValueError):
+        safe_eval_math("1000000*10000000")