Tighten tool validation and math normalization

haasonsaas · haasonsaas · commit 2c0582fbd1a9 · 2025-12-22T10:06:29.000-08:00
diff --git a/micro_agent/agent.py b/micro_agent/agent.py
@@ -154,9 +154,19 @@ def forward(self, question: str):
         total_in_tokens = 0
         total_out_tokens = 0
 
+        def _normalize_text(q: str) -> str:
+            return (
+                q.replace("\u00d7", "x")
+                .replace("\u00f7", "/")
+                .replace("\u2212", "-")
+                .replace("\u2013", "-")
+                .replace("\u2014", "-")
+            )
+
         def needs_math(q: str) -> bool:
-            ql = q.lower()
-            if re.search(r"[0-9].*[+\-*/%]", q):
+            qn = _normalize_text(q)
+            ql = qn.lower()
+            if re.search(r"[0-9].*[+\-*/%]", qn):
                 return True
             if re.search(r"\b\d+(?:\.\d+)?\s*(?:x|times|multiplied by)\s*\d+(?:\.\d+)?\b", ql):
                 return True
@@ -170,7 +180,7 @@ def needs_math(q: str) -> bool:
             return False
 
         def needs_time(q: str) -> bool:
-            ql = q.lower()
+            ql = _normalize_text(q).lower()
             if "current time" in ql or "current date" in ql:
                 return True
             return re.search(r"\b(time|date|utc|now|today|tomorrow|yesterday|timestamp|datetime)\b", ql) is not None
@@ -222,7 +232,8 @@ def _accumulate_usage(input_text: str = "", output_text: str = ""):
                 pass
 
         def _infer_expression(q: str) -> str:
-            ql = q.lower()
+            qn = _normalize_text(q)
+            ql = qn.lower()
             # Handle "divide X by Y" and "subtract X from Y"
             m = re.search(r"\bdivide\s+(\d+(?:\.\d+)?)\s+by\s+(\d+(?:\.\d+)?)\b", ql)
             if m:
@@ -245,11 +256,11 @@ def _infer_expression(q: str) -> str:
                 return f"{m.group(1)}/{m.group(2)}"
             # Multi-number add/sum
             if "add" in ql or "sum" in ql:
-                nums = [n for n in re.findall(r"\b\d+\b", q)]
+                nums = [n for n in re.findall(r"\b\d+\b", qn)]
                 if len(nums) >= 2:
                     return "+".join(nums)
             # Fallback: longest math-like substring
-            candidates = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", q)
+            candidates = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", qn)
             candidates = [c.strip() for c in candidates if any(op in c for op in ["+","-","*","/","%","^","(",")","!"])]
             return max(candidates, key=len) if candidates else ""
 
@@ -307,6 +318,14 @@ def _infer_expression(q: str) -> str:
                             continue
                         # Validate/execute; on validation error, record and continue planning
                         obs = run_tool(name, args)
+                        if isinstance(obs, dict) and "error" in obs and obs.get("error", "").startswith("Unknown tool"):
+                            had_validation_error = True
+                            state.append({
+                                "tool": "⛔️validation_error",
+                                "args": {"name": name, "args": args},
+                                "observation": obs,
+                            })
+                            continue
                         if isinstance(obs, dict) and "error" in obs and "validation" in obs.get("error", ""):
                             had_validation_error = True
                             state.append({
@@ -506,6 +525,13 @@ def _infer_expression(q: str) -> str:
                     name = str(tool_desc)
                     args = decision.get("args", {}) or {}
                 obs = run_tool(name, args)
+                if isinstance(obs, dict) and "error" in obs and obs.get("error", "").startswith("Unknown tool"):
+                    state.append({
+                        "tool": "⛔️validation_error",
+                        "args": {"name": name, "args": args},
+                        "observation": obs,
+                    })
+                    continue
                 if isinstance(obs, dict) and "error" in obs and "validation" in obs.get("error", ""):
                     # second-chance: record detailed schema hint in state and continue planning
                     schema = TOOLS.get(name).schema if name in TOOLS else {}
diff --git a/micro_agent/config.py b/micro_agent/config.py
@@ -33,9 +33,15 @@ def __call__(self, *, prompt: str, **kwargs):
                 import re, json as _json
                 qmatch = re.search(r"Question:\s*(.*)", prompt, re.S)
                 question = qmatch.group(1).strip() if qmatch else prompt
-                ql = question.lower()
+                qn = (question
+                      .replace("\u00d7", "x")
+                      .replace("\u00f7", "/")
+                      .replace("\u2212", "-")
+                      .replace("\u2013", "-")
+                      .replace("\u2014", "-"))
+                ql = qn.lower()
                 # heuristic: suggest calculator/now/final
-                if (re.search(r"[0-9].*[+\-*/%]", question) or
+                if (re.search(r"[0-9].*[+\-*/%]", qn) or
                     re.search(r"\b\d+(?:\.\d+)?\s*(?:x|times|multiplied by|plus|minus|add|added to|subtract|subtracted by|divide|divided by|over)\s*\d+(?:\.\d+)?\b", ql) or
                     (re.search(r"\d", ql) and any(w in ql for w in [
                         "add","sum","plus","minus","subtract","multiply","divide","total","power","factorial","compute","calculate","!","**","^"
@@ -58,7 +64,7 @@ def __call__(self, *, prompt: str, **kwargs):
                             expr = f"{m.group(1)}/{m.group(2)}"
                     # crude expression extraction fallback
                     if expr is None:
-                        cands = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", question)
+                        cands = re.findall(r"[0-9\+\-\*/%\(\)\.!\^\s]+", qn)
                         cands = [c.strip() for c in cands if c.strip()]
                         expr = max(cands, key=len) if cands else "2+2"
                     return _json.dumps({"tool": {"name": "calculator", "args": {"expression": expr}}})
diff --git a/micro_agent/costs.py b/micro_agent/costs.py
@@ -67,19 +67,29 @@ def estimate_cost_usd(input_tokens: int, output_tokens: int, model: str, provide
 def estimate_prediction_cost(question: str, trace: Any, answer: str, usage: Dict[str, Any]) -> Dict[str, Any]:
     """Estimate token usage and USD cost for a single prediction.
 
-    Heuristic: input tokens ~= lm_calls * tokens(question) + tokens(str(trace))
-               output tokens ~= tokens(answer)
+    Token counts and a nonzero cost supplied in usage are used as-is. Without token counts, fall back to a heuristic:
+    input tokens ~= lm_calls * tokens(question) + tokens(str(trace))
+    output tokens ~= tokens(answer)
     """
-    provider = (usage or {}).get("provider") or "openai"
-    model = (usage or {}).get("model") or "gpt-4o-mini"
-    lm_calls = int((usage or {}).get("lm_calls", 0) or 0)
+    usage = usage or {}
+    provider = usage.get("provider") or "openai"
+    model = usage.get("model") or "gpt-4o-mini"
+    lm_calls = int(usage.get("lm_calls", 0) or 0)
 
-    q_tokens = estimate_tokens(str(question or ""), model)
-    trace_tokens = estimate_tokens(str(trace or ""), model)
-    ans_tokens = estimate_tokens(str(answer or ""), model)
-    in_tokens = lm_calls * q_tokens + trace_tokens
-    out_tokens = ans_tokens
-    cost = estimate_cost_usd(in_tokens, out_tokens, model=model, provider=provider)
+    in_tokens = int(usage.get("input_tokens", 0) or 0)
+    out_tokens = int(usage.get("output_tokens", 0) or 0)
+    if in_tokens == 0 and out_tokens == 0:
+        q_tokens = estimate_tokens(str(question or ""), model)
+        trace_tokens = estimate_tokens(str(trace or ""), model)
+        ans_tokens = estimate_tokens(str(answer or ""), model)
+        in_tokens = lm_calls * q_tokens + trace_tokens
+        out_tokens = ans_tokens
+
+    cost = usage.get("cost")
+    if cost is None or cost == 0:
+        cost = estimate_cost_usd(in_tokens, out_tokens, model=model, provider=provider)
+    else:
+        cost = float(cost)
     return {
         "input_tokens": in_tokens,
         "output_tokens": out_tokens,
diff --git a/micro_agent/tools.py b/micro_agent/tools.py
@@ -29,7 +29,12 @@ def spec(self) -> Dict[str, Any]:
 def _eval_expr(node):
     # Python 3.10+: numeric literals appear as ast.Constant
     if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)) and not isinstance(node.value, bool):
-        return node.value
+        v = node.value
+        if isinstance(v, float) and not math.isfinite(v):
+            raise ValueError("number not finite")
+        if abs(v) > MAX_ABS_NUMBER:
+            raise ValueError("number too large")
+        return v
     if isinstance(node, ast.BinOp) and type(node.op) in ALLOWED_OPS:
         lv, rv = _eval_expr(node.left), _eval_expr(node.right)
         if isinstance(lv, (int, float)) and abs(lv) > MAX_ABS_NUMBER: raise ValueError("number too large")
@@ -80,6 +85,15 @@ def preprocess_math(expr: str) -> str:
     # Replace simple factorial forms like 9! or 12! with fact(9) / fact(12)
     expr = str(expr or "").strip()
     expr = re.sub(r"(\d+)\!", r"fact(\1)", expr)
+    # Normalize common Unicode operators and dashes to their ASCII equivalents
+    expr = (
+        expr
+        .replace("\u00d7", "*")  # ×
+        .replace("\u00f7", "/")  # ÷
+        .replace("\u2212", "-")  # −
+        .replace("\u2013", "-")  # –
+        .replace("\u2014", "-")  # —
+    )
     # Replace caret ^ with exponentiation
     expr = expr.replace("^", "**")
     # Trim trailing punctuation that commonly slips from prose
@@ -103,7 +117,9 @@ def tool_calculator(args: Dict[str, Any]):
 
 def tool_now(args: Dict[str, Any]):
     tz = str(args.get("timezone", "local")).lower()
-    now = datetime.datetime.now(datetime.timezone.utc) if tz == "utc" else datetime.datetime.now()
+    if tz not in {"utc", "local"}:
+        raise ValueError("timezone must be 'utc' or 'local'")
+    now = datetime.datetime.now(datetime.timezone.utc) if tz == "utc" else datetime.datetime.now().astimezone()
     return {"iso": now.isoformat(timespec="seconds")}
 
 def _load_plugins():
@@ -130,13 +146,13 @@ def _load_plugins():
     "calculator": Tool(
         "calculator",
         "Evaluate arithmetic expressions. Schema: {expression: string}. Supports +,-,*,/,**,%, //, parentheses.",
-        {"type": "object", "properties": {"expression": {"type": "string"}}, "required": ["expression"]},
+        {"type": "object", "properties": {"expression": {"type": "string"}}, "required": ["expression"], "additionalProperties": False},
         tool_calculator
     ),
     "now": Tool(
         "now",
         "Return the current timestamp. Optional: {timezone: 'utc'|'local'}",
-        {"type": "object", "properties": {"timezone": {"type": "string"}}, "required": []},
+        {"type": "object", "properties": {"timezone": {"type": "string", "enum": ["utc", "local"]}}, "required": [], "additionalProperties": False},
         tool_now
     ),
 }
diff --git a/tests/test_regressions.py b/tests/test_regressions.py
@@ -1,11 +1,13 @@
 import datetime
 import json
+import re
 import pytest
 
 from micro_agent.config import configure_lm
 from micro_agent.agent import MicroAgent
 from micro_agent.tools import safe_eval_math
 from micro_agent import runtime
+from micro_agent.tools import run_tool
 
 
 def test_no_false_time_trigger_on_update(monkeypatch):
@@ -49,3 +51,18 @@ def test_dump_trace_serializes_non_json(tmp_path, monkeypatch):
 def test_result_magnitude_limit():
     with pytest.raises(ValueError):
         safe_eval_math("1000000*10000000")
+
+
+def test_unicode_multiply():
+    assert safe_eval_math("3\u00d74") == 12
+
+
+def test_now_local_has_offset():
+    obs = run_tool("now", {"timezone": "local"})
+    assert "iso" in obs
+    assert re.search(r"[+-]\d\d:\d\d$", obs["iso"])
+
+
+def test_now_invalid_timezone_validation():
+    obs = run_tool("now", {"timezone": "pst"})
+    assert "error" in obs and "validation" in obs["error"]