feat(evals): approximate cost using token estimates; ci(release): add PyPI publish workflow on tags; docs: note cost env vars in Evals section

haasonsaas · haasonsaas · commit c1a85a278305 · 2025-09-08T19:58:59.000-07:00
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,26 @@
+name: Release  # builds an sdist and wheel, then publishes to PyPI on version tags
+
+on:
+  push:
+    tags:
+      - 'v*'  # runs on pushed tags matching v* (e.g. v1.2.3)
+
+jobs:
+  build-and-publish:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read  # checkout only needs read access
+      id-token: write  # NOTE(review): OIDC permission (PyPI trusted publishing); a password is also passed below - confirm which auth is intended
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Build sdist and wheel
+        run: |
+          python -m pip install --upgrade pip build
+          python -m build
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}  # repository secret; presumably a PyPI API token - verify it is configured
diff --git a/evals/run_evals.py b/evals/run_evals.py
@@ -4,6 +4,7 @@
 from micro_agent.config import configure_lm
 from micro_agent.agent import MicroAgent
 from micro_agent.runtime import new_trace_id, dump_trace
+from micro_agent.costs import estimate_tokens, estimate_cost_usd
 
 def load_yaml(path: str):
     with open(path, "r", encoding="utf-8") as f:
@@ -70,10 +71,21 @@ def main():
         latencies.append(dt)
         # Basic usage tracking (provided by MicroAgent)
         usage = getattr(pred, "usage", {}) or {}
-        lm_calls_list.append(int(usage.get("lm_calls", 0) or 0))
+        lm_calls = int(usage.get("lm_calls", 0) or 0)
+        lm_calls_list.append(lm_calls)
         tool_calls_list.append(int(usage.get("tool_calls", 0) or 0))
         steps_list.append(len(pred.trace or []))
-        costs_list.append(float(usage.get("cost", 0.0) or 0.0))
+
+        # Approximate USD cost per run from heuristic token estimates
+        provider = usage.get("provider") or "openai"
+        model = usage.get("model") or "gpt-4o-mini"
+        q_text = str(q)
+        trace_text = json.dumps(pred.trace, ensure_ascii=False)
+        ans_text = str(pred.answer or "")
+        # Rough input tokens ~ (lm_calls * question tokens) + tokens of the JSON-serialized trace
+        in_tokens = lm_calls * estimate_tokens(q_text, model=model) + estimate_tokens(trace_text, model=model)
+        out_tokens = estimate_tokens(ans_text, model=model)
+        costs_list.append(estimate_cost_usd(in_tokens, out_tokens, model=model, provider=provider))
 
         print(f"[{i}/{len(dataset)}] s={s} t={dt:.2f}s  q={q!r}")