Commit 38b3329

Merge pull request #97 from WecoAI/line-plot-example

Add extract-line-plot example

2 parents 57a9408 + e8b84e1, commit 38b3329

File tree: 3 files changed, +138 −101 lines

examples/extract-line-plot/README.md

Lines changed: 26 additions & 6 deletions
````diff
@@ -1,6 +1,6 @@
-## Extract Line Plot (Chart → CSV) with a VLM
+## Extract Line Plot (Chart → CSV): Accuracy/Cost Optimization for Agentic Workflow
 
-This example is about optimizing an AI feature that turns image of chart into a table in csv format.
+This example demonstrates optimizing an AI feature that turns chart images into CSV tables, showcasing how to use Weco to improve accuracy or reduce cost of a VLM-based extraction workflow.
 
 ### Prerequisites
 
@@ -15,8 +15,9 @@ export OPENAI_API_KEY=your_key_here
 ### Files
 
 - `prepare_data.py`: downloads ChartQA (full) and prepares a 100-sample subset of line charts.
-- `optimize.py`: baseline VLM function (`VLMExtractor.image_to_csv`) to be optimized.
+- `optimize.py`: exposes `extract_csv(image_path)` which returns CSV text plus the per-call cost (helpers stay private).
 - `eval.py`: evaluation harness that runs the baseline on images and reports a similarity score as "accuracy".
+- `guide.md`: optional additional instructions you can feed to Weco via `--additional-instructions guide.md`.
 
 Generated artifacts (gitignored):
 - `subset_line_100/` and `subset_line_100.zip`
@@ -47,12 +48,21 @@ Metric definition (summarized):
 - Per-sample score = 0.2 × header match + 0.8 × Jaccard(similarity of content rows).
 - Reported `accuracy` is the mean score over all evaluated samples.
 
+To emit a secondary `cost` metric that Weco can minimize (while enforcing `accuracy > 0.45`), append `--cost-metric`:
+
+```bash
+uv run --with openai python eval.py --max-samples 10 --num-workers 4 --cost-metric
+```
+
+If the final accuracy falls at or below `0.45`, the reported cost is replaced with a large penalty so Weco keeps searching for higher-accuracy solutions.
+You can tighten or relax this constraint with `--cost-accuracy-threshold`, e.g. `--cost-accuracy-threshold 0.50`.
+
 ### 3) Optimize the baseline with Weco
 
 Run Weco to iteratively improve `optimize.py` using 100 examples and many workers:
 
 ```bash
-weco run --source optimize.py --eval-command 'uv run --with openai python eval.py --max-samples 100 --num-workers 50' --metric accuracy --goal maximize --steps 20 --model gpt-5
+weco run --source optimize.py --eval-command 'uv run --with openai python eval.py --max-samples 100 --num-workers 50' --metric accuracy --goal maximize --steps 20 --model gpt-5 --additional-instructions guide.md
 ```
 
 Arguments:
@@ -63,10 +73,20 @@ Arguments:
 - `--steps 20`: number of optimization iterations.
 - `--model gpt-5`: model used by Weco to propose edits (change as desired).
 
+To minimize cost instead (subject to the accuracy constraint), enable the flag in the eval command and switch the optimization target:
+
+```bash
+weco run --source optimize.py --eval-command 'uv run --with openai python eval.py --max-samples 100 --num-workers 50 --cost-metric' --metric cost --goal minimize --steps 20 --model gpt-5 --additional-instructions guide.md
+```
+
+#### Cost optimization workflow
+- Run the evaluation command with `--cost-metric` once to confirm accuracy meets your threshold and note the baseline cost.
+- Adjust `--cost-accuracy-threshold` if you want to tighten or relax the constraint before launching optimization.
+- Kick off Weco with `--metric cost --goal minimize --additional-instructions guide.md` so the optimizer respects the constraint while acting on the extra tips.
+
 ### Tips
 
 - Ensure your OpenAI key has access to a vision-capable model (default: `gpt-4o-mini` in the eval; change via `--model`).
 - Adjust `--num-workers` to balance throughput and rate limits.
 - You can tweak baseline behavior in `optimize.py` (prompt, temperature) — Weco will explore modifications automatically during optimization.
-
-
+- Include `--additional-instructions guide.md` whenever you run Weco so those cost-conscious hints influence the generated proposals.
````
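The README's metric definition (per-sample score = 0.2 × header match + 0.8 × Jaccard over content rows) can be sketched as below. This is an illustrative reconstruction, not the repository's exact implementation: the function names `jaccard` and `score_sample`, and the simple line-wise row canonicalization, are assumptions.

```python
def jaccard(a: set, b: set) -> float:
    # Jaccard similarity |A ∩ B| / |A ∪ B|; treat two empty sets as a perfect match
    if not a and not b:
        return 1.0
    return len(a & b) / len(a | b)


def score_sample(gt_csv: str, pred_csv: str) -> float:
    # Compare header row exactly, content rows as sets of whole lines
    gt_lines = [ln.strip() for ln in gt_csv.strip().splitlines()]
    pred_lines = [ln.strip() for ln in pred_csv.strip().splitlines()]
    header_match = 1.0 if gt_lines[:1] == pred_lines[:1] else 0.0
    row_sim = jaccard(set(gt_lines[1:]), set(pred_lines[1:]))
    # Weighted combination from the README: 0.2 header + 0.8 row Jaccard
    return 0.2 * header_match + 0.8 * row_sim
```

A perfect prediction scores 1.0; a prediction with the wrong header but one of two rows correct scores 0.8 × (1/3) ≈ 0.27, matching the weighting described above.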

examples/extract-line-plot/eval.py

Lines changed: 47 additions & 11 deletions
```diff
@@ -8,7 +8,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
-from optimize import VLMExtractor
+from optimize import extract_csv
 
 try:
     import matplotlib
@@ -18,6 +18,9 @@
 except Exception:  # pragma: no cover - optional dependency
     plt = None
 
+COST_ACCURACY_THRESHOLD_DEFAULT = 0.45
+COST_CONSTRAINT_PENALTY = 1_000_000.0
+
 
 def read_index(index_csv_path: Path) -> List[Tuple[str, Path, Path]]:
     rows: List[Tuple[str, Path, Path]] = []
@@ -259,14 +262,14 @@ def evaluate_predictions(gt_csv_path: Path, pred_csv_path: Path) -> float:
 
 
 def process_one(
-    extractor: VLMExtractor, base_dir: Path, example_id: str, image_rel: Path, gt_table_rel: Path, output_dir: Path
-) -> Tuple[str, float, Path, Path]:
+    base_dir: Path, example_id: str, image_rel: Path, gt_table_rel: Path, output_dir: Path
+) -> Tuple[str, float, Path, Path, float]:
     image_path = base_dir / image_rel
     gt_csv_path = base_dir / gt_table_rel
-    pred_csv_text = extractor.image_to_csv(image_path)
+    pred_csv_text, cost_usd = extract_csv(image_path)
     pred_path = write_csv(output_dir, example_id, pred_csv_text)
     score = evaluate_predictions(gt_csv_path, pred_path)
-    return example_id, score, pred_path, gt_csv_path
+    return example_id, score, pred_path, gt_csv_path, cost_usd
 
 
 def main() -> None:
@@ -276,6 +279,20 @@ def main() -> None:
     parser.add_argument("--out-dir", type=str, default="predictions")
     parser.add_argument("--max-samples", type=int, default=100)
     parser.add_argument("--num-workers", type=int, default=4)
+    parser.add_argument(
+        "--cost-metric",
+        action="store_true",
+        help=(
+            "When set, also report a `cost:` metric suitable for Weco minimization. "
+            "Requires final accuracy to exceed --cost-accuracy-threshold; otherwise a large penalty is reported."
+        ),
+    )
+    parser.add_argument(
+        "--cost-accuracy-threshold",
+        type=float,
+        default=COST_ACCURACY_THRESHOLD_DEFAULT,
+        help="Minimum accuracy required when --cost-metric is set (default: 0.45).",
+    )
     parser.add_argument(
         "--visualize-dir",
         type=str,
@@ -307,30 +324,31 @@ def main() -> None:
         sys.exit(1)
 
     rows = read_index(index_path)[: args.max_samples]
-    extractor = VLMExtractor()
 
     visualize_dir: Optional[Path] = Path(args.visualize_dir) if args.visualize_dir else None
    visualize_max = max(0, args.visualize_max)
     if visualize_dir and plt is None:
         print("[warn] matplotlib not available; skipping visualization.", file=sys.stderr)
         visualize_dir = None
 
-    print(f"[setup] evaluating {len(rows)} samples using {extractor.model} …", flush=True)
+    print(f"[setup] evaluating {len(rows)} samples …", flush=True)
     start = time.time()
     scores: List[float] = []
+    costs: List[float] = []
     saved_visualizations = 0
 
     with ThreadPoolExecutor(max_workers=max(1, args.num_workers)) as pool:
         futures = [
-            pool.submit(process_one, extractor, base_dir, example_id, image_rel, gt_table_rel, Path(args.out_dir))
+            pool.submit(process_one, base_dir, example_id, image_rel, gt_table_rel, Path(args.out_dir))
             for (example_id, image_rel, gt_table_rel) in rows
         ]
 
         try:
             for idx, fut in enumerate(as_completed(futures), 1):
                 try:
-                    example_id, score, pred_path, gt_csv_path = fut.result()
+                    example_id, score, pred_path, gt_csv_path, cost_usd = fut.result()
                     scores.append(score)
+                    costs.append(cost_usd)
                     if visualize_dir and (visualize_max == 0 or saved_visualizations < visualize_max):
                         out_path = visualize_difference(
                             gt_csv_path,
@@ -346,7 +364,11 @@ def main() -> None:
                     if idx % 5 == 0 or idx == len(rows):
                         elapsed = time.time() - start
                         avg = sum(scores) / len(scores) if scores else 0.0
-                        print(f"[progress] {idx}/{len(rows)} done, avg score: {avg:.4f}, elapsed {elapsed:.1f}s", flush=True)
+                        avg_cost = sum(costs) / len(costs) if costs else 0.0
+                        print(
+                            f"[progress] {idx}/{len(rows)} done, avg score: {avg:.4f}, avg cost: ${avg_cost:.4f}, elapsed {elapsed:.1f}s",
+                            flush=True,
+                        )
                 except Exception as e:
                     print(f"[error] failed on sample {idx}: {e}", file=sys.stderr)
         except KeyboardInterrupt:
@@ -356,7 +378,7 @@ def main() -> None:
     final_score = sum(scores) / len(scores) if scores else 0.0
 
     # Apply cost cap: accuracy is zeroed if average cost/query exceeds $0.02
-    avg_cost_per_query = (extractor.total_cost_usd / extractor.num_queries) if getattr(extractor, "num_queries", 0) else 0.0
+    avg_cost_per_query = (sum(costs) / len(costs)) if costs else 0.0
     if avg_cost_per_query > 0.02:
         print(f"[cost] avg ${avg_cost_per_query:.4f}/query exceeds $0.02 cap; accuracy set to 0.0", flush=True)
         final_score = 0.0
@@ -365,6 +387,20 @@ def main() -> None:
 
     print(f"accuracy: {final_score:.4f}")
 
+    if args.cost_metric:
+        if final_score > args.cost_accuracy_threshold:
+            reported_cost = avg_cost_per_query
+        else:
+            print(
+                (
+                    f"[constraint] accuracy {final_score:.4f} <= "
+                    f"threshold {args.cost_accuracy_threshold:.2f}; reporting penalty ${COST_CONSTRAINT_PENALTY:.1f}"
+                ),
+                flush=True,
+            )
+            reported_cost = COST_CONSTRAINT_PENALTY
+        print(f"cost: {reported_cost:.6f}")
+
 
 if __name__ == "__main__":
     main()
```
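The constraint handling added to `eval.py` above follows a common pattern for constrained single-metric optimization: report the true cost only when accuracy clears the threshold, and otherwise report a huge penalty so a minimizer steers away. A minimal standalone sketch of that gating logic (the function name `reported_cost` is illustrative, not part of the repo):

```python
PENALTY = 1_000_000.0  # large sentinel value, as in the diff's COST_CONSTRAINT_PENALTY


def reported_cost(accuracy: float, avg_cost: float, threshold: float = 0.45) -> float:
    # Emit the real average cost only when the accuracy constraint is satisfied;
    # a strict ">" means accuracy exactly at the threshold still gets the penalty.
    return avg_cost if accuracy > threshold else PENALTY
```

Because Weco only sees the printed `cost:` line, this single scalar encodes both objective and constraint: feasible solutions compete on real cost, infeasible ones are uniformly terrible.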
examples/extract-line-plot/optimize.py

Lines changed: 65 additions & 84 deletions
```diff
@@ -1,19 +1,23 @@
 """
 optimize.py
 
-Baseline implementation of a VLM-driven function that takes an image and returns CSV.
-Weco will optimize the prompt and logic here.
+Exposes a single public entry point `extract_csv` that turns a chart image into CSV text.
+All helper utilities remain private to this module.
 """
 
 import base64
-import threading
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple
 
 from openai import OpenAI
 
+__all__ = ["extract_csv"]
 
-def build_prompt() -> str:
+_DEFAULT_MODEL = "gpt-4o-mini"
+_CLIENT = OpenAI()
+
+
+def _build_prompt() -> str:
     return (
         "You are a precise data extraction model. Given a chart image, extract the underlying data table.\n"
         "Return ONLY the CSV text with a header row and no markdown code fences.\n"
@@ -25,92 +29,69 @@ def build_prompt() -> str:
     )
 
 
-def image_to_data_uri(image_path: Path) -> str:
+def _image_to_data_uri(image_path: Path) -> str:
     mime = "image/png" if image_path.suffix.lower() == ".png" else "image/jpeg"
     data = image_path.read_bytes()
     b64 = base64.b64encode(data).decode("ascii")
     return f"data:{mime};base64,{b64}"
 
 
-def clean_to_csv(text: str) -> str:
+def _clean_to_csv(text: str) -> str:
     return text.strip()
 
 
-class VLMExtractor:
-    """Baseline VLM wrapper for chart-to-CSV extraction."""
-
-    def __init__(self, model: str = "gpt-4o-mini", client: Optional[OpenAI] = None) -> None:
-        self.model = model
-        self.client = client or OpenAI()
-        # Aggregates
-        self.total_prompt_tokens: int = 0
-        self.total_completion_tokens: int = 0
-        self.total_cost_usd: float = 0.0
-        self.num_queries: int = 0
-        self._usage_lock = threading.Lock()
-
-    def _pricing_for_model(self) -> dict:
-        """Return pricing for current model in USD per token.
-
-        Structure: {"in": x, "in_cached": y, "out": z}
-        Defaults to GPT-5 mini if model not matched.
-        """
-        name = (self.model or "").lower()
-        # Prices are given per 1M tokens in the spec; convert to per-token
-        per_million = {
-            "gpt-5": {"in": 1.250, "in_cached": 0.125, "out": 10.000},
-            "gpt-5-mini": {"in": 0.250, "in_cached": 0.025, "out": 2.000},
-            "gpt-5-nano": {"in": 0.050, "in_cached": 0.005, "out": 0.400},
-        }
-        # Pick by prefix
-        if name.startswith("gpt-5-nano"):
-            chosen = per_million["gpt-5-nano"]
-        elif name.startswith("gpt-5-mini"):
-            chosen = per_million["gpt-5-mini"]
-        elif name.startswith("gpt-5"):
-            chosen = per_million["gpt-5"]
-        else:
-            chosen = per_million["gpt-5-mini"]
-        # Convert per 1M to per token
-        return {k: v / 1_000_000.0 for k, v in chosen.items()}
-
-    def image_to_csv(self, image_path: Path) -> str:
-        prompt = build_prompt()
-        image_uri = image_to_data_uri(image_path)
-        response = self.client.chat.completions.create(
-            model=self.model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": image_uri}}],
-                }
-            ],
+def _pricing_for_model(model_name: str) -> dict:
+    """Return pricing information for the given model in USD per token."""
+    name = (model_name or "").lower()
+    per_million = {
+        "gpt-5": {"in": 1.250, "in_cached": 0.125, "out": 10.000},
+        "gpt-5-mini": {"in": 0.250, "in_cached": 0.025, "out": 2.000},
+        "gpt-5-nano": {"in": 0.050, "in_cached": 0.005, "out": 0.400},
+    }
+    if name.startswith("gpt-5-nano"):
+        chosen = per_million["gpt-5-nano"]
+    elif name.startswith("gpt-5-mini"):
+        chosen = per_million["gpt-5-mini"]
+    elif name.startswith("gpt-5"):
+        chosen = per_million["gpt-5"]
+    else:
+        chosen = per_million["gpt-5-mini"]
+    return {k: v / 1_000_000.0 for k, v in chosen.items()}
+
+
+def extract_csv(image_path: Path, model: Optional[str] = None) -> Tuple[str, float]:
+    """
+    Extract CSV text from an image and return (csv_text, cost_usd).
+
+    The caller can optionally override the model name; otherwise the default is used.
+    """
+    effective_model = model or _DEFAULT_MODEL
+    prompt = _build_prompt()
+    image_uri = _image_to_data_uri(image_path)
+    response = _CLIENT.chat.completions.create(
+        model=effective_model,
+        messages=[
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": image_uri}}],
+            }
+        ],
+    )
+
+    usage = getattr(response, "usage", None)
+    cost_usd = 0.0
+    if usage is not None:
+        prompt_tokens = int(getattr(usage, "prompt_tokens", 0) or 0)
+        completion_tokens = int(getattr(usage, "completion_tokens", 0) or 0)
+        details = getattr(usage, "prompt_tokens_details", None)
+        cached_tokens = 0
+        if details is not None:
+            cached_tokens = int(getattr(details, "cached_tokens", 0) or 0)
+        non_cached_prompt_tokens = max(0, prompt_tokens - cached_tokens)
+        rates = _pricing_for_model(effective_model)
+        cost_usd = (
+            non_cached_prompt_tokens * rates["in"] + cached_tokens * rates["in_cached"] + completion_tokens * rates["out"]
         )
-        # Track usage and cost if available
-        usage = getattr(response, "usage", None)
-        with self._usage_lock:
-            if usage is not None:
-                prompt_tokens = int(getattr(usage, "prompt_tokens", 0) or 0)
-                completion_tokens = int(getattr(usage, "completion_tokens", 0) or 0)
-                # Attempt to detect cached tokens if available
-                details = getattr(usage, "prompt_tokens_details", None)
-                cached_tokens = 0
-                if details is not None:
-                    cached_tokens = int(getattr(details, "cached_tokens", 0) or 0)
-                non_cached_prompt_tokens = max(0, prompt_tokens - cached_tokens)
-
-                rates = self._pricing_for_model()
-                cost = (
-                    non_cached_prompt_tokens * rates["in"]
-                    + cached_tokens * rates["in_cached"]
-                    + completion_tokens * rates["out"]
-                )
-
-                self.total_prompt_tokens += prompt_tokens
-                self.total_completion_tokens += completion_tokens
-                self.total_cost_usd += cost
-                self.num_queries += 1
-            else:
-                self.num_queries += 1
-        text = response.choices[0].message.content or ""
-        return clean_to_csv(text)
+
+    text = response.choices[0].message.content or ""
+    return _clean_to_csv(text), cost_usd
```
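The per-call cost computed in `extract_csv` above is a weighted sum of non-cached prompt tokens, cached prompt tokens, and completion tokens, using per-1M-token prices converted to per-token rates. A self-contained sketch of that arithmetic (using the gpt-5-mini price row from the diff; the names `RATES` and `call_cost` are illustrative):

```python
# Per-1M-token prices (gpt-5-mini row from the diff), converted to per-token rates
PER_MILLION = {"in": 0.250, "in_cached": 0.025, "out": 2.000}
RATES = {k: v / 1_000_000.0 for k, v in PER_MILLION.items()}


def call_cost(prompt_tokens: int, cached_tokens: int, completion_tokens: int) -> float:
    # Cached prompt tokens are billed at the discounted "in_cached" rate;
    # only the remainder pays the full input rate.
    non_cached = max(0, prompt_tokens - cached_tokens)
    return non_cached * RATES["in"] + cached_tokens * RATES["in_cached"] + completion_tokens * RATES["out"]
```

For example, 1000 uncached prompt tokens plus 100 completion tokens cost 1000 × 0.25/1M + 100 × 2.0/1M = $0.00045, well under the evaluation's $0.02-per-query cap.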