Skip to content
This repository was archived by the owner on Mar 16, 2026. It is now read-only.

Commit 9e145ff

Browse files
committed
feat: introduce tool reliability scoring and enhance tool selection
- Added a scoring system for tools based on runtime performance, influencing tool selection over time. - Implemented `hivemind tools` command to list tools with reliability scores and options to reset score history. - Enhanced `hivemind doctor` and `hivemind analytics` to include tool scoring checks and reports. - Updated `run_tool` to record tool usage for scoring, and modified tool selection to consider reliability. - Introduced tests for the new scoring functionality and tool selection logic.
1 parent b76d270 commit 9e145ff

18 files changed

+1202
-22
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [1.3.0] - 2026-03-09
11+
12+
### Added
13+
14+
- **Tool Reliability Scoring (v1.3)** — Every tool gets a runtime score from real usage; scores feed into tool selection so unreliable tools are demoted over time.
15+
- **`hivemind tools`** — List registered tools with reliability scores (table: Tool Name, Category, Score, Label, Success Rate, Avg Latency, Calls, Last Used). Options: `--category <name>`, `--poor` (score < 0.40). Subcommand `reset <tool_name>` or `reset --all` (with confirmation) wipes score history. Rows colored by label (excellent=green, good=default, degraded=yellow, poor=red); tools with <5 calls show "new".
16+
- **Scoring module** (`hivemind/tools/scoring/`): `ToolScoreStore` (SQLite at `~/.config/hivemind/tool_scores.db`), `ToolScore` dataclass, `record_tool_result`, `get_tool_score`, `get_default_score_store`; `compute_composite_score` and `score_label` in `scorer.py`; `select_tools_scored` (70% similarity + 30% reliability) in `selector.py`; `generate_tools_report` in `report.py`. New tools (<5 calls) get neutral 0.75; `HIVEMIND_DISABLE_TOOL_SCORING=1` bypasses scoring for tests.
17+
- **`hivemind doctor`** — Tool scoring checks: info line "Tool scoring database: {N} records, {M} tools tracked"; warns if >20% of tools with 10+ calls are poor; suggests `hivemind tools reset <name>` for tools with 0% success and ≥10 calls.
18+
- **`hivemind analytics`** — Appends tool reliability report (summary, top 3, bottom 3) when scores exist.
19+
- **Agent** — Uses blended tool selection (similarity × reliability) and passes `task_type` (role or "general") into tool runner for per-context scoring.
20+
- **Tests** — `tests/test_tool_scoring.py`: composite score (new/reliable/dead), score_label, store record/retrieve/prune/reset, selector prefers reliable, similarity dominates, env bypass.
21+
22+
### Changed
23+
24+
- `run_tool(name, args, task_type=None)` now records each run to the scoring store (success/failure, latency, error_type).
25+
- `get_tools_for_task(..., score_store=None)` uses `select_tools_scored` when a score store is provided and scoring is not disabled.
26+
1027
## [1.2.0] - 2026-03-09
1128

1229
### Added

hivemind/agents/agent.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,18 @@ def _run_with_tools(self, task: Task, memory_section: str = "", role_prefix: str
179179
from hivemind.tools.tool_runner import run_tool
180180

181181
role = getattr(task, "role", None)
182-
tools = get_tools_for_task(task.description if task else "", role=role)
182+
task_type = role or "general"
183+
score_store = None
184+
try:
185+
from hivemind.tools.scoring import get_default_score_store
186+
score_store = get_default_score_store()
187+
except Exception:
188+
pass
189+
tools = get_tools_for_task(
190+
task.description if task else "",
191+
role=role,
192+
score_store=score_store,
193+
)
183194
tools_section = _format_tools_section(tools)
184195
prompt = PROMPT_TEMPLATE_WITH_TOOLS.format(
185196
role_prefix=role_prefix,
@@ -194,7 +205,7 @@ def _run_with_tools(self, task: Task, memory_section: str = "", role_prefix: str
194205
tool_name, tool_args = _parse_tool_call(response)
195206
if tool_name is None:
196207
return response.strip()
197-
result = run_tool(tool_name, tool_args)
208+
result = run_tool(tool_name, tool_args, task_type=task_type)
198209
self._emit(
199210
events.TOOL_CALLED,
200211
{"task_id": task.id, "tool": tool_name, "result_preview": result[:200]},

hivemind/cli/init.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,29 @@ def run_doctor() -> int:
451451
except Exception as e:
452452
issues.append(f"Tool registry: {e}")
453453

454+
try:
455+
from hivemind.tools.scoring import get_default_score_store
456+
457+
store = get_default_score_store()
458+
n_records = store.result_count()
459+
n_tools = store.tool_count()
460+
ok.append(f"Tool scoring database: {n_records} records, {n_tools} tools tracked")
461+
if n_tools > 0:
462+
scores = store.get_all_scores()
463+
with_10_plus = [s for s in scores if s.total_calls >= 10]
464+
poor = [s for s in with_10_plus if s.composite_score < 0.40]
465+
if with_10_plus and len(poor) / len(with_10_plus) > 0.20:
466+
warnings.append(
467+
f"Over 20% of tools (10+ calls) are in poor state ({len(poor)}/{len(with_10_plus)}). Consider reviewing with 'hivemind tools --poor'."
468+
)
469+
dead = [s for s in with_10_plus if s.success_rate == 0.0]
470+
for s in dead:
471+
warnings.append(
472+
f"Tool '{s.tool_name}' has 0% success with {s.total_calls} calls. Consider: hivemind tools reset {s.tool_name}"
473+
)
474+
except Exception:
475+
pass
476+
454477
_check_plaintext_keys_in_toml(warnings)
455478

456479
from rich.console import Console

hivemind/cli/main.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,119 @@ def _run_analytics() -> int:
312312
print(
313313
f"{s['tool_name']}: count={s['count']} success_rate={s['success_rate']:.1f}% avg_latency_ms={s['avg_latency_ms']}"
314314
)
315+
try:
316+
from hivemind.tools.scoring import get_default_score_store
317+
from hivemind.tools.scoring.report import generate_tools_report
318+
store = get_default_score_store()
319+
scores = store.get_all_scores()
320+
if scores:
321+
print()
322+
print(generate_tools_report(scores))
323+
except Exception:
324+
pass
325+
return 0
326+
327+
328+
def _run_tools(args: object) -> int:
    """List tools with reliability scores, or reset score history.

    Handles both subcommands of ``hivemind tools``:
      - ``list`` (default): render a rich table of every registered tool with
        its composite reliability score, optionally filtered by ``--category``
        or restricted to poor performers via ``--poor``.
      - ``reset``: wipe score history for one tool (positional name) or all
        tools (``--all``, with interactive confirmation).

    Returns a process exit code (0 on success, 1 on usage error).
    """
    from rich.console import Console
    from rich.prompt import Confirm
    from rich.table import Table

    from hivemind.tools.registry import list_tools
    from hivemind.tools.selector import _tool_category
    from hivemind.tools.scoring import get_default_score_store
    from hivemind.tools.scoring.scorer import score_label

    # argparse may not set every attribute depending on how the command was
    # invoked, so read everything defensively with getattr defaults.
    subcommand = getattr(args, "tools_subcommand", None) or "list"
    category_filter = getattr(args, "category", None)
    poor_only = getattr(args, "poor", False)
    reset_all = getattr(args, "reset_all", False)
    tool_name_reset = getattr(args, "tool_name", None)

    if subcommand == "reset":
        store = get_default_score_store()
        if reset_all:
            # Destructive operation: require explicit interactive confirmation.
            if not Confirm.ask("Wipe all tool scores? This cannot be undone."):
                return 0
            # NOTE(review): passing None presumably means "reset every tool" —
            # confirm against ToolScoreStore.reset.
            store.reset(None)
            print("All tool scores wiped.")
            return 0
        if tool_name_reset:
            store.reset(tool_name_reset)
            print(f"Score history wiped for: {tool_name_reset}")
            return 0
        # reset was requested without a target: usage error.
        print("Usage: hivemind tools reset <tool_name> | hivemind tools reset --all", file=sys.stderr)
        return 1

    # List: all registered tools with scores
    store = get_default_score_store()
    scores_by_name = {s.tool_name: s for s in store.get_all_scores()}
    all_tools = list_tools()
    if category_filter:
        allowed = {category_filter.lower().strip()}
        all_tools = [t for t in all_tools if _tool_category(t) in allowed]
    # Row layout: (name, category, score, label, success_rate, avg_latency_ms,
    # calls, last_used, is_new).
    rows: list[tuple[str, str, float, str, float, float, int, str, bool]] = []
    for t in all_tools:
        s = scores_by_name.get(t.name)
        if s is None:
            # Tool has no recorded history yet: show the neutral 0.75 prior
            # and placeholder stats, labeled "new".
            score_val = 0.75
            label = "new"
            success_rate = 0.0
            avg_lat = 0.0
            calls = 0
            last_used = "-"
            is_new = True
        else:
            score_val = s.composite_score
            label = score_label(s.composite_score)
            success_rate = s.success_rate
            avg_lat = s.avg_latency_ms
            calls = s.total_calls
            # Keep only the date portion of what looks like an ISO timestamp
            # — TODO confirm last_updated format against ToolScoreStore.
            last_used = s.last_updated[:10] if len(s.last_updated) >= 10 else s.last_updated
            is_new = s.is_new
        if poor_only and score_val >= 0.40:
            # --poor shows only tools below the "poor" threshold (0.40).
            continue
        cat = _tool_category(t)
        rows.append((t.name, cat, score_val, label, success_rate, avg_lat, calls, last_used, is_new))

    # Best score first.
    rows.sort(key=lambda r: -r[2])
    table = Table(title="Tool reliability scores")
    table.add_column("Tool Name", style="bold")
    table.add_column("Category")
    table.add_column("Score", justify="right")
    table.add_column("Label")
    table.add_column("Success Rate", justify="right")
    table.add_column("Avg Latency", justify="right")
    table.add_column("Calls", justify="right")
    table.add_column("Last Used")
    for r in rows:
        name, cat, score_val, label, success_rate, avg_lat, calls, last_used, is_new = r
        # Color by label: new=dim, excellent=green, good=default,
        # degraded=yellow, everything else (poor)=red.
        if is_new and label == "new":
            label_style = "dim"
        elif label == "excellent":
            label_style = "green"
        elif label == "good":
            label_style = "default"
        elif label == "degraded":
            label_style = "yellow"
        else:
            label_style = "red"
        table.add_row(
            name,
            cat,
            f"{score_val:.2f}",
            f"[{label_style}]{label}[/]",
            # New tools have no meaningful stats; render dashes instead.
            f"{success_rate:.0%}" if not is_new else "-",
            f"{avg_lat:.0f} ms" if not is_new else "-",
            str(calls),
            last_used,
        )
    console = Console()
    if rows:
        console.print(table)
    else:
        console.print("No tools match the filters.")
    return 0
316429

317430

@@ -592,6 +705,50 @@ def main() -> int:
592705
)
593706
analytics_parser.set_defaults(func=lambda a: _run_analytics())
594707

708+
tools_parser = subparsers.add_parser(
709+
"tools",
710+
help="List tool reliability scores or reset history",
711+
description="List registered tools with reliability scores (excellent/good/degraded/poor), or reset score history.",
712+
epilog="""
713+
Examples:
714+
hivemind tools
715+
hivemind tools --category research
716+
hivemind tools --poor
717+
hivemind tools reset my_tool
718+
hivemind tools reset --all
719+
""",
720+
formatter_class=argparse.RawDescriptionHelpFormatter,
721+
)
722+
tools_parser.add_argument(
723+
"tools_subcommand",
724+
nargs="?",
725+
default="list",
726+
choices=["list", "reset"],
727+
help="list (default) | reset",
728+
)
729+
tools_parser.add_argument(
730+
"tool_name",
731+
nargs="?",
732+
help="Tool name (for reset)",
733+
)
734+
tools_parser.add_argument(
735+
"--category",
736+
metavar="NAME",
737+
help="Filter by category",
738+
)
739+
tools_parser.add_argument(
740+
"--poor",
741+
action="store_true",
742+
help="Show only tools with score < 0.40",
743+
)
744+
tools_parser.add_argument(
745+
"--all",
746+
dest="reset_all",
747+
action="store_true",
748+
help="Wipe all scores (with confirmation; use with reset)",
749+
)
750+
tools_parser.set_defaults(func=_run_tools)
751+
595752
cache_parser = subparsers.add_parser(
596753
"cache",
597754
help="Task result cache",

hivemind/tools/scoring/__init__.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""
2+
Tool reliability scoring: record results, get scores, blended selection.
3+
"""
4+
5+
from hivemind.tools.scoring.store import ToolScore, ToolScoreStore
6+
7+
_default_store: ToolScoreStore | None = None
8+
9+
10+
def get_default_score_store() -> ToolScoreStore:
    """Return the default tool score store (singleton).

    Lazily constructs a module-level ToolScoreStore on first use and hands
    back the same instance on every subsequent call.
    """
    global _default_store
    if _default_store is not None:
        return _default_store
    _default_store = ToolScoreStore()
    return _default_store
16+
17+
18+
def record_tool_result(
    tool_name: str,
    task_type: str | None,
    success: bool,
    latency_ms: int | None = None,
    error_type: str | None = None,
) -> None:
    """Record one tool execution result (success/failure, latency) into the score store.

    Best-effort by design: any failure while persisting the result is
    swallowed so that scoring can never break the tool call itself.
    """
    entry = {
        "tool_name": tool_name,
        "task_type": task_type,
        "success": success,
        "latency_ms": latency_ms,
        "error_type": error_type,
    }
    try:
        get_default_score_store().record(**entry)
    except Exception:
        pass
36+
37+
38+
def get_tool_score(tool_name: str) -> ToolScore | None:
    """Return the current ToolScore for a tool, or None if not tracked.

    Any error (store unavailable, lookup failure) is treated the same as
    "not tracked" and yields None.
    """
    try:
        store = get_default_score_store()
        score = store.get_score(tool_name)
    except Exception:
        return None
    return score
44+
45+
46+
__all__ = [
47+
"record_tool_result",
48+
"get_tool_score",
49+
"get_default_score_store",
50+
"ToolScoreStore",
51+
"ToolScore",
52+
]

hivemind/tools/scoring/report.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""
2+
Formatting for hivemind tools CLI output and analytics summary.
3+
"""
4+
5+
from hivemind.tools.scoring.scorer import score_label
6+
from hivemind.tools.scoring.store import ToolScore
7+
8+
9+
def generate_tools_report(scores: list[ToolScore]) -> str:
    """
    Summary header: total tools, % excellent/good/degraded/poor;
    highlight top 3 and bottom 3 tools.
    """
    if not scores:
        return "No tool scores recorded."

    total = len(scores)
    # Bucket counts by label; score_label's thresholds (0.85/0.65/0.40)
    # define the same bands as the percentage summary.
    buckets = {"excellent": 0, "good": 0, "degraded": 0, "poor": 0}
    for entry in scores:
        buckets[score_label(entry.composite_score)] += 1
    pct = {name: count / total * 100 for name, count in buckets.items()}

    lines = [
        f"Tool reliability: {total} tools tracked",
        f" excellent: {pct['excellent']:.0f}% good: {pct['good']:.0f}% degraded: {pct['degraded']:.0f}% poor: {pct['poor']:.0f}%",
        "",
    ]

    def _line(entry: ToolScore) -> str:
        # One formatted row: "name: score (label)".
        return f" {entry.tool_name}: {entry.composite_score:.2f} ({score_label(entry.composite_score)})"

    ranked = sorted(scores, key=lambda s: -s.composite_score)
    top3 = ranked[:3]
    bottom3 = ranked[-3:] if len(ranked) >= 3 else ranked
    if top3:
        lines.append("Top 3:")
        lines.extend(_line(entry) for entry in top3)
        lines.append("")
    # Skip the bottom section when it would just repeat the top section
    # (fewer than 4 tools tracked).
    if bottom3 and (len(bottom3) < len(top3) or bottom3[0].tool_name != top3[0].tool_name):
        lines.append("Bottom 3:")
        lines.extend(_line(entry) for entry in reversed(bottom3))
    return "\n".join(lines)

hivemind/tools/scoring/scorer.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
ToolScorer: compute composite score from raw stats.
3+
"""
4+
5+
6+
def _clamp(x: float, lo: float, hi: float) -> float:
7+
return max(lo, min(hi, x))
8+
9+
10+
def compute_composite_score(stats: dict) -> float:
11+
"""
12+
Compute composite reliability score from stats dict with keys:
13+
success_rate, avg_latency_ms, total_calls, recent_failures.
14+
Returns 0.05--1.0.
15+
"""
16+
total_calls = stats.get("total_calls", 0)
17+
if total_calls < 5:
18+
return 0.75
19+
success_rate = stats.get("success_rate", 0.0)
20+
if success_rate == 0.0 and total_calls >= 10:
21+
return 0.05
22+
avg_latency_ms = stats.get("avg_latency_ms", 0.0)
23+
recent_failures = stats.get("recent_failures", 0)
24+
reliability = success_rate
25+
speed = 1.0 - _clamp(avg_latency_ms / 10000.0, 0.0, 1.0)
26+
recency = 1.0 - (recent_failures / 20.0)
27+
composite = (reliability * 0.50) + (speed * 0.30) + (recency * 0.20)
28+
return _clamp(composite, 0.05, 1.0)
29+
30+
31+
def score_label(score: float) -> str:
32+
"""Return label for a composite score."""
33+
if score >= 0.85:
34+
return "excellent"
35+
if score >= 0.65:
36+
return "good"
37+
if score >= 0.40:
38+
return "degraded"
39+
return "poor"

0 commit comments

Comments
 (0)