Skip to content
This repository was archived by the owner on Mar 16, 2026. It is now read-only.

Commit 9e145ff

Browse files
committed
feat: introduce tool reliability scoring and enhance tool selection
- Added a scoring system for tools based on runtime performance, influencing tool selection over time. - Implemented `hivemind tools` command to list tools with reliability scores and options to reset score history. - Enhanced `hivemind doctor` and `hivemind analytics` to include tool scoring checks and reports. - Updated `run_tool` to record tool usage for scoring, and modified tool selection to consider reliability. - Introduced tests for the new scoring functionality and tool selection logic.
1 parent b76d270 commit 9e145ff

18 files changed

+1202
-22
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [1.3.0] - 2026-03-09
11+
12+
### Added
13+
14+
- **Tool Reliability Scoring (v1.3)** — Every tool gets a runtime score from real usage; scores feed into tool selection so unreliable tools are demoted over time.
15+
- **`hivemind tools`** — List registered tools with reliability scores (table: Tool Name, Category, Score, Label, Success Rate, Avg Latency, Calls, Last Used). Options: `--category <name>`, `--poor` (score < 0.40). Subcommand `reset <tool_name>` or `reset --all` (with confirmation) wipes score history. Rows colored by label (excellent=green, good=default, degraded=yellow, poor=red); tools with <5 calls show "new".
16+
- **Scoring module** (`hivemind/tools/scoring/`): `ToolScoreStore` (SQLite at `~/.config/hivemind/tool_scores.db`), `ToolScore` dataclass, `record_tool_result`, `get_tool_score`, `get_default_score_store`; `compute_composite_score` and `score_label` in `scorer.py`; `select_tools_scored` (70% similarity + 30% reliability) in `selector.py`; `generate_tools_report` in `report.py`. New tools (<5 calls) get neutral 0.75; `HIVEMIND_DISABLE_TOOL_SCORING=1` bypasses scoring for tests.
17+
- **`hivemind doctor`** — Tool scoring checks: info line "Tool scoring database: {N} records, {M} tools tracked"; warns if >20% of tools with 10+ calls are poor; suggests `hivemind tools reset <name>` for tools with 0% success and ≥10 calls.
18+
- **`hivemind analytics`** — Appends tool reliability report (summary, top 3, bottom 3) when scores exist.
19+
- **Agent** — Uses blended tool selection (similarity × reliability) and passes `task_type` (role or "general") into tool runner for per-context scoring.
20+
- **Tests** — `tests/test_tool_scoring.py`: composite score (new/reliable/dead), score_label, store record/retrieve/prune/reset, selector prefers reliable, similarity dominates, env bypass.
21+
22+
### Changed
23+
24+
- `run_tool(name, args, task_type=None)` now records each run to the scoring store (success/failure, latency, error_type).
25+
- `get_tools_for_task(..., score_store=None)` uses `select_tools_scored` when a score store is provided and scoring is not disabled.
26+
1027
## [1.2.0] - 2026-03-09
1128

1229
### Added

hivemind/agents/agent.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,18 @@ def _run_with_tools(self, task: Task, memory_section: str = "", role_prefix: str
179179
from hivemind.tools.tool_runner import run_tool
180180

181181
role = getattr(task, "role", None)
182-
tools = get_tools_for_task(task.description if task else "", role=role)
182+
task_type = role or "general"
183+
score_store = None
184+
try:
185+
from hivemind.tools.scoring import get_default_score_store
186+
score_store = get_default_score_store()
187+
except Exception:
188+
pass
189+
tools = get_tools_for_task(
190+
task.description if task else "",
191+
role=role,
192+
score_store=score_store,
193+
)
183194
tools_section = _format_tools_section(tools)
184195
prompt = PROMPT_TEMPLATE_WITH_TOOLS.format(
185196
role_prefix=role_prefix,
@@ -194,7 +205,7 @@ def _run_with_tools(self, task: Task, memory_section: str = "", role_prefix: str
194205
tool_name, tool_args = _parse_tool_call(response)
195206
if tool_name is None:
196207
return response.strip()
197-
result = run_tool(tool_name, tool_args)
208+
result = run_tool(tool_name, tool_args, task_type=task_type)
198209
self._emit(
199210
events.TOOL_CALLED,
200211
{"task_id": task.id, "tool": tool_name, "result_preview": result[:200]},

hivemind/cli/init.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,29 @@ def run_doctor() -> int:
451451
except Exception as e:
452452
issues.append(f"Tool registry: {e}")
453453

454+
try:
455+
from hivemind.tools.scoring import get_default_score_store
456+
457+
store = get_default_score_store()
458+
n_records = store.result_count()
459+
n_tools = store.tool_count()
460+
ok.append(f"Tool scoring database: {n_records} records, {n_tools} tools tracked")
461+
if n_tools > 0:
462+
scores = store.get_all_scores()
463+
with_10_plus = [s for s in scores if s.total_calls >= 10]
464+
poor = [s for s in with_10_plus if s.composite_score < 0.40]
465+
if with_10_plus and len(poor) / len(with_10_plus) > 0.20:
466+
warnings.append(
467+
f"Over 20% of tools (10+ calls) are in poor state ({len(poor)}/{len(with_10_plus)}). Consider reviewing with 'hivemind tools --poor'."
468+
)
469+
dead = [s for s in with_10_plus if s.success_rate == 0.0]
470+
for s in dead:
471+
warnings.append(
472+
f"Tool '{s.tool_name}' has 0% success with {s.total_calls} calls. Consider: hivemind tools reset {s.tool_name}"
473+
)
474+
except Exception:
475+
pass
476+
454477
_check_plaintext_keys_in_toml(warnings)
455478

456479
from rich.console import Console

hivemind/cli/main.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,119 @@ def _run_analytics() -> int:
312312
print(
313313
f"{s['tool_name']}: count={s['count']} success_rate={s['success_rate']:.1f}% avg_latency_ms={s['avg_latency_ms']}"
314314
)
315+
try:
316+
from hivemind.tools.scoring import get_default_score_store
317+
from hivemind.tools.scoring.report import generate_tools_report
318+
store = get_default_score_store()
319+
scores = store.get_all_scores()
320+
if scores:
321+
print()
322+
print(generate_tools_report(scores))
323+
except Exception:
324+
pass
325+
return 0
326+
327+
328+
def _run_tools(args: object) -> int:
    """List tools with reliability scores, or reset score history.

    Handles both subcommands of ``hivemind tools``:
      - ``list`` (default): render a rich table of every registered tool with
        its composite reliability score, optionally filtered by ``--category``
        or restricted to poor performers via ``--poor``.
      - ``reset``: wipe score history for one tool (positional name) or all
        tools (``--all``, with interactive confirmation).

    Returns a process exit code (0 on success, 1 on usage error).
    """
    from rich.console import Console
    from rich.prompt import Confirm
    from rich.table import Table

    from hivemind.tools.registry import list_tools
    from hivemind.tools.selector import _tool_category
    from hivemind.tools.scoring import get_default_score_store
    from hivemind.tools.scoring.scorer import score_label

    # argparse may not set every attribute depending on how the command was
    # invoked, so read everything defensively with getattr defaults.
    subcommand = getattr(args, "tools_subcommand", None) or "list"
    category_filter = getattr(args, "category", None)
    poor_only = getattr(args, "poor", False)
    reset_all = getattr(args, "reset_all", False)
    tool_name_reset = getattr(args, "tool_name", None)

    if subcommand == "reset":
        store = get_default_score_store()
        if reset_all:
            # Destructive operation: require explicit interactive confirmation.
            if not Confirm.ask("Wipe all tool scores? This cannot be undone."):
                return 0
            # NOTE(review): passing None presumably means "reset every tool" —
            # confirm against ToolScoreStore.reset.
            store.reset(None)
            print("All tool scores wiped.")
            return 0
        if tool_name_reset:
            store.reset(tool_name_reset)
            print(f"Score history wiped for: {tool_name_reset}")
            return 0
        # reset was requested without a target: usage error.
        print("Usage: hivemind tools reset <tool_name> | hivemind tools reset --all", file=sys.stderr)
        return 1

    # List: all registered tools with scores
    store = get_default_score_store()
    scores_by_name = {s.tool_name: s for s in store.get_all_scores()}
    all_tools = list_tools()
    if category_filter:
        allowed = {category_filter.lower().strip()}
        all_tools = [t for t in all_tools if _tool_category(t) in allowed]
    # Row layout: (name, category, score, label, success_rate, avg_latency_ms,
    # calls, last_used, is_new).
    rows: list[tuple[str, str, float, str, float, float, int, str, bool]] = []
    for t in all_tools:
        s = scores_by_name.get(t.name)
        if s is None:
            # Tool has no recorded history yet: show the neutral 0.75 prior
            # and placeholder stats, labeled "new".
            score_val = 0.75
            label = "new"
            success_rate = 0.0
            avg_lat = 0.0
            calls = 0
            last_used = "-"
            is_new = True
        else:
            score_val = s.composite_score
            label = score_label(s.composite_score)
            success_rate = s.success_rate
            avg_lat = s.avg_latency_ms
            calls = s.total_calls
            # Keep only the date portion of what looks like an ISO timestamp
            # — TODO confirm last_updated format against ToolScoreStore.
            last_used = s.last_updated[:10] if len(s.last_updated) >= 10 else s.last_updated
            is_new = s.is_new
        if poor_only and score_val >= 0.40:
            # --poor shows only tools below the "poor" threshold (0.40).
            continue
        cat = _tool_category(t)
        rows.append((t.name, cat, score_val, label, success_rate, avg_lat, calls, last_used, is_new))

    # Best score first.
    rows.sort(key=lambda r: -r[2])
    table = Table(title="Tool reliability scores")
    table.add_column("Tool Name", style="bold")
    table.add_column("Category")
    table.add_column("Score", justify="right")
    table.add_column("Label")
    table.add_column("Success Rate", justify="right")
    table.add_column("Avg Latency", justify="right")
    table.add_column("Calls", justify="right")
    table.add_column("Last Used")
    for r in rows:
        name, cat, score_val, label, success_rate, avg_lat, calls, last_used, is_new = r
        # Color by label: new=dim, excellent=green, good=default,
        # degraded=yellow, everything else (poor)=red.
        if is_new and label == "new":
            label_style = "dim"
        elif label == "excellent":
            label_style = "green"
        elif label == "good":
            label_style = "default"
        elif label == "degraded":
            label_style = "yellow"
        else:
            label_style = "red"
        table.add_row(
            name,
            cat,
            f"{score_val:.2f}",
            f"[{label_style}]{label}[/]",
            # New tools have no meaningful stats; render dashes instead.
            f"{success_rate:.0%}" if not is_new else "-",
            f"{avg_lat:.0f} ms" if not is_new else "-",
            str(calls),
            last_used,
        )
    console = Console()
    if rows:
        console.print(table)
    else:
        console.print("No tools match the filters.")
    return 0
316429

317430

@@ -592,6 +705,50 @@ def main() -> int:
592705
)
593706
analytics_parser.set_defaults(func=lambda a: _run_analytics())
594707

708+
tools_parser = subparsers.add_parser(
709+
"tools",
710+
help="List tool reliability scores or reset history",
711+
description="List registered tools with reliability scores (excellent/good/degraded/poor), or reset score history.",
712+
epilog="""
713+
Examples:
714+
hivemind tools
715+
hivemind tools --category research
716+
hivemind tools --poor
717+
hivemind tools reset my_tool
718+
hivemind tools reset --all
719+
""",
720+
formatter_class=argparse.RawDescriptionHelpFormatter,
721+
)
722+
tools_parser.add_argument(
723+
"tools_subcommand",
724+
nargs="?",
725+
default="list",
726+
choices=["list", "reset"],
727+
help="list (default) | reset",
728+
)
729+
tools_parser.add_argument(
730+
"tool_name",
731+
nargs="?",
732+
help="Tool name (for reset)",
733+
)
734+
tools_parser.add_argument(
735+
"--category",
736+
metavar="NAME",
737+
help="Filter by category",
738+
)
739+
tools_parser.add_argument(
740+
"--poor",
741+
action="store_true",
742+
help="Show only tools with score < 0.40",
743+
)
744+
tools_parser.add_argument(
745+
"--all",
746+
dest="reset_all",
747+
action="store_true",
748+
help="Wipe all scores (with confirmation; use with reset)",
749+
)
750+
tools_parser.set_defaults(func=_run_tools)
751+
595752
cache_parser = subparsers.add_parser(
596753
"cache",
597754
help="Task result cache",

hivemind/tools/scoring/__init__.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""
2+
Tool reliability scoring: record results, get scores, blended selection.
3+
"""
4+
5+
from hivemind.tools.scoring.store import ToolScore, ToolScoreStore
6+
7+
_default_store: ToolScoreStore | None = None
8+
9+
10+
def get_default_score_store() -> ToolScoreStore:
    """Return the default tool score store (singleton).

    Lazily constructs a module-level ToolScoreStore on first use and hands
    back the same instance on every subsequent call.
    """
    global _default_store
    if _default_store is not None:
        return _default_store
    _default_store = ToolScoreStore()
    return _default_store
16+
17+
18+
def record_tool_result(
    tool_name: str,
    task_type: str | None,
    success: bool,
    latency_ms: int | None = None,
    error_type: str | None = None,
) -> None:
    """Record one tool execution result (success/failure, latency) into the score store.

    Best-effort by design: any failure while persisting the result is
    swallowed so that scoring can never break the tool call itself.
    """
    entry = {
        "tool_name": tool_name,
        "task_type": task_type,
        "success": success,
        "latency_ms": latency_ms,
        "error_type": error_type,
    }
    try:
        get_default_score_store().record(**entry)
    except Exception:
        pass
36+
37+
38+
def get_tool_score(tool_name: str) -> ToolScore | None:
    """Return the current ToolScore for a tool, or None if not tracked.

    Any error (store unavailable, lookup failure) is treated the same as
    "not tracked" and yields None.
    """
    try:
        store = get_default_score_store()
        score = store.get_score(tool_name)
    except Exception:
        return None
    return score
44+
45+
46+
__all__ = [
47+
"record_tool_result",
48+
"get_tool_score",
49+
"get_default_score_store",
50+
"ToolScoreStore",
51+
"ToolScore",
52+
]

hivemind/tools/scoring/report.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""
2+
Formatting for hivemind tools CLI output and analytics summary.
3+
"""
4+
5+
from hivemind.tools.scoring.scorer import score_label
6+
from hivemind.tools.scoring.store import ToolScore
7+
8+
9+
def generate_tools_report(scores: list[ToolScore]) -> str:
    """
    Summary header: total tools, % excellent/good/degraded/poor;
    highlight top 3 and bottom 3 tools.
    """
    if not scores:
        return "No tool scores recorded."

    total = len(scores)
    # Bucket counts by label; score_label's thresholds (0.85/0.65/0.40)
    # define the same bands as the percentage summary.
    buckets = {"excellent": 0, "good": 0, "degraded": 0, "poor": 0}
    for entry in scores:
        buckets[score_label(entry.composite_score)] += 1
    pct = {name: count / total * 100 for name, count in buckets.items()}

    lines = [
        f"Tool reliability: {total} tools tracked",
        f" excellent: {pct['excellent']:.0f}% good: {pct['good']:.0f}% degraded: {pct['degraded']:.0f}% poor: {pct['poor']:.0f}%",
        "",
    ]

    def _line(entry: ToolScore) -> str:
        # One formatted row: "name: score (label)".
        return f" {entry.tool_name}: {entry.composite_score:.2f} ({score_label(entry.composite_score)})"

    ranked = sorted(scores, key=lambda s: -s.composite_score)
    top3 = ranked[:3]
    bottom3 = ranked[-3:] if len(ranked) >= 3 else ranked
    if top3:
        lines.append("Top 3:")
        lines.extend(_line(entry) for entry in top3)
        lines.append("")
    # Skip the bottom section when it would just repeat the top section
    # (fewer than 4 tools tracked).
    if bottom3 and (len(bottom3) < len(top3) or bottom3[0].tool_name != top3[0].tool_name):
        lines.append("Bottom 3:")
        lines.extend(_line(entry) for entry in reversed(bottom3))
    return "\n".join(lines)

hivemind/tools/scoring/scorer.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
ToolScorer: compute composite score from raw stats.
3+
"""
4+
5+
6+
def _clamp(x: float, lo: float, hi: float) -> float:
7+
return max(lo, min(hi, x))
8+
9+
10+
def compute_composite_score(stats: dict) -> float:
11+
"""
12+
Compute composite reliability score from stats dict with keys:
13+
success_rate, avg_latency_ms, total_calls, recent_failures.
14+
Returns 0.05--1.0.
15+
"""
16+
total_calls = stats.get("total_calls", 0)
17+
if total_calls < 5:
18+
return 0.75
19+
success_rate = stats.get("success_rate", 0.0)
20+
if success_rate == 0.0 and total_calls >= 10:
21+
return 0.05
22+
avg_latency_ms = stats.get("avg_latency_ms", 0.0)
23+
recent_failures = stats.get("recent_failures", 0)
24+
reliability = success_rate
25+
speed = 1.0 - _clamp(avg_latency_ms / 10000.0, 0.0, 1.0)
26+
recency = 1.0 - (recent_failures / 20.0)
27+
composite = (reliability * 0.50) + (speed * 0.30) + (recency * 0.20)
28+
return _clamp(composite, 0.05, 1.0)
29+
30+
31+
def score_label(score: float) -> str:
32+
"""Return label for a composite score."""
33+
if score >= 0.85:
34+
return "excellent"
35+
if score >= 0.65:
36+
return "good"
37+
if score >= 0.40:
38+
return "degraded"
39+
return "poor"

0 commit comments

Comments
 (0)