from __future__ import annotations

from dataclasses import dataclass
from typing import Mapping, Sequence, List, Set

import pandas as pd

# scikit-learn is an optional dependency: keep the module importable without it
# and fail lazily via _ensure_sklearn() when a diagnostic actually needs it.
try:
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.metrics import classification_report
except Exception:  # pragma: no cover
    MultiLabelBinarizer = None
    classification_report = None

# Canonical label name -> human-readable definition.
LabelDict = Mapping[str, str]
def _ensure_sklearn() -> None:
    """Raise ImportError if the optional scikit-learn dependency is unavailable.

    The module-level try/except sets both names to ``None`` when the import
    fails, so checking either for ``None`` detects a missing install.
    """
    if MultiLabelBinarizer is None or classification_report is None:
        raise ImportError(
            "scikit-learn is required for ground-truth diagnostics. "
            "Install with: pip install scikit-learn"
        )
def evaluate_with_ground_truth(
    y_actual: Sequence[Sequence[str]],
    y_pred: Sequence[Sequence[str]],
    label_dict: LabelDict,
    *,
    zero_division: int = 0,
) -> str:
    """
    Multi-label evaluation with ground truth using sklearn classification_report.

    Inputs
    ------
    y_actual : list[list[str]]
        Ground truth labels per sample
    y_pred : list[list[str]]
        Predicted labels per sample
    label_dict : dict[str, str]
        Canonical labels (keys define label set + order)
    zero_division : int
        Passed through to sklearn; value reported for metrics with zero denominators

    Output
    ------
    str : raw sklearn classification report text

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    ValueError
        If y_actual and y_pred differ in length.
    """
    _ensure_sklearn()
    if len(y_actual) != len(y_pred):
        raise ValueError(
            f"y_actual and y_pred must have the same length. Got {len(y_actual)} vs {len(y_pred)}."
        )
    labels = list(label_dict.keys())
    # Explicit `classes` pins the binarized column order to label_dict's key
    # order; labels outside the canonical set are ignored by the binarizer.
    mlb = MultiLabelBinarizer(classes=labels)
    Y_true = mlb.fit_transform(y_actual)
    Y_pred = mlb.transform(y_pred)
    return classification_report(
        Y_true,
        Y_pred,
        target_names=mlb.classes_,
        zero_division=zero_division,
    )
@dataclass(frozen=True)
class AlignmentSummary:
    """Overall agreement between two label outputs (consistency, not accuracy)."""

    n_samples: int  # number of paired samples compared
    exact_match_rate: float  # fraction of samples whose label sets match exactly
    sample_jaccard_avg: float  # mean per-sample Jaccard overlap of label sets
def label_alignment_report(
    y_main: Sequence[Sequence[str]],
    y_judge: Sequence[Sequence[str]],
    label_dict: LabelDict,
    *,
    include_unknown_labels: bool = True,
    empty_union_alignment: float = 1.0,  # if neither model predicts a label anywhere
) -> tuple[pd.DataFrame, AlignmentSummary]:
    """
    Agreement/alignment diagnostics between two LLM outputs (main vs judge).
    This is NOT accuracy—just consistency.

    Inputs
    ------
    y_main : list[list[str]]
    y_judge : list[list[str]]
    label_dict : dict[str, str] (keys define canonical label list/order)
    include_unknown_labels : bool
        Also report labels observed in either output but absent from label_dict
    empty_union_alignment : float
        Alignment rate assigned to a label neither model ever predicted

    Outputs
    -------
    report_df : pd.DataFrame
        Per-label alignment table (worst-aligned first)
    summary : AlignmentSummary
        Overall agreement: exact set match rate + average per-sample Jaccard

    Raises
    ------
    ValueError
        If y_main and y_judge differ in length.
    """
    if len(y_main) != len(y_judge):
        raise ValueError(
            f"y_main and y_judge must have the same length. Got {len(y_main)} vs {len(y_judge)}."
        )
    base_labels = list(label_dict.keys())
    main_sets: List[Set[str]] = [set(r) for r in y_main]
    judge_sets: List[Set[str]] = [set(r) for r in y_judge]

    labels = base_labels
    if include_unknown_labels:
        # Labels seen in either output but not in the canonical set are
        # appended after the canonical labels, in sorted order.
        observed: Set[str] = set().union(*main_sets, *judge_sets)
        known = set(base_labels)  # hoisted: avoid rebuilding per element
        unknown = sorted(l for l in observed if l not in known)
        labels = base_labels + unknown
    n = len(main_sets)

    # Overall summary: exact set matches + per-sample Jaccard overlap.
    exact = 0
    jacc_sum = 0.0
    for a, b in zip(main_sets, judge_sets):
        if a == b:
            exact += 1
        u = a | b
        # Two empty predictions count as perfect agreement (Jaccard of 1.0).
        jacc_sum += (len(a & b) / len(u)) if u else 1.0
    summary = AlignmentSummary(
        n_samples=n,
        exact_match_rate=(exact / n) if n else 0.0,
        sample_jaccard_avg=(jacc_sum / n) if n else 0.0,
    )

    # Per-label aggregation: count joint vs one-sided predictions.
    rows = []
    for lab in labels:
        both = main_only = judge_only = 0
        for a, b in zip(main_sets, judge_sets):
            in_a = lab in a
            in_b = lab in b
            if in_a and in_b:
                both += 1
            elif in_a:
                main_only += 1
            elif in_b:
                judge_only += 1
        union = both + main_only + judge_only
        alignment_rate = (both / union) if union > 0 else float(empty_union_alignment)
        rows.append(
            {
                "label": lab,
                "definition": label_dict.get(lab, ""),
                "both": both,
                "main_only": main_only,
                "judge_only": judge_only,
                "union": union,
                "alignment_rate": alignment_rate,
                "coverage_main": both + main_only,
                "coverage_judge": both + judge_only,
                "delta_coverage": (both + main_only) - (both + judge_only),
            }
        )
    report_df = (
        pd.DataFrame(rows)
        .sort_values(by=["alignment_rate", "union"], ascending=[True, False])
        .reset_index(drop=True)
    )
    return report_df, summary