from __future__ import annotations

from dataclasses import dataclass
from typing import Mapping, Sequence, List, Set

import pandas as pd

# scikit-learn is an optional dependency: keep the module importable without it
# and fail lazily via _ensure_sklearn() when a diagnostic actually needs it.
try:
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.metrics import classification_report
except Exception:  # pragma: no cover
    MultiLabelBinarizer = None
    classification_report = None

# Canonical label name -> human-readable definition.
LabelDict = Mapping[str, str]
def _ensure_sklearn() -> None:
    """Raise ImportError if the optional scikit-learn dependency is unavailable.

    The module-level try/except sets both names to ``None`` when the import
    fails, so checking either for ``None`` detects a missing install.
    """
    if MultiLabelBinarizer is None or classification_report is None:
        raise ImportError(
            "scikit-learn is required for ground-truth diagnostics. "
            "Install with: pip install scikit-learn"
        )
def evaluate_with_ground_truth(
    y_actual: Sequence[Sequence[str]],
    y_pred: Sequence[Sequence[str]],
    label_dict: LabelDict,
    *,
    zero_division: int = 0,
) -> str:
    """
    Multi-label evaluation with ground truth using sklearn classification_report.

    Inputs
    ------
    y_actual : list[list[str]]
        Ground truth labels per sample
    y_pred : list[list[str]]
        Predicted labels per sample
    label_dict : dict[str, str]
        Canonical labels (keys define label set + order)
    zero_division : int
        Passed through to sklearn; value reported for metrics with zero denominators

    Output
    ------
    str : raw sklearn classification report text

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    ValueError
        If y_actual and y_pred differ in length.
    """
    _ensure_sklearn()
    if len(y_actual) != len(y_pred):
        raise ValueError(
            f"y_actual and y_pred must have the same length. Got {len(y_actual)} vs {len(y_pred)}."
        )
    labels = list(label_dict.keys())
    # Explicit `classes` pins the binarized column order to label_dict's key
    # order; labels outside the canonical set are ignored by the binarizer.
    mlb = MultiLabelBinarizer(classes=labels)
    Y_true = mlb.fit_transform(y_actual)
    Y_pred = mlb.transform(y_pred)
    return classification_report(
        Y_true,
        Y_pred,
        target_names=mlb.classes_,
        zero_division=zero_division,
    )
@dataclass(frozen=True)
class AlignmentSummary:
    """Overall agreement between two label outputs (consistency, not accuracy)."""

    n_samples: int  # number of paired samples compared
    exact_match_rate: float  # fraction of samples whose label sets match exactly
    sample_jaccard_avg: float  # mean per-sample Jaccard overlap of label sets
def label_alignment_report(
    y_main: Sequence[Sequence[str]],
    y_judge: Sequence[Sequence[str]],
    label_dict: LabelDict,
    *,
    include_unknown_labels: bool = True,
    empty_union_alignment: float = 1.0,  # if neither model predicts a label anywhere
) -> tuple[pd.DataFrame, AlignmentSummary]:
    """
    Agreement/alignment diagnostics between two LLM outputs (main vs judge).
    This is NOT accuracy—just consistency.

    Inputs
    ------
    y_main : list[list[str]]
    y_judge : list[list[str]]
    label_dict : dict[str, str] (keys define canonical label list/order)
    include_unknown_labels : bool
        Also report labels observed in either output but absent from label_dict
    empty_union_alignment : float
        Alignment rate assigned to a label neither model ever predicted

    Outputs
    -------
    report_df : pd.DataFrame
        Per-label alignment table (worst-aligned first)
    summary : AlignmentSummary
        Overall agreement: exact set match rate + average per-sample Jaccard

    Raises
    ------
    ValueError
        If y_main and y_judge differ in length.
    """
    if len(y_main) != len(y_judge):
        raise ValueError(
            f"y_main and y_judge must have the same length. Got {len(y_main)} vs {len(y_judge)}."
        )
    base_labels = list(label_dict.keys())
    main_sets: List[Set[str]] = [set(r) for r in y_main]
    judge_sets: List[Set[str]] = [set(r) for r in y_judge]

    labels = base_labels
    if include_unknown_labels:
        # Labels seen in either output but not in the canonical set are
        # appended after the canonical labels, in sorted order.
        observed: Set[str] = set().union(*main_sets, *judge_sets)
        known = set(base_labels)  # hoisted: avoid rebuilding per element
        unknown = sorted(l for l in observed if l not in known)
        labels = base_labels + unknown
    n = len(main_sets)

    # Overall summary: exact set matches + per-sample Jaccard overlap.
    exact = 0
    jacc_sum = 0.0
    for a, b in zip(main_sets, judge_sets):
        if a == b:
            exact += 1
        u = a | b
        # Two empty predictions count as perfect agreement (Jaccard of 1.0).
        jacc_sum += (len(a & b) / len(u)) if u else 1.0
    summary = AlignmentSummary(
        n_samples=n,
        exact_match_rate=(exact / n) if n else 0.0,
        sample_jaccard_avg=(jacc_sum / n) if n else 0.0,
    )

    # Per-label aggregation: count joint vs one-sided predictions.
    rows = []
    for lab in labels:
        both = main_only = judge_only = 0
        for a, b in zip(main_sets, judge_sets):
            in_a = lab in a
            in_b = lab in b
            if in_a and in_b:
                both += 1
            elif in_a:
                main_only += 1
            elif in_b:
                judge_only += 1
        union = both + main_only + judge_only
        alignment_rate = (both / union) if union > 0 else float(empty_union_alignment)
        rows.append(
            {
                "label": lab,
                "definition": label_dict.get(lab, ""),
                "both": both,
                "main_only": main_only,
                "judge_only": judge_only,
                "union": union,
                "alignment_rate": alignment_rate,
                "coverage_main": both + main_only,
                "coverage_judge": both + judge_only,
                "delta_coverage": (both + main_only) - (both + judge_only),
            }
        )
    report_df = (
        pd.DataFrame(rows)
        .sort_values(by=["alignment_rate", "union"], ascending=[True, False])
        .reset_index(drop=True)
    )
    return report_df, summary