Skip to content

Ibrokhimsadikov/Final-Projecct

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

30 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

diagnostics.py

from __future__ import annotations

from dataclasses import dataclass
from typing import Mapping, Sequence, List, Set

import pandas as pd

# scikit-learn is an optional dependency: only the ground-truth evaluation
# path needs it, so fall back to None sentinels and fail lazily via
# _ensure_sklearn() instead of at import time.
try:
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.metrics import classification_report
except Exception:  # pragma: no cover
    MultiLabelBinarizer = None
    classification_report = None

# Canonical label name -> human-readable definition/description.
LabelDict = Mapping[str, str]

def _ensure_sklearn() -> None: if MultiLabelBinarizer is None or classification_report is None: raise ImportError( "scikit-learn is required for ground-truth diagnostics. " "Install with: pip install scikit-learn" )

# -------------------------
# 1) Ground-truth evaluation
# -------------------------

def evaluate_with_ground_truth(
    y_actual: Sequence[Sequence[str]],
    y_pred: Sequence[Sequence[str]],
    label_dict: LabelDict,
    *,
    zero_division: int = 0,
) -> str:
    """
    Multi-label evaluation with ground truth using sklearn classification_report.

    Inputs
    ------
    y_actual : list[list[str]]
        Ground truth labels per sample
    y_pred : list[list[str]]
        Predicted labels per sample
    label_dict : dict[str, str]
        Canonical labels (keys define label set + order)
    zero_division : int
        Forwarded to sklearn; value reported for metrics with zero denominators.

    Output
    ------
    str : raw sklearn classification report text

    Raises
    ------
    ImportError : if scikit-learn is not installed
    ValueError : if the two sequences differ in length
    """
    _ensure_sklearn()

    if len(y_actual) != len(y_pred):
        raise ValueError(
            f"y_actual and y_pred must have the same length. Got {len(y_actual)} vs {len(y_pred)}."
        )

    labels = list(label_dict.keys())

    # classes= pins the column set/order, so fit_transform here only records
    # the fixed classes; labels outside `labels` are ignored by transform.
    mlb = MultiLabelBinarizer(classes=labels)
    Y_true = mlb.fit_transform(y_actual)
    Y_pred = mlb.transform(y_pred)

    return classification_report(
        Y_true,
        Y_pred,
        target_names=mlb.classes_,
        zero_division=zero_division,
    )

# -------------------------
# 2) LLM-as-auditor alignment
# -------------------------

@dataclass(frozen=True)
class AlignmentSummary:
    """Overall agreement between two models' multi-label outputs."""

    # Number of paired samples compared.
    n_samples: int
    # Fraction of samples whose predicted label *sets* match exactly.
    exact_match_rate: float
    # Mean per-sample Jaccard similarity (empty-vs-empty counts as 1.0).
    sample_jaccard_avg: float


def label_alignment_report(
    y_main: Sequence[Sequence[str]],
    y_judge: Sequence[Sequence[str]],
    label_dict: LabelDict,
    *,
    include_unknown_labels: bool = True,
    empty_union_alignment: float = 1.0,  # if neither model predicts a label anywhere
) -> tuple[pd.DataFrame, AlignmentSummary]:
    """
    Agreement/alignment diagnostics between two LLM outputs (main vs judge).
    This is NOT accuracy—just consistency.

    Inputs
    ------
    y_main : list[list[str]]
    y_judge : list[list[str]]
    label_dict : dict[str, str] (keys define canonical label list/order)
    include_unknown_labels : bool
        If True, labels seen in either output but absent from label_dict are
        appended (sorted) after the canonical labels.
    empty_union_alignment : float
        Per-label alignment_rate reported when neither model ever predicts
        the label (union == 0).

    Outputs
    -------
    report_df : pd.DataFrame
        Per-label alignment table (worst-aligned first)
    summary : AlignmentSummary
        Overall agreement: exact set match rate + average per-sample Jaccard

    Raises
    ------
    ValueError : if the two sequences differ in length
    """
    if len(y_main) != len(y_judge):
        raise ValueError(
            f"y_main and y_judge must have the same length. Got {len(y_main)} vs {len(y_judge)}."
        )

    base_labels = list(label_dict.keys())

    main_sets: List[Set[str]] = [set(r) for r in y_main]
    judge_sets: List[Set[str]] = [set(r) for r in y_judge]

    labels = base_labels
    if include_unknown_labels:
        observed: Set[str] = set()
        for s in main_sets:
            observed.update(s)
        for s in judge_sets:
            observed.update(s)
        # Hoist the membership set once; the original rebuilt set(base_labels)
        # for every observed label inside the comprehension condition.
        known = set(base_labels)
        labels = base_labels + sorted(lab for lab in observed if lab not in known)

    n = len(main_sets)

    # Overall summary: exact set match + per-sample Jaccard.
    exact = 0
    jacc_sum = 0.0
    for a, b in zip(main_sets, judge_sets):
        if a == b:
            exact += 1
        union_ab = a | b
        # Two empty predictions agree perfectly by convention.
        jacc_sum += (len(a & b) / len(union_ab)) if union_ab else 1.0

    summary = AlignmentSummary(
        n_samples=n,
        exact_match_rate=(exact / n) if n else 0.0,
        sample_jaccard_avg=(jacc_sum / n) if n else 0.0,
    )

    # Per-label aggregation: confusion-style counts over samples.
    rows = []
    for lab in labels:
        both = main_only = judge_only = 0
        for a, b in zip(main_sets, judge_sets):
            in_a = lab in a
            in_b = lab in b
            if in_a and in_b:
                both += 1
            elif in_a:
                main_only += 1
            elif in_b:
                judge_only += 1

        union = both + main_only + judge_only
        alignment_rate = (both / union) if union > 0 else float(empty_union_alignment)

        rows.append(
            {
                "label": lab,
                "definition": label_dict.get(lab, ""),
                "both": both,
                "main_only": main_only,
                "judge_only": judge_only,
                "union": union,
                "alignment_rate": alignment_rate,
                "coverage_main": both + main_only,
                "coverage_judge": both + judge_only,
                "delta_coverage": (both + main_only) - (both + judge_only),
            }
        )

    # Worst-aligned labels first; ties broken by how often the label occurs.
    report_df = (
        pd.DataFrame(rows)
        .sort_values(by=["alignment_rate", "union"], ascending=[True, False])
        .reset_index(drop=True)
    )

    return report_df, summary

About

Instacart dashboard

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors

Languages