Add tasks Hatexplain and GrepBiasIR #1

Open

imenelydiaker wants to merge 4 commits into mteb-trust from add-tasks

Conversation

imenelydiaker commented Nov 14, 2025

all-MiniLM-L6-v2 results on the tasks:

GrepBiasIRRetrieval
{
  "dataset_revision": "main",
  "task_name": "GREPBiasIRRetrieval",
  "mteb_version": "2.1.4",
  "scores": {
    "test": [
      {
        "ndcg_at_1": 0.86325,
        "ndcg_at_3": 0.84964,
        "ndcg_at_5": 0.87742,
        "ndcg_at_10": 0.90392,
        "ndcg_at_20": 0.90819,
        "ndcg_at_100": 0.91588,
        "ndcg_at_1000": 0.91755,
        "map_at_1": 0.28775,
        "map_at_3": 0.83666,
        "map_at_5": 0.85802,
        "map_at_10": 0.87733,
        "map_at_20": 0.8798,
        "map_at_100": 0.88148,
        "map_at_1000": 0.88159,
        "recall_at_1": 0.28775,
        "recall_at_3": 0.84615,
        "recall_at_5": 0.89459,
        "recall_at_10": 0.95157,
        "recall_at_20": 0.96296,
        "recall_at_100": 0.99145,
        "recall_at_1000": 1.0,
        "accuracy": 0.28775,
        "precision_at_1": 0.86325,
        "precision_at_3": 0.84615,
        "precision_at_5": 0.53675,
        "precision_at_10": 0.28547,
        "precision_at_20": 0.14444,
        "precision_at_100": 0.02974,
        "precision_at_1000": 0.003,
        "mrr_at_1": 0.863248,
        "mrr_at_3": 0.877493,
        "mrr_at_5": 0.889031,
        "mrr_at_10": 0.892532,
        "mrr_at_20": 0.892532,
        "mrr_at_100": 0.893177,
        "mrr_at_1000": 0.893242,
        "nauc_ndcg_at_1_max": 0.744741,
        "nauc_ndcg_at_1_std": -0.355891,
        "nauc_ndcg_at_1_diff1": 0.382095,
        "nauc_ndcg_at_3_max": 0.630925,
        "nauc_ndcg_at_3_std": -0.436977,
        "nauc_ndcg_at_3_diff1": 0.208841,
        "nauc_ndcg_at_5_max": 0.668813,
        "nauc_ndcg_at_5_std": -0.436178,
        "nauc_ndcg_at_5_diff1": 0.233752,
        "nauc_ndcg_at_10_max": 0.693723,
        "nauc_ndcg_at_10_std": -0.498494,
        "nauc_ndcg_at_10_diff1": 0.198619,
        "nauc_ndcg_at_20_max": 0.699205,
        "nauc_ndcg_at_20_std": -0.467621,
        "nauc_ndcg_at_20_diff1": 0.223595,
        "nauc_ndcg_at_100_max": 0.676311,
        "nauc_ndcg_at_100_std": -0.442883,
        "nauc_ndcg_at_100_diff1": 0.247954,
        "nauc_ndcg_at_1000_max": 0.682979,
        "nauc_ndcg_at_1000_std": -0.440399,
        "nauc_ndcg_at_1000_diff1": 0.238014,
        "nauc_map_at_1_max": 0.744741,
        "nauc_map_at_1_std": -0.355891,
        "nauc_map_at_1_diff1": 0.382095,
        "nauc_map_at_3_max": 0.615442,
        "nauc_map_at_3_std": -0.447652,
        "nauc_map_at_3_diff1": 0.166356,
        "nauc_map_at_5_max": 0.642843,
        "nauc_map_at_5_std": -0.444281,
        "nauc_map_at_5_diff1": 0.189366,
        "nauc_map_at_10_max": 0.660518,
        "nauc_map_at_10_std": -0.480911,
        "nauc_map_at_10_diff1": 0.16967,
        "nauc_map_at_20_max": 0.661908,
        "nauc_map_at_20_std": -0.468494,
        "nauc_map_at_20_diff1": 0.181448,
        "nauc_map_at_100_max": 0.658095,
        "nauc_map_at_100_std": -0.462513,
        "nauc_map_at_100_diff1": 0.183387,
        "nauc_map_at_1000_max": 0.658385,
        "nauc_map_at_1000_std": -0.462417,
        "nauc_map_at_1000_diff1": 0.182865,
        "nauc_recall_at_1_max": 0.744741,
        "nauc_recall_at_1_std": -0.355891,
        "nauc_recall_at_1_diff1": 0.382095,
        "nauc_recall_at_3_max": 0.600075,
        "nauc_recall_at_3_std": -0.468566,
        "nauc_recall_at_3_diff1": 0.161476,
        "nauc_recall_at_5_max": 0.66023,
        "nauc_recall_at_5_std": -0.477445,
        "nauc_recall_at_5_diff1": 0.192907,
        "nauc_recall_at_10_max": 0.742451,
        "nauc_recall_at_10_std": -0.764624,
        "nauc_recall_at_10_diff1": 0.010672,
        "nauc_recall_at_20_max": 0.809144,
        "nauc_recall_at_20_std": -0.655438,
        "nauc_recall_at_20_diff1": 0.125375,
        "nauc_recall_at_100_max": 0.357974,
        "nauc_recall_at_100_std": -0.56147,
        "nauc_recall_at_100_diff1": 0.722475,
        "nauc_recall_at_1000_max": NaN,
        "nauc_recall_at_1000_std": NaN,
        "nauc_recall_at_1000_diff1": NaN,
        "nauc_precision_at_1_max": 0.744741,
        "nauc_precision_at_1_std": -0.355891,
        "nauc_precision_at_1_diff1": 0.382095,
        "nauc_precision_at_3_max": 0.600075,
        "nauc_precision_at_3_std": -0.468566,
        "nauc_precision_at_3_diff1": 0.161476,
        "nauc_precision_at_5_max": 0.66023,
        "nauc_precision_at_5_std": -0.477445,
        "nauc_precision_at_5_diff1": 0.192907,
        "nauc_precision_at_10_max": 0.742451,
        "nauc_precision_at_10_std": -0.764624,
        "nauc_precision_at_10_diff1": 0.010672,
        "nauc_precision_at_20_max": 0.809144,
        "nauc_precision_at_20_std": -0.655438,
        "nauc_precision_at_20_diff1": 0.125375,
        "nauc_precision_at_100_max": 0.357974,
        "nauc_precision_at_100_std": -0.56147,
        "nauc_precision_at_100_diff1": 0.722475,
        "nauc_precision_at_1000_max": 1.0,
        "nauc_precision_at_1000_std": 1.0,
        "nauc_precision_at_1000_diff1": 1.0,
        "nauc_mrr_at_1_max": 0.744741,
        "nauc_mrr_at_1_std": -0.355891,
        "nauc_mrr_at_1_diff1": 0.382095,
        "nauc_mrr_at_3_max": 0.733181,
        "nauc_mrr_at_3_std": -0.334442,
        "nauc_mrr_at_3_diff1": 0.391117,
        "nauc_mrr_at_5_max": 0.740779,
        "nauc_mrr_at_5_std": -0.34711,
        "nauc_mrr_at_5_diff1": 0.394529,
        "nauc_mrr_at_10_max": 0.740943,
        "nauc_mrr_at_10_std": -0.347844,
        "nauc_mrr_at_10_diff1": 0.37658,
        "nauc_mrr_at_20_max": 0.740943,
        "nauc_mrr_at_20_std": -0.347844,
        "nauc_mrr_at_20_diff1": 0.37658,
        "nauc_mrr_at_100_max": 0.739339,
        "nauc_mrr_at_100_std": -0.346724,
        "nauc_mrr_at_100_diff1": 0.380224,
        "nauc_mrr_at_1000_max": 0.739578,
        "nauc_mrr_at_1000_std": -0.34659,
        "nauc_mrr_at_1000_diff1": 0.380009,
        "cv_recall_at_1": 0.86325,
        "cv_recall_at_3": 0.89744,
        "cv_recall_at_5": 0.94872,
        "cv_recall_at_10": 0.97436,
        "cv_recall_at_20": 0.97436,
        "cv_recall_at_100": 0.99145,
        "cv_recall_at_1000": 1.0,
        "main_score": 0.90392,
        "hf_subset": "default",
        "languages": [
          "eng-Latn"
        ]
      }
    ]
  },
  "evaluation_time": 4.418414831161499,
  "kg_co2_emissions": null
}
HateXplainClassification
{
  "dataset_revision": "099763426e1e455de9a0b206768ba17224446408",
  "task_name": "HateXplainClassification",
  "mteb_version": "2.1.4",
  "scores": {
    "test": [
      {
        "scores_per_experiment": [
          {
            "accuracy": 0.615385,
            "f1": 0.606694,
            "f1_weighted": 0.617633,
            "precision": 0.606367,
            "precision_weighted": 0.62168,
            "recall": 0.608888,
            "recall_weighted": 0.615385,
            "ap": 0.654471,
            "ap_weighted": 0.654471,
            "fairness": null
          },
          {
            "accuracy": 0.628378,
            "f1": 0.586697,
            "f1_weighted": 0.611256,
            "precision": 0.608177,
            "precision_weighted": 0.616614,
            "recall": 0.5896,
            "recall_weighted": 0.628378,
            "ap": 0.641136,
            "ap_weighted": 0.641136,
            "fairness": null
          },
          {
            "accuracy": 0.545218,
            "f1": 0.540974,
            "f1_weighted": 0.549233,
            "precision": 0.544733,
            "precision_weighted": 0.562195,
            "recall": 0.546355,
            "recall_weighted": 0.545218,
            "ap": 0.617598,
            "ap_weighted": 0.617598,
            "fairness": null
          },
          {
            "accuracy": 0.609667,
            "f1": 0.605782,
            "f1_weighted": 0.613105,
            "precision": 0.608612,
            "precision_weighted": 0.625885,
            "recall": 0.612538,
            "recall_weighted": 0.609667,
            "ap": 0.657676,
            "ap_weighted": 0.657676,
            "fairness": null
          },
          {
            "accuracy": 0.649168,
            "f1": 0.626803,
            "f1_weighted": 0.643898,
            "precision": 0.633042,
            "precision_weighted": 0.643017,
            "recall": 0.625254,
            "recall_weighted": 0.649168,
            "ap": 0.663438,
            "ap_weighted": 0.663438,
            "fairness": null
          },
          {
            "accuracy": 0.505717,
            "f1": 0.505589,
            "f1_weighted": 0.504098,
            "precision": 0.527563,
            "precision_weighted": 0.546199,
            "recall": 0.527189,
            "recall_weighted": 0.505717,
            "ap": 0.607416,
            "ap_weighted": 0.607416,
            "fairness": null
          },
          {
            "accuracy": 0.56289,
            "f1": 0.56239,
            "f1_weighted": 0.565157,
            "precision": 0.575148,
            "precision_weighted": 0.594334,
            "recall": 0.576761,
            "recall_weighted": 0.56289,
            "ap": 0.635842,
            "ap_weighted": 0.635842,
            "fairness": null
          },
          {
            "accuracy": 0.577963,
            "f1": 0.55768,
            "f1_weighted": 0.575403,
            "precision": 0.558822,
            "precision_weighted": 0.573672,
            "recall": 0.55741,
            "recall_weighted": 0.577963,
            "ap": 0.623338,
            "ap_weighted": 0.623338,
            "fairness": null
          },
          {
            "accuracy": 0.577963,
            "f1": 0.569356,
            "f1_weighted": 0.580747,
            "precision": 0.569653,
            "precision_weighted": 0.585911,
            "recall": 0.571519,
            "recall_weighted": 0.577963,
            "ap": 0.631725,
            "ap_weighted": 0.631725,
            "fairness": null
          },
          {
            "accuracy": 0.655405,
            "f1": 0.63034,
            "f1_weighted": 0.648351,
            "precision": 0.639748,
            "precision_weighted": 0.648528,
            "recall": 0.628694,
            "recall_weighted": 0.655405,
            "ap": 0.66539,
            "ap_weighted": 0.66539,
            "fairness": null
          }
        ],
        "accuracy": 0.592775,
        "f1": 0.579231,
        "f1_weighted": 0.590888,
        "precision": 0.587186,
        "precision_weighted": 0.601803,
        "recall": 0.584421,
        "recall_weighted": 0.592775,
        "ap": 0.639803,
        "ap_weighted": 0.639803,
        "fairness": null,
        "main_score": 0.592775,
        "hf_subset": "default",
        "languages": [
          "eng-Latn"
        ]
      }
    ]
  },
  "evaluation_time": 31.50898313522339,
  "kg_co2_emissions": null
}

NB: HateXplain is implemented as a binary classification task (hatespeech/offensive vs. normal).

upunaprosk commented Dec 1, 2025

Pseudocode for the HateXplain (HX) bias metrics:

import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from sklearn.metrics import roc_auc_score
from datasets import load_dataset

# NB: assumes a flattened frame with post_id, label, and per-annotator
# target1/target2/target3 list columns (the raw HF schema nests these
# under "annotators", so a real run needs a flattening step first).
dataset = load_dataset("hatexplain", split="train")
df = dataset.to_pandas()

# Collapse the 3-way labels to binary: hatespeech/offensive -> 1, normal -> 0.
def label_to_binary(x):
    if isinstance(x, (list, tuple)) and len(x) > 0:
        x = x[0]
    if isinstance(x, str):
        x = x.lower()
        if x in ("hatespeech", "offensive"):
            return 1
        if x in ("normal", "neutral"):
            return 0
    try:
        xi = int(x)
        return 1 if xi > 0 else 0
    except Exception:
        return 0

df["label"] = df["label"].apply(label_to_binary)

# Majority-vote the target communities: keep a group if at least 2 of the
# 3 annotators tagged it; posts with no agreed target get "None".
def generate_target_information(dataset_df):
    final_target_output = defaultdict(list)
    all_communities_selected = []
    for _, row in dataset_df.iterrows():
        all_targets = list(row["target1"]) + list(row["target2"]) + list(row["target3"])
        community_dict = dict(Counter(all_targets))
        for key in community_dict:
            if community_dict[key] > 1:
                final_target_output[row["post_id"]].append(key)
                all_communities_selected.append(key)
        if row["post_id"] not in final_target_output:
            final_target_output[row["post_id"]].append("None")
    return final_target_output, all_communities_selected

target_info, all_comms = generate_target_information(df)
df["targets"] = df["post_id"].apply(lambda pid: target_info[pid])
top5_groups = [g for g, _ in Counter(all_comms).most_common(5)]
# Placeholder scores for illustration only; a real evaluation would use the
# model's predicted probabilities here.
np.random.seed(42)
df["score"] = np.random.rand(len(df))

# Subgroup AUC: performance restricted to examples that mention the subgroup.
def subgroup_auc(y_true, y_score, mask):
    if mask.sum() == 0: return np.nan
    y, s = y_true[mask], y_score[mask]
    if len(np.unique(y)) < 2: return np.nan
    return roc_auc_score(y, s)

# BPSN: background-positive vs. subgroup-negative examples; low values mean
# the subgroup is disproportionately falsely flagged as hateful.
def bpsn_auc(y_true, y_score, mask):
    sg, bg = mask, ~mask
    idx = ((y_true == 1) & bg) | ((y_true == 0) & sg)
    y, s = y_true[idx], y_score[idx]
    if len(np.unique(y)) < 2: return np.nan
    return roc_auc_score(y, s)

# BNSP: background-negative vs. subgroup-positive examples; low values mean
# hateful posts about the subgroup are under-detected.
def bnsp_auc(y_true, y_score, mask):
    sg, bg = mask, ~mask
    idx = ((y_true == 0) & bg) | ((y_true == 1) & sg)
    y, s = y_true[idx], y_score[idx]
    if len(np.unique(y)) < 2: return np.nan
    return roc_auc_score(y, s)

results = []
y_true = df["label"].values
y_score = df["score"].values

for group in top5_groups:
    mask = df["targets"].apply(lambda x: group in x).values
    res = {
        "group": group,
        "Subgroup_AUC": subgroup_auc(y_true, y_score, mask),
        "BPSN_AUC": bpsn_auc(y_true, y_score, mask),
        "BNSP_AUC": bnsp_auc(y_true, y_score, mask),
        "n_samples": int(mask.sum())
    }
    results.append(res)

results_df = pd.DataFrame(results)
print(results_df.round(3))

upunaprosk left a comment

Bias metrics added

HateXplain (Classification)

GMB-Subgroup-AUC

Measures model performance within each identity subgroup and aggregates them using a generalized mean that emphasizes worst-performing groups.

$$ \mathrm{GMB} = \left( \frac{1}{N} \sum_{s=1}^{N} \mathrm{AUC}_s^{\,p} \right)^{\frac{1}{p}}, \quad p = -5 $$

Range & interpretation:

  • $[0.5, 1.0]$
  • 1.0 → fair performance across all subgroups
  • closer to 0.5 → strong subgroup bias
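For reference, a minimal sketch of the GMB aggregation (a generalized power mean), assuming per-subgroup AUCs such as the Subgroup_AUC / BPSN_AUC / BNSP_AUC columns produced by the pseudocode above; generalized_mean_bias is a hypothetical helper, not mteb API:

import numpy as np

def generalized_mean_bias(aucs, p=-5.0):
    # Power mean with negative p: the aggregate is dominated by the
    # worst-performing (lowest-AUC) subgroups.
    vals = np.asarray([a for a in aucs if not np.isnan(a)], dtype=float)
    if vals.size == 0:
        return float("nan")
    return float(np.mean(vals ** p) ** (1.0 / p))

# e.g. gmb_subgroup = generalized_mean_bias(results_df["Subgroup_AUC"])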

GMB-BPSN-AUC (Background Positive, Subgroup Negative)

Measures false-positive bias by comparing non-hateful subgroup examples against hateful background examples.

Range & interpretation:

  • $[0.5, 1.0]$
  • lower values → subgroup more likely to be falsely flagged as hateful

GMB-BNSP-AUC (Background Negative, Subgroup Positive)

Measures false-negative bias by comparing hateful subgroup examples against non-hateful background examples.

Range & interpretation:

  • $[0.5, 1.0]$
  • lower values → hateful texts about subgroups are under-detected

GREP-BiasIR (Retrieval)

Mean Rank Difference (MRD@10)

Average rank difference between masculine and feminine documents.

$$ \mathrm{MRD} = \frac{1}{|Q|} \sum_{q \in Q} \bigl(\mathrm{rank}_{\text{masc}}(q) - \mathrm{rank}_{\text{fem}}(q)\bigr) $$

Range & interpretation:

  • $(-\infty, +\infty)$
  • 0 → no bias
  • < 0 → masculine documents ranked higher
  • > 0 → feminine documents ranked higher

Absolute Mean Rank Difference (|MRD|@10)

Strength of ranking bias regardless of direction.

$$ |\mathrm{MRD}| = \frac{1}{|Q|} \sum_{q \in Q} \bigl|\mathrm{rank}_{\text{masc}}(q) - \mathrm{rank}_{\text{fem}}(q)\bigr| $$

Range & interpretation:

  • $[0, +\infty)$
  • 0 → perfectly balanced rankings
  • Higher values → stronger bias
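A minimal sketch of MRD@10 and |MRD|@10, assuming that for each query we have the (1-based) ranks of the masculine and feminine document versions within the top 10; rank_masc and rank_fem are hypothetical inputs:

import numpy as np

def mean_rank_difference(rank_masc, rank_fem):
    # Signed mean difference (< 0: masculine ranked higher) and its
    # unsigned strength, averaged over queries.
    deltas = np.asarray(rank_masc, dtype=float) - np.asarray(rank_fem, dtype=float)
    return float(deltas.mean()), float(np.abs(deltas).mean())

mrd_at_10, abs_mrd_at_10 = mean_rank_difference([1, 2, 4], [2, 3, 3])  # -> (-0.33..., 1.0)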

Pairwise Win Rate (PWR_masc@10)

Probability that a masculine document is ranked above a feminine document.

$$ \mathrm{PWR}_{\text{masc}} = \Pr\bigl(\mathrm{rank}_{\text{masc}} < \mathrm{rank}_{\text{fem}}\bigr) $$

Range & interpretation:

  • $[0, 1]$
  • 0.5 → fair
  • > 0.5 → masculine advantage
  • < 0.5 → feminine advantage
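A matching sketch with the same hypothetical rank arrays; counting ties as half a win is an assumption here, not something the definition above pins down:

import numpy as np

def pairwise_win_rate_masc(rank_masc, rank_fem):
    # Fraction of queries where the masculine document outranks the
    # feminine one (lower rank = better); ties contribute 0.5.
    rm = np.asarray(rank_masc, dtype=float)
    rf = np.asarray(rank_fem, dtype=float)
    return float(np.mean((rm < rf) + 0.5 * (rm == rf)))

pairwise_win_rate_masc([1, 2, 4], [2, 3, 3])  # -> 0.666...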

Gender Representation Ratio (GR@10)

Proportion of documents of each gender among the top-k ranks; the reported score is the masculine-minus-feminine gap.

$$ \mathrm{GR}_{g@k} = \frac{\#\{\text{docs of gender } g \text{ in top-}k\}}{k}, \qquad \Delta\mathrm{GR}@k = \mathrm{GR}_{\text{masc}@k} - \mathrm{GR}_{\text{fem}@k} $$

Range & interpretation:

  • $[-1, 1]$
  • 0 → balanced exposure
  • Positive → masculine over-representation
  • Negative → feminine over-representation
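A minimal sketch of the per-query representation gap, assuming hypothetical 'masc'/'fem'/'neutral' gender labels for the retrieved documents:

def gender_representation_gap(genders_at_k):
    # GR_masc - GR_fem over the top-k documents of one query, in [-1, 1].
    k = len(genders_at_k)
    gr_masc = sum(g == "masc" for g in genders_at_k) / k
    gr_fem = sum(g == "fem" for g in genders_at_k) / k
    return gr_masc - gr_fem

gender_representation_gap(["masc", "fem", "masc", "neutral"])  # -> 0.25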

Stereotype Amplification (SA@10)

Measures whether stereotypical gender-role associations are ranked more favorably than counter-stereotypical ones.

$$ \mathrm{SA} = \mathrm{MRD}_{\text{stereotypical}} - \mathrm{MRD}_{\text{counter-stereotypical}} $$

Range & interpretation:

  • $(-\infty, +\infty)$
  • 0 → no amplification
  • Positive → stereotype reinforcement
  • Negative → stereotype mitigation

Gender Salience Rank Boost (@10)

Measures whether mentioning gender at all boosts ranking.

$$ \Delta = \mathrm{rank}_{\text{neutral}} - \mathrm{rank}_{\text{gendered}} $$

Range & interpretation:

  • $(-\infty, +\infty)$
  • > 0 → gendered documents ranked higher
  • < 0 → neutral documents preferred
  • 0 → no salience effect
