Conversation
upunaprosk left a comment

- Added fairness metrics:
  - fairness_bench_retrieval.py
  - fairness_bench_classification.py

Pseudocode for HX metrics:
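A minimal sketch of the three HateXplain (HX) GMB bias AUCs in the Borkan et al. style, assuming binary labels (1 = hateful), per-example scores, and one boolean mask per identity subgroup. The function names, the `p = -5` power, and scikit-learn's `roc_auc_score` are illustrative assumptions, not the PR's actual code:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def gmb(aucs, p=-5.0):
    # Generalized mean with a negative power, so the worst subgroup AUCs
    # dominate the aggregate (p = -5 follows common practice; an assumption here).
    aucs = np.asarray(aucs, dtype=float)
    return float(np.mean(aucs ** p) ** (1.0 / p))

def gmb_aucs(y_true, y_score, subgroup_masks, p=-5.0):
    # y_true: 0/1 labels (1 = hateful); y_score: model scores for the hateful
    # class; subgroup_masks: one boolean array per identity subgroup.
    subgroup, bpsn, bnsp = [], [], []
    for mask in subgroup_masks:
        bg = ~mask
        # Subgroup AUC: performance restricted to examples mentioning the group.
        subgroup.append(roc_auc_score(y_true[mask], y_score[mask]))
        # BPSN: non-hateful subgroup examples vs. hateful background examples;
        # a low AUC means the subgroup attracts false positives.
        sel = (mask & (y_true == 0)) | (bg & (y_true == 1))
        bpsn.append(roc_auc_score(y_true[sel], y_score[sel]))
        # BNSP: hateful subgroup examples vs. non-hateful background examples;
        # a low AUC means the subgroup attracts false negatives.
        sel = (mask & (y_true == 1)) | (bg & (y_true == 0))
        bnsp.append(roc_auc_score(y_true[sel], y_score[sel]))
    return {
        "GMB-Subgroup-AUC": gmb(subgroup, p),
        "GMB-BPSN-AUC": gmb(bpsn, p),
        "GMB-BNSP-AUC": gmb(bnsp, p),
    }
```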
Bias metrics added:

HateXplain (Classification)

| Metric | Measures | Range & interpretation |
| --- | --- | --- |
| GMB-Subgroup-AUC | Model performance within each identity subgroup, aggregated with a generalized mean that emphasizes the worst-performing groups. | 0–1; higher is better, 1 = no subgroup degradation. |
| GMB-BPSN-AUC (Background Positive, Subgroup Negative) | False-positive bias, comparing non-hateful subgroup examples against hateful background examples. | 0–1; low values mean the subgroup attracts false positives. |
| GMB-BNSP-AUC (Background Negative, Subgroup Positive) | False-negative bias, comparing hateful subgroup examples against non-hateful background examples. | 0–1; low values mean the subgroup attracts false negatives. |

GREP-BiasIR (Retrieval) (rank-based metrics sketched after the table)

| Metric | Measures | Range & interpretation |
| --- | --- | --- |
| Mean Rank Difference (MRD@10) | Average rank difference between masculine and feminine documents. | Signed; 0 = no bias, the sign shows which gender is ranked higher. |
| Absolute Mean Rank Difference (\|MRD\|@10) | Strength of ranking bias regardless of direction. | >= 0; 0 = no bias, larger values = stronger bias. |
| Pairwise Win Rate (PWR_masc@10) | Probability that a masculine document is ranked above a feminine document. | 0–1; 0.5 = parity, above 0.5 favors masculine documents. |
| Gender Representation Ratio (GR@10) | Proportion of documents of each gender in the top-k ranks. | 0–1 per gender; 0.5 = balanced representation. |
| Stereotype Amplification (@10) | Whether stereotypical gender-role associations are ranked more favorably than counter-stereotypical ones. | 0 = no amplification; positive values favor stereotypical associations. |
| Gender Salience Rank Boost (@10) | Whether mentioning gender at all boosts ranking. | 0 = no effect; positive values mean gender mentions boost rank. |
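For the rank-based GREP-BiasIR metrics, a hedged sketch of MRD@k, |MRD|@k, and PWR_masc@k, assuming each query's ranked documents arrive as a list of "masc"/"fem" tags; the interface and the sign convention are assumptions, not the PR's actual implementation:

```python
from itertools import product

def rank_bias_at_k(rankings, k=10):
    # rankings: one entry per query; each entry is the ordered retrieved
    # documents, tagged "masc" or "fem" (interface assumed for illustration).
    diffs, wins, pairs = [], 0, 0
    for ranked in rankings:
        topk = ranked[:k]
        masc = [r for r, g in enumerate(topk, start=1) if g == "masc"]
        fem = [r for r, g in enumerate(topk, start=1) if g == "fem"]
        if not (masc and fem):
            continue  # need both genders in the top k to compare ranks
        # MRD: mean masculine rank minus mean feminine rank; with rank 1 best,
        # negative values mean masculine documents are ranked higher
        # (assumed sign convention).
        diffs.append(sum(masc) / len(masc) - sum(fem) / len(fem))
        # PWR_masc: fraction of (masc, fem) pairs where the masculine document
        # outranks the feminine one; 0.5 = parity.
        for m, f in product(masc, fem):
            pairs += 1
            wins += m < f
    n = len(diffs)
    return {
        f"MRD@{k}": sum(diffs) / n if n else 0.0,
        f"|MRD|@{k}": sum(abs(d) for d in diffs) / n if n else 0.0,
        f"PWR_masc@{k}": wins / pairs if pairs else 0.5,
    }
```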
all-MiniLM-L6-v2 results on the tasks are below; first, a rough sketch of how such a run is produced.
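The JSON reports mteb_version 2.1.4; this sketch uses the long-standing `mteb.get_tasks` / `MTEB` entry points and assumes they are still available in v2 (the v2 API may differ), with the model string and output folder chosen for illustration:

```python
import mteb
from sentence_transformers import SentenceTransformer

# Load the evaluated model and the two new fairness tasks.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["GREPBiasIRRetrieval", "HateXplainClassification"])

# Run the evaluation and write per-task result JSON like the dumps below.
evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model, output_folder="results")
```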
GREPBiasIRRetrieval

```json
{ "dataset_revision": "main", "task_name": "GREPBiasIRRetrieval", "mteb_version": "2.1.4", "scores": { "test": [ { "ndcg_at_1": 0.86325, "ndcg_at_3": 0.84964, "ndcg_at_5": 0.87742, "ndcg_at_10": 0.90392, "ndcg_at_20": 0.90819, "ndcg_at_100": 0.91588, "ndcg_at_1000": 0.91755, "map_at_1": 0.28775, "map_at_3": 0.83666, "map_at_5": 0.85802, "map_at_10": 0.87733, "map_at_20": 0.8798, "map_at_100": 0.88148, "map_at_1000": 0.88159, "recall_at_1": 0.28775, "recall_at_3": 0.84615, "recall_at_5": 0.89459, "recall_at_10": 0.95157, "recall_at_20": 0.96296, "recall_at_100": 0.99145, "recall_at_1000": 1.0, "accuracy": 0.28775, "precision_at_1": 0.86325, "precision_at_3": 0.84615, "precision_at_5": 0.53675, "precision_at_10": 0.28547, "precision_at_20": 0.14444, "precision_at_100": 0.02974, "precision_at_1000": 0.003, "mrr_at_1": 0.863248, "mrr_at_3": 0.877493, "mrr_at_5": 0.889031, "mrr_at_10": 0.892532, "mrr_at_20": 0.892532, "mrr_at_100": 0.893177, "mrr_at_1000": 0.893242, "nauc_ndcg_at_1_max": 0.744741, "nauc_ndcg_at_1_std": -0.355891, "nauc_ndcg_at_1_diff1": 0.382095, "nauc_ndcg_at_3_max": 0.630925, "nauc_ndcg_at_3_std": -0.436977, "nauc_ndcg_at_3_diff1": 0.208841, "nauc_ndcg_at_5_max": 0.668813, "nauc_ndcg_at_5_std": -0.436178, "nauc_ndcg_at_5_diff1": 0.233752, "nauc_ndcg_at_10_max": 0.693723, "nauc_ndcg_at_10_std": -0.498494, "nauc_ndcg_at_10_diff1": 0.198619, "nauc_ndcg_at_20_max": 0.699205, "nauc_ndcg_at_20_std": -0.467621, "nauc_ndcg_at_20_diff1": 0.223595, "nauc_ndcg_at_100_max": 0.676311, "nauc_ndcg_at_100_std": -0.442883, "nauc_ndcg_at_100_diff1": 0.247954, "nauc_ndcg_at_1000_max": 0.682979, "nauc_ndcg_at_1000_std": -0.440399, "nauc_ndcg_at_1000_diff1": 0.238014, "nauc_map_at_1_max": 0.744741, "nauc_map_at_1_std": -0.355891, "nauc_map_at_1_diff1": 0.382095, "nauc_map_at_3_max": 0.615442, "nauc_map_at_3_std": -0.447652, "nauc_map_at_3_diff1": 0.166356, "nauc_map_at_5_max": 0.642843, "nauc_map_at_5_std": -0.444281, "nauc_map_at_5_diff1": 0.189366, "nauc_map_at_10_max": 0.660518, "nauc_map_at_10_std": -0.480911, "nauc_map_at_10_diff1": 0.16967, "nauc_map_at_20_max": 0.661908, "nauc_map_at_20_std": -0.468494, "nauc_map_at_20_diff1": 0.181448, "nauc_map_at_100_max": 0.658095, "nauc_map_at_100_std": -0.462513, "nauc_map_at_100_diff1": 0.183387, "nauc_map_at_1000_max": 0.658385, "nauc_map_at_1000_std": -0.462417, "nauc_map_at_1000_diff1": 0.182865, "nauc_recall_at_1_max": 0.744741, "nauc_recall_at_1_std": -0.355891, "nauc_recall_at_1_diff1": 0.382095, "nauc_recall_at_3_max": 0.600075, "nauc_recall_at_3_std": -0.468566, "nauc_recall_at_3_diff1": 0.161476, "nauc_recall_at_5_max": 0.66023, "nauc_recall_at_5_std": -0.477445, "nauc_recall_at_5_diff1": 0.192907, "nauc_recall_at_10_max": 0.742451, "nauc_recall_at_10_std": -0.764624, "nauc_recall_at_10_diff1": 0.010672, "nauc_recall_at_20_max": 0.809144, "nauc_recall_at_20_std": -0.655438, "nauc_recall_at_20_diff1": 0.125375, "nauc_recall_at_100_max": 0.357974, "nauc_recall_at_100_std": -0.56147, "nauc_recall_at_100_diff1": 0.722475, "nauc_recall_at_1000_max": NaN, "nauc_recall_at_1000_std": NaN, "nauc_recall_at_1000_diff1": NaN, "nauc_precision_at_1_max": 0.744741, "nauc_precision_at_1_std": -0.355891, "nauc_precision_at_1_diff1": 0.382095, "nauc_precision_at_3_max": 0.600075, "nauc_precision_at_3_std": -0.468566, "nauc_precision_at_3_diff1": 0.161476, "nauc_precision_at_5_max": 0.66023, "nauc_precision_at_5_std": -0.477445, "nauc_precision_at_5_diff1": 0.192907, "nauc_precision_at_10_max": 0.742451, "nauc_precision_at_10_std": -0.764624, 
"nauc_precision_at_10_diff1": 0.010672, "nauc_precision_at_20_max": 0.809144, "nauc_precision_at_20_std": -0.655438, "nauc_precision_at_20_diff1": 0.125375, "nauc_precision_at_100_max": 0.357974, "nauc_precision_at_100_std": -0.56147, "nauc_precision_at_100_diff1": 0.722475, "nauc_precision_at_1000_max": 1.0, "nauc_precision_at_1000_std": 1.0, "nauc_precision_at_1000_diff1": 1.0, "nauc_mrr_at_1_max": 0.744741, "nauc_mrr_at_1_std": -0.355891, "nauc_mrr_at_1_diff1": 0.382095, "nauc_mrr_at_3_max": 0.733181, "nauc_mrr_at_3_std": -0.334442, "nauc_mrr_at_3_diff1": 0.391117, "nauc_mrr_at_5_max": 0.740779, "nauc_mrr_at_5_std": -0.34711, "nauc_mrr_at_5_diff1": 0.394529, "nauc_mrr_at_10_max": 0.740943, "nauc_mrr_at_10_std": -0.347844, "nauc_mrr_at_10_diff1": 0.37658, "nauc_mrr_at_20_max": 0.740943, "nauc_mrr_at_20_std": -0.347844, "nauc_mrr_at_20_diff1": 0.37658, "nauc_mrr_at_100_max": 0.739339, "nauc_mrr_at_100_std": -0.346724, "nauc_mrr_at_100_diff1": 0.380224, "nauc_mrr_at_1000_max": 0.739578, "nauc_mrr_at_1000_std": -0.34659, "nauc_mrr_at_1000_diff1": 0.380009, "cv_recall_at_1": 0.86325, "cv_recall_at_3": 0.89744, "cv_recall_at_5": 0.94872, "cv_recall_at_10": 0.97436, "cv_recall_at_20": 0.97436, "cv_recall_at_100": 0.99145, "cv_recall_at_1000": 1.0, "main_score": 0.90392, "hf_subset": "default", "languages": [ "eng-Latn" ] } ] }, "evaluation_time": 4.418414831161499, "kg_co2_emissions": null }HateXplainClassification
{ "dataset_revision": "099763426e1e455de9a0b206768ba17224446408", "task_name": "HateXplainClassification", "mteb_version": "2.1.4", "scores": { "test": [ { "scores_per_experiment": [ { "accuracy": 0.615385, "f1": 0.606694, "f1_weighted": 0.617633, "precision": 0.606367, "precision_weighted": 0.62168, "recall": 0.608888, "recall_weighted": 0.615385, "ap": 0.654471, "ap_weighted": 0.654471, "fairness": null }, { "accuracy": 0.628378, "f1": 0.586697, "f1_weighted": 0.611256, "precision": 0.608177, "precision_weighted": 0.616614, "recall": 0.5896, "recall_weighted": 0.628378, "ap": 0.641136, "ap_weighted": 0.641136, "fairness": null }, { "accuracy": 0.545218, "f1": 0.540974, "f1_weighted": 0.549233, "precision": 0.544733, "precision_weighted": 0.562195, "recall": 0.546355, "recall_weighted": 0.545218, "ap": 0.617598, "ap_weighted": 0.617598, "fairness": null }, { "accuracy": 0.609667, "f1": 0.605782, "f1_weighted": 0.613105, "precision": 0.608612, "precision_weighted": 0.625885, "recall": 0.612538, "recall_weighted": 0.609667, "ap": 0.657676, "ap_weighted": 0.657676, "fairness": null }, { "accuracy": 0.649168, "f1": 0.626803, "f1_weighted": 0.643898, "precision": 0.633042, "precision_weighted": 0.643017, "recall": 0.625254, "recall_weighted": 0.649168, "ap": 0.663438, "ap_weighted": 0.663438, "fairness": null }, { "accuracy": 0.505717, "f1": 0.505589, "f1_weighted": 0.504098, "precision": 0.527563, "precision_weighted": 0.546199, "recall": 0.527189, "recall_weighted": 0.505717, "ap": 0.607416, "ap_weighted": 0.607416, "fairness": null }, { "accuracy": 0.56289, "f1": 0.56239, "f1_weighted": 0.565157, "precision": 0.575148, "precision_weighted": 0.594334, "recall": 0.576761, "recall_weighted": 0.56289, "ap": 0.635842, "ap_weighted": 0.635842, "fairness": null }, { "accuracy": 0.577963, "f1": 0.55768, "f1_weighted": 0.575403, "precision": 0.558822, "precision_weighted": 0.573672, "recall": 0.55741, "recall_weighted": 0.577963, "ap": 0.623338, "ap_weighted": 0.623338, "fairness": null }, { "accuracy": 0.577963, "f1": 0.569356, "f1_weighted": 0.580747, "precision": 0.569653, "precision_weighted": 0.585911, "recall": 0.571519, "recall_weighted": 0.577963, "ap": 0.631725, "ap_weighted": 0.631725, "fairness": null }, { "accuracy": 0.655405, "f1": 0.63034, "f1_weighted": 0.648351, "precision": 0.639748, "precision_weighted": 0.648528, "recall": 0.628694, "recall_weighted": 0.655405, "ap": 0.66539, "ap_weighted": 0.66539, "fairness": null } ], "accuracy": 0.592775, "f1": 0.579231, "f1_weighted": 0.590888, "precision": 0.587186, "precision_weighted": 0.601803, "recall": 0.584421, "recall_weighted": 0.592775, "ap": 0.639803, "ap_weighted": 0.639803, "fairness": null, "main_score": 0.592775, "hf_subset": "default", "languages": [ "eng-Latn" ] } ] }, "evaluation_time": 31.50898313522339, "kg_co2_emissions": null }NB: Hatexplain is implemented as a binary classification task