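"""Scorer for the JDocQA benchmark (Japanese document QA).

Open-ended answers are scored with sentence-level BLEU (MeCab
tokenization); yes/no, factoid, and numerical answers are scored by
normalized substring match.
"""
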
from eval_mm.metrics.scorer import Scorer
from sacrebleu import sentence_bleu
from unicodedata import normalize

ANSWER_TYPE_MAP = {
    "yesno": 0,  # Yes/No questions
    "factoid": 1,  # Factoid questions
    "numerical": 2,  # Numerical questions
    "open-ended": 3,  # Open-ended questions
}

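# Reverse mapping: numeric label -> answer-type name.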
NUM_TO_ANSWER_TYPE = {v: k for k, v in ANSWER_TYPE_MAP.items()}


def jdocqa_normalize(text: str) -> str:
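    """Normalize a Japanese answer for exact matching.

    Strips the copula "です", punctuation, and spaces, then applies
    NFKC Unicode normalization (e.g. full-width -> half-width digits).
    """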
    text = (
        text.replace("です", "")
        .replace("。", "")
        .replace("、", "")
        .replace(" ", "")
        .strip()
    )
    text = normalize("NFKC", text)
    return text


def bleu_ja(refs: list[str], pred: str) -> float:
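    """Sentence-level BLEU for Japanese text via sacrebleu.

    Uses the "ja-mecab" tokenizer, which requires sacrebleu's MeCab
    support (e.g. `pip install sacrebleu[ja]`).
    """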
    bleu_score = sentence_bleu(
        hypothesis=pred,
        references=refs,
        smooth_method="exp",
        smooth_value=0.0,
        tokenize="ja-mecab",
        use_effective_order=False,
        lowercase=False,
    )
    return bleu_score.score


class JDocQAScorer(Scorer):
    @staticmethod
    def score(refs: list[str], preds: list[str], **kwargs) -> list[float]:
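        """Score predictions against references.

        Open-ended answers are scored with sentence-level BLEU;
        yes/no, factoid, and numerical answers score 1 when the
        normalized reference appears as a substring of the normalized
        prediction, else 0. Expects per-example records via
        kwargs["docs"].
        """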
        docs = kwargs["docs"]
        scores = []

        for doc, ref, pred in zip(docs, refs, preds):
            if doc["answer_type"] == ANSWER_TYPE_MAP["open-ended"]:
                scores.append(bleu_ja([ref], pred))
            elif doc["answer_type"] in [
                ANSWER_TYPE_MAP["yesno"],
                ANSWER_TYPE_MAP["factoid"],
                ANSWER_TYPE_MAP["numerical"],
            ]:
                ref = jdocqa_normalize(ref)
                pred = jdocqa_normalize(pred)
                scores.append(1 if ref in pred else 0)
            else:
                raise ValueError(f"Unknown answer type: {doc['answer_type']}")

        return scores

    @staticmethod
    def aggregate(scores: list[float], **kwargs) -> dict:
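        """Aggregate per-example scores into per-answer-type metrics.

        Exact-match types report accuracy; open-ended reports mean
        BLEU. A type with no examples reports 0.0.
        """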
        docs = kwargs["docs"]
        metrics = {
            "yesno_exact": [],
            "factoid_exact": [],
            "numerical_exact": [],
            "open-ended_bleu": [],
        }
        for doc, score in zip(docs, scores):
            answer_type = doc["answer_type"]
            if answer_type == ANSWER_TYPE_MAP["open-ended"]:
                metrics["open-ended_bleu"].append(score)
            else:
                metrics[f"{NUM_TO_ANSWER_TYPE[answer_type]}_exact"].append(score)

        for key, value in metrics.items():
            if len(value) == 0:
                metrics[key] = 0.0
                continue
            metrics[key] = sum(value) / len(value)

        return metrics


if __name__ == "__main__":
    from datasets import load_dataset

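    # Smoke test on a small slice of the JDocQA test split.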
    ds = load_dataset("shunk031/JDocQA", split="test")
    ds = ds.select(range(10))

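    # Sanity check: BLEU of an answer against itself should be very high.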
    ref = ds["answer"][0]
    pred = ds["answer"][0]
    print(ref)
    print(pred)
    print(bleu_ja([ref], pred))
    answer_types = ds["answer_type"]
    answers = ds["answer"]
    print("Original answers")
    for answer_type, answer in zip(answer_types, answers):
        print(NUM_TO_ANSWER_TYPE[answer_type], answer)

    print("JDocQA normalized answers")
    normalized_answers = [jdocqa_normalize(x) for x in ds["answer"]]
    for answer_type, answer in zip(answer_types, normalized_answers):
        print(NUM_TO_ANSWER_TYPE[answer_type], answer)

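    # Gold vs. gold: exact-match types score 1; open-ended types get self-BLEU.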
    scores = JDocQAScorer.score(refs=ds["answer"], preds=ds["answer"], docs=ds)
    print(scores)
    metrics = JDocQAScorer.aggregate(scores, docs=ds)
    print(metrics)