7-romans/data.py at main · obtic-sorbonne/7-romans · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from collections import defaultdict
from typing import Literal, Tuple, List, Dict
import pandas as pd
from datasets import Dataset
from more_itertools import flatten
from renard.ner_utils import hgdataset_from_conll2002, ner_entities
from renard.pipeline.core import Mention
from renard.pipeline.character_unification import Character

# Titles of the novels that can be loaded. Each title doubles as the
# stem of the novel's data files: load_novel reads
# ./ner/<title>.conll and ./alias-resolution/<title>.csv
NovelTitle = Literal[
    "BelAmi",
    "EugénieGrandet",
    "Germinal",
    "LeRougeEtLeNoir",
    "LesTroisMousquetaires",
    "MadameBovary",
    "NotreDameDeParis",
]

# NER tags in string form: a B- (begin) and I- (inside) variant for
# each of the LOC, MISC, ORG and PER entity types, plus "O" for
# tokens outside of any entity
NERLabel = Literal[
    "B-LOC", "B-MISC", "B-ORG", "B-PER", "I-LOC", "I-MISC", "I-ORG", "I-PER", "O"
]

# Mapping from the integer label ids found in the "labels" column of
# a NER dataset to their string tag. The -100 padding id has no
# entry here, which is why instances_nb looks ids up with 'get'.
# NOTE(review): ids are assigned in alphabetical order of the tags;
# presumably this matches the label encoding produced by
# hgdataset_from_conll2002 for every novel -- confirm
NER_ID2LABEL: Dict[int, NERLabel] = {
    0: "B-LOC",
    1: "B-MISC",
    2: "B-ORG",
    3: "B-PER",
    4: "I-LOC",
    5: "I-MISC",
    6: "I-ORG",
    7: "I-PER",
    8: "O",
}


def instances_nb(ner_dataset: Dataset, ner_label: NERLabel) -> int:
    """Count the labels of a NER dataset that are equal to a given tag

    :param ner_dataset: dataset whose ``"labels"`` column holds, for
        each example, a sequence of integer label ids
    :param ner_label: the tag to count

    :return: number of label ids that map to ``ner_label``
    """
    # ids missing from NER_ID2LABEL (such as the -100 padding label)
    # resolve to None through 'get', and therefore never match
    return sum(
        1
        for example_labels in ner_dataset["labels"]
        for label_id in example_labels
        if NER_ID2LABEL.get(label_id) == ner_label
    )


def load_novel(novel_name: NovelTitle) -> Tuple[Dataset, List[Character]]:
    """Load the gold NER and character unification data of a novel

    The NER annotations are read from ``./ner/{novel_name}.conll``
    and the alias table from
    ``./alias-resolution/{novel_name}.csv``.

    :param novel_name: name of the novel to load

    :return: ``(ner_dataset, characters)``: the NER Huggingface
        Dataset, and one character per distinct ``Entity`` value of
        the alias table.  Each character carries the set of its
        ``Form`` values as names, and the PER entities whose
        space-joined tokens are one of these names as mentions.
    """
    # HACK: paragraphs are cut in chunks so that they fit into 512
    # wordpiece tokens. Experimentally, the original text was noted
    # to be 0.77 smaller than the wordpiece-tokenized text, hence
    # the limit of 370 tokens (a bit less than 0.77 * 512)
    conll_path = f"./ner/{novel_name}.conll"
    ner_dataset = hgdataset_from_conll2002(
        conll_path, separator=" ", max_sent_len=370
    )

    # tokens and labels of all examples are flattened into a single
    # sequence before entity extraction
    # NOTE(review): entity indices are presumably relative to that
    # flattened sequence -- confirm against ner_entities
    all_tokens = list(flatten(ner_dataset["tokens"]))
    all_tags = [NER_ID2LABEL[label_id] for label_id in flatten(ner_dataset["labels"])]
    per_entities = [
        entity for entity in ner_entities(all_tokens, all_tags) if entity.tag == "PER"
    ]

    # group the forms of the alias table by the entity they refer to
    alias_df = pd.read_csv(f"./alias-resolution/{novel_name}.csv")
    forms_by_entity = defaultdict(set)
    for _, alias_row in alias_df.iterrows():
        entity_id = alias_row["Entity"]
        # rows whose entity is "?" do not contribute to any character
        if entity_id != "?":
            forms_by_entity[entity_id].add(alias_row["Form"])

    # the surface form of an entity does not depend on the character
    # it is compared to: join its tokens only once
    surface_forms = [" ".join(entity.tokens) for entity in per_entities]

    characters = []
    for names in forms_by_entity.values():
        mentions = [
            Mention(entity.tokens, entity.start_idx, entity.end_idx)
            for entity, form in zip(per_entities, surface_forms)
            if form in names
        ]
        characters.append(Character(frozenset(names), mentions))

    return (ner_dataset, characters)