forked from EuroEval/EuroEval
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: create_lt_history.py
More file actions
158 lines (132 loc) · 4.97 KB
/
create_lt_history.py
File metadata and controls
158 lines (132 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
# "datasets==3.5.0",
# "huggingface-hub==0.24.0",
# "pandas==2.2.0",
# "requests==2.32.3",
# "scikit-learn<1.6.0",
# ]
# ///
"""Create the LT-History knowledge dataset and upload it to the HF Hub."""
from collections import Counter
import pandas as pd
import requests
from datasets import Dataset, DatasetDict, Split
from huggingface_hub import HfApi
from .constants import (
CHOICES_MAPPING,
MAX_NUM_CHARS_IN_INSTRUCTION,
MAX_NUM_CHARS_IN_OPTION,
MAX_REPETITIONS,
MIN_NUM_CHARS_IN_INSTRUCTION,
MIN_NUM_CHARS_IN_OPTION,
)
def main() -> None:
    """Create the Lithuanian History knowledge dataset and upload it to the HF Hub.

    Downloads the raw multiple-choice JSON data from GitHub, filters out
    malformed, overly short/long and repetitive samples, formats each sample
    in the EuroEval ``text``/``label`` format, splits the data into
    train/val/test, and pushes the result to the Hugging Face Hub.
    """
    # Download the JSON data from GitHub. The explicit timeout prevents the
    # script from hanging indefinitely if the host is unresponsive.
    url = (
        "https://raw.githubusercontent.com/OpenBabylon/NoDaLiDa2025-LT-History-Eval"
        "/refs/heads/main/lit_data.json"
    )
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data = response.json()

    # Convert the raw records to a flat DataFrame, one row per question
    rows = []
    for item in data:
        # Extract question stem
        question_stem = item["question"]["stem"]

        # Extract choices, indexed by label; missing options default to ""
        choices = item["question"]["choices"]
        choice_texts = {choice["label"]: choice["text"] for choice in choices}

        # Get answer key and convert to lowercase
        answer_key = item["answerKey"].lower()

        # Create row
        row = {
            "id": item["id"],
            "instruction": question_stem,
            "option_a": choice_texts.get("A", ""),
            "option_b": choice_texts.get("B", ""),
            "option_c": choice_texts.get("C", ""),
            "option_d": choice_texts.get("D", ""),
            "label": answer_key,
        }
        rows.append(row)
    df = pd.DataFrame(rows)

    # Remove samples with null values in any option columns
    df = df[
        df["option_a"].notnull()
        & df["option_b"].notnull()
        & df["option_c"].notnull()
        & df["option_d"].notnull()
    ]

    # Remove samples with overly short or long texts
    df = df[
        (df.instruction.str.len() >= MIN_NUM_CHARS_IN_INSTRUCTION)
        & (df.instruction.str.len() <= MAX_NUM_CHARS_IN_INSTRUCTION)
        & (df.option_a.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_a.str.len() <= MAX_NUM_CHARS_IN_OPTION)
        & (df.option_b.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_b.str.len() <= MAX_NUM_CHARS_IN_OPTION)
        & (df.option_c.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_c.str.len() <= MAX_NUM_CHARS_IN_OPTION)
        & (df.option_d.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_d.str.len() <= MAX_NUM_CHARS_IN_OPTION)
    ]
    assert isinstance(df, pd.DataFrame)

    def is_repetitive(text: str) -> bool:
        """Return True if any word in `text` repeats more than MAX_REPETITIONS times."""
        # `default=0` guards against empty/whitespace-only text, where
        # `text.split()` yields no words and a bare `max()` would raise
        max_repetitions = max(Counter(text.split()).values(), default=0)
        return max_repetitions > MAX_REPETITIONS

    # Remove overly repetitive samples
    df = df[
        ~df.instruction.apply(is_repetitive)
        & ~df.option_a.apply(is_repetitive)
        & ~df.option_b.apply(is_repetitive)
        & ~df.option_c.apply(is_repetitive)
        & ~df.option_d.apply(is_repetitive)
    ]

    # Create the `text` column with all options formatted
    df["text"] = [
        row.instruction.replace("\n", " ").strip() + "\n"
        f"{CHOICES_MAPPING['lt']}:\n"
        "a. " + row.option_a.replace("\n", " ").strip() + "\n"
        "b. " + row.option_b.replace("\n", " ").strip() + "\n"
        "c. " + row.option_c.replace("\n", " ").strip() + "\n"
        "d. " + row.option_d.replace("\n", " ").strip()
        for _, row in df.iterrows()
    ]

    # Keep only the required columns for EuroEval format
    df = df[["text", "label"]]
    assert isinstance(df, pd.DataFrame)

    # Remove duplicates
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Create train/val/test splits; whatever remains after sampling the
    # train and validation sets becomes the test set
    train_size = 64
    val_size = 32
    train_df = df.sample(n=train_size, random_state=42)
    df.drop(index=train_df.index.tolist(), inplace=True)
    val_df = df.sample(n=val_size, random_state=42)
    test_df = df.drop(index=val_df.index.tolist())

    # Reset the index of each split
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Collect the splits in a dataset dictionary
    dataset = DatasetDict(
        {
            "train": Dataset.from_pandas(train_df, split=Split.TRAIN),
            "val": Dataset.from_pandas(val_df, split=Split.VALIDATION),
            "test": Dataset.from_pandas(test_df, split=Split.TEST),
        }
    )

    # Create dataset ID
    dataset_id = "EuroEval/lt-history"

    # Remove the dataset from the Hugging Face Hub if it already exists, so
    # the push below starts from a clean slate
    HfApi().delete_repo(dataset_id, repo_type="dataset", missing_ok=True)

    # Push the dataset to the Hugging Face Hub
    dataset.push_to_hub(dataset_id, private=True)
# Script entry point: run the dataset creation when executed directly
if __name__ == "__main__":
    main()