create_exams_bg.py

# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
# "datasets==3.5.0",
# "huggingface-hub==0.24.0",
# "pandas==2.2.0",
# "requests==2.32.3",
# ]
# ///
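
# The block above is PEP 723 inline script metadata; a runner that supports it,
# such as `uv`, can execute the script together with its pinned dependencies:
#
#     uv run create_exams_bg.py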
"""Create the Bulgarian Exams knowledge dataset and upload to HF Hub."""
import json
import tarfile
from pathlib import Path
from tempfile import TemporaryDirectory
import pandas as pd
import requests
from datasets import Dataset, DatasetDict, Split
from huggingface_hub import HfApi
from .constants import CHOICES_MAPPING # noqa


def main() -> None:
    """Create the Bulgarian Exams knowledge dataset and upload to HF Hub."""
    # URL to the tar.gz file
    url = "https://github.com/bgGLUE/bgglue/raw/refs/heads/main/data/exams.tar.gz"

    # Download and extract the archive, then load the raw splits while the
    # temporary directory still exists
    with TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        data_dir = download_dataset(url=url, temp_path=temp_path)
        train_df = load_split(file_path=data_dir / "train.jsonl")
        val_df = load_split(file_path=data_dir / "dev.jsonl")
        test_df = load_split(file_path=data_dir / "test.jsonl")

    train_df = process_split(df=train_df)
    val_df = process_split(df=val_df)
    test_df = process_split(df=test_df)

    final_train_df, final_val_df, final_test_df = create_splits(
        train_df=train_df, val_df=val_df, test_df=test_df
    )

    dataset = DatasetDict(
        {
            "train": Dataset.from_pandas(final_train_df, split=Split.TRAIN),
            "val": Dataset.from_pandas(final_val_df, split=Split.VALIDATION),
            "test": Dataset.from_pandas(final_test_df, split=Split.TEST),
        }
    )

    # Recreate the Hub repository from scratch and upload the dataset
    dataset_id = "EuroEval/exams-bg-mini"
    HfApi().delete_repo(dataset_id, repo_type="dataset", missing_ok=True)
    dataset.push_to_hub(dataset_id, private=True)


def download_dataset(url: str, temp_path: Path) -> Path:
    """Download the dataset.

    Args:
        url: URL to the tar.gz file.
        temp_path: Path to the temporary directory.

    Returns:
        Path to the data directory.
    """
    # Download the archive
    tar_path = temp_path / "exams.tar.gz"
    response = requests.get(url)
    response.raise_for_status()
    with open(tar_path, "wb") as f:
        f.write(response.content)

    # Extract the archive and locate the directory containing the JSONL splits,
    # without assuming a particular layout inside the archive
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(temp_path)
    train_files = list(temp_path.glob("**/train.jsonl"))
    if not train_files:
        raise FileNotFoundError("Could not find train.jsonl in extracted files")
    return train_files[0].parent


def load_split(file_path: Path) -> pd.DataFrame:
    """Load a JSONL file into a pandas DataFrame.

    Args:
        file_path: Path to the JSONL file.

    Returns:
        A DataFrame with the data.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame(data)
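

# Each JSONL record is expected to look roughly like the sketch below (field
# names inferred from how `process_split` reads them; real records may carry
# additional fields):
#
#     {
#         "question": {
#             "stem": "...",
#             "choices": [
#                 {"label": "A", "text": "..."},
#                 {"label": "B", "text": "..."},
#             ],
#         },
#         "answerKey": "A",
#     }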


def process_split(df: pd.DataFrame) -> pd.DataFrame:
    """Process a split of the dataset.

    Args:
        df: The dataframe to process.

    Returns:
        The processed dataframe.
    """
    texts = []
    labels = []
    for _, row in df.iterrows():
        # Extract question stem
        question_stem = row["question"]["stem"]

        # Extract choices, keeping only the samples with exactly 4 choices
        choices = row["question"]["choices"]
        if len(choices) != 4:
            continue

        # Sort choices by label to ensure consistent order (A, B, C, D)
        sorted_choices = sorted(choices, key=lambda x: x["label"])

        # Build the text with choices
        choice_lines: list[str] = []
        for choice in sorted_choices:
            choice_label = choice["label"].lower()
            choice_text = choice["text"]
            choice_lines.append(f"{choice_label}. {choice_text}")

        # Get the Bulgarian word for "Choices"
        choices_text = CHOICES_MAPPING.get("bg", "Възможности")

        # Construct the full text
        text = f"{question_stem}\n{choices_text}:\n" + "\n".join(choice_lines)

        # Get the correct answer label (lowercase)
        label = row["answerKey"].lower()

        texts.append(text)
        labels.append(label)

    result_df = pd.DataFrame({"text": texts, "label": labels})

    # Drop duplicates based on text
    result_df = result_df.drop_duplicates(subset="text").reset_index(drop=True)

    return result_df
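

# After processing, each sample's `text` has the shape below (placeholders, not
# real data; the header is whatever CHOICES_MAPPING holds for "bg", falling
# back to "Възможности"), and `label` is the lowercase letter of the correct
# choice:
#
#     <question stem>
#     Възможности:
#     a. <choice text>
#     b. <choice text>
#     c. <choice text>
#     d. <choice text>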


def create_splits(
    train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create splits for the dataset.

    Args:
        train_df: The training dataframe.
        val_df: The validation dataframe.
        test_df: The test dataframe.

    Returns:
        The final training, validation, and test dataframes.
    """
    train_size = 1024
    test_size = 2048

    # Sample the final training split and keep the leftover training samples
    final_train_df = train_df.sample(n=train_size, random_state=4242)
    remaining_train_df = train_df.drop(final_train_df.index.tolist())

    # Pad the test split with the leftover training samples, then top it up
    # with validation samples until it reaches the desired size
    test_df_with_remaining_train_samples = pd.concat(
        [test_df, remaining_train_df], ignore_index=True
    )
    n_missing_samples = test_size - len(test_df_with_remaining_train_samples)
    additional_val_samples = val_df.sample(n=n_missing_samples, random_state=4242)
    final_test_df = pd.concat(
        [test_df_with_remaining_train_samples, additional_val_samples],
        ignore_index=True,
    )

    # The validation split is whatever remains after topping up the test split
    final_val_df = val_df.drop(additional_val_samples.index.tolist())

    final_train_df = final_train_df.reset_index(drop=True)
    final_val_df = final_val_df.reset_index(drop=True)
    final_test_df = final_test_df.reset_index(drop=True)

    return final_train_df, final_val_df, final_test_df
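

# Size note: the final train split has exactly 1,024 rows and the final test
# split exactly 2,048; the validation split is simply what remains of the dev
# set. This implicitly assumes the source test set plus the leftover train
# samples stay below 2,048 rows, since a negative `n_missing_samples` would
# make `val_df.sample` raise a ValueError.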


if __name__ == "__main__":
    main()