forked from EuroEval/EuroEval
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: create_lt_history.py
More file actions
158 lines (132 loc) · 4.97 KB
/
create_lt_history.py
File metadata and controls
158 lines (132 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
# "datasets==3.5.0",
# "huggingface-hub==0.24.0",
# "pandas==2.2.0",
# "requests==2.32.3",
# "scikit-learn<1.6.0",
# ]
# ///
"""Create the LT-History knowledge dataset and upload it to the HF Hub."""
from collections import Counter
import pandas as pd
import requests
from datasets import Dataset, DatasetDict, Split
from huggingface_hub import HfApi
from .constants import (
CHOICES_MAPPING,
MAX_NUM_CHARS_IN_INSTRUCTION,
MAX_NUM_CHARS_IN_OPTION,
MAX_REPETITIONS,
MIN_NUM_CHARS_IN_INSTRUCTION,
MIN_NUM_CHARS_IN_OPTION,
)
def main() -> None:
    """Create the Lithuanian History knowledge dataset and upload it to the HF Hub.

    Downloads the raw multiple-choice JSON data from GitHub, filters out
    malformed, overly short/long and repetitive samples, formats each sample
    in the EuroEval ``text``/``label`` format, splits the data into
    train/val/test, and pushes the result to the Hugging Face Hub.
    """
    # Download the JSON data from GitHub. The explicit timeout prevents the
    # script from hanging indefinitely if the host is unresponsive.
    url = (
        "https://raw.githubusercontent.com/OpenBabylon/NoDaLiDa2025-LT-History-Eval"
        "/refs/heads/main/lit_data.json"
    )
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data = response.json()

    # Convert the raw records to a flat DataFrame, one row per question
    rows = []
    for item in data:
        # Extract question stem
        question_stem = item["question"]["stem"]

        # Extract choices, indexed by label; missing options default to ""
        choices = item["question"]["choices"]
        choice_texts = {choice["label"]: choice["text"] for choice in choices}

        # Get answer key and convert to lowercase
        answer_key = item["answerKey"].lower()

        # Create row
        row = {
            "id": item["id"],
            "instruction": question_stem,
            "option_a": choice_texts.get("A", ""),
            "option_b": choice_texts.get("B", ""),
            "option_c": choice_texts.get("C", ""),
            "option_d": choice_texts.get("D", ""),
            "label": answer_key,
        }
        rows.append(row)
    df = pd.DataFrame(rows)

    # Remove samples with null values in any option columns
    df = df[
        df["option_a"].notnull()
        & df["option_b"].notnull()
        & df["option_c"].notnull()
        & df["option_d"].notnull()
    ]

    # Remove samples with overly short or long texts
    df = df[
        (df.instruction.str.len() >= MIN_NUM_CHARS_IN_INSTRUCTION)
        & (df.instruction.str.len() <= MAX_NUM_CHARS_IN_INSTRUCTION)
        & (df.option_a.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_a.str.len() <= MAX_NUM_CHARS_IN_OPTION)
        & (df.option_b.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_b.str.len() <= MAX_NUM_CHARS_IN_OPTION)
        & (df.option_c.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_c.str.len() <= MAX_NUM_CHARS_IN_OPTION)
        & (df.option_d.str.len() >= MIN_NUM_CHARS_IN_OPTION)
        & (df.option_d.str.len() <= MAX_NUM_CHARS_IN_OPTION)
    ]
    assert isinstance(df, pd.DataFrame)

    def is_repetitive(text: str) -> bool:
        """Return True if any word in `text` repeats more than MAX_REPETITIONS times."""
        # `default=0` guards against empty/whitespace-only text, where
        # `text.split()` yields no words and a bare `max()` would raise
        max_repetitions = max(Counter(text.split()).values(), default=0)
        return max_repetitions > MAX_REPETITIONS

    # Remove overly repetitive samples
    df = df[
        ~df.instruction.apply(is_repetitive)
        & ~df.option_a.apply(is_repetitive)
        & ~df.option_b.apply(is_repetitive)
        & ~df.option_c.apply(is_repetitive)
        & ~df.option_d.apply(is_repetitive)
    ]

    # Create the `text` column with all options formatted
    df["text"] = [
        row.instruction.replace("\n", " ").strip() + "\n"
        f"{CHOICES_MAPPING['lt']}:\n"
        "a. " + row.option_a.replace("\n", " ").strip() + "\n"
        "b. " + row.option_b.replace("\n", " ").strip() + "\n"
        "c. " + row.option_c.replace("\n", " ").strip() + "\n"
        "d. " + row.option_d.replace("\n", " ").strip()
        for _, row in df.iterrows()
    ]

    # Keep only the required columns for EuroEval format
    df = df[["text", "label"]]
    assert isinstance(df, pd.DataFrame)

    # Remove duplicates
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Create train/val/test splits; whatever remains after sampling the
    # train and validation sets becomes the test set
    train_size = 64
    val_size = 32
    train_df = df.sample(n=train_size, random_state=42)
    df.drop(index=train_df.index.tolist(), inplace=True)
    val_df = df.sample(n=val_size, random_state=42)
    test_df = df.drop(index=val_df.index.tolist())

    # Reset the index of each split
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Collect the splits in a dataset dictionary
    dataset = DatasetDict(
        {
            "train": Dataset.from_pandas(train_df, split=Split.TRAIN),
            "val": Dataset.from_pandas(val_df, split=Split.VALIDATION),
            "test": Dataset.from_pandas(test_df, split=Split.TEST),
        }
    )

    # Create dataset ID
    dataset_id = "EuroEval/lt-history"

    # Remove the dataset from the Hugging Face Hub if it already exists, so
    # the push below starts from a clean slate
    HfApi().delete_repo(dataset_id, repo_type="dataset", missing_ok=True)

    # Push the dataset to the Hugging Face Hub
    dataset.push_to_hub(dataset_id, private=True)
# Script entry point: run the dataset creation when executed directly
if __name__ == "__main__":
    main()