forked from EuroEval/EuroEval
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_exam_et.py
More file actions
138 lines (118 loc) · 4.53 KB
/
create_exam_et.py
File metadata and controls
138 lines (118 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
# "datasets==3.5.0",
# "huggingface-hub==0.24.0",
# "pandas==2.2.0",
# "requests==2.32.3",
# "scikit-learn<1.6.0",
# ]
# ///
"""Create the Exam-et dataset and upload it to the HF Hub."""
from collections import Counter
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import HfApi
from sklearn.model_selection import train_test_split
from .constants import (
CHOICES_MAPPING,
MAX_NUM_CHARS_IN_INSTRUCTION,
MAX_NUM_CHARS_IN_OPTION,
MAX_REPETITIONS,
MIN_NUM_CHARS_IN_INSTRUCTION,
MIN_NUM_CHARS_IN_OPTION,
)
def main() -> None:
"""Create the Exam-et dataset and upload it to the HF Hub."""
repo_id = "TalTechNLP/exam_et"
target_repo_id = "EuroEval/exam-et"
# Get all the subsets of the dataset
api = HfApi(token=True)
repo_info = api.repo_info(repo_id=repo_id, repo_type="dataset")
subsets = [config["config_name"] for config in repo_info.card_data.configs]
# Download all subsets and merge them
rename_mapping = {
"küsimus": "instruction",
"vastusevariandid": "choices",
"vastus": "label",
}
dfs: list[pd.DataFrame] = []
for subset in subsets:
ds = load_dataset(path=repo_id, name=subset, split="train", token=True)
assert isinstance(ds, Dataset), f"Expected Dataset, got {type(ds)}"
df = ds.to_pandas()
assert isinstance(df, pd.DataFrame), f"Expected DataFrame, got {type(df)}"
df.rename(columns=rename_mapping, inplace=True)
df["category"] = subset
df = df.loc[:, ["instruction", "choices", "label", "category"]]
dfs.append(df)
df = pd.concat(dfs, ignore_index=True)
# Remove the samples with overly short or long texts
df = df.loc[
(df.instruction.str.len() >= MIN_NUM_CHARS_IN_INSTRUCTION)
& (df.instruction.str.len() <= MAX_NUM_CHARS_IN_INSTRUCTION)
& df.choices.map(
lambda endings: min(len(ending) for ending in endings)
>= MIN_NUM_CHARS_IN_OPTION
and max(len(ending) for ending in endings) <= MAX_NUM_CHARS_IN_OPTION
)
]
def is_repetitive(text: str) -> bool:
"""Return True if the text is repetitive."""
max_repetitions = max(Counter(text.split()).values())
return max_repetitions > MAX_REPETITIONS
# Remove overly repetitive samples
df = df.loc[
~df.instruction.apply(is_repetitive)
& ~df.choices.map(
lambda endings: any(is_repetitive(ending) for ending in endings)
)
]
# Make a `text` column with all the options in it
label_str = "abcdefghijklmnopqrstuvwxyz"
df["text"] = [
row.instruction.replace("\n", " ").strip()
+ "\n"
+ f"{CHOICES_MAPPING['et']}:\n"
+ "\n".join(
f"{char}. {option.strip()}" for char, option in zip(label_str, row.choices)
)
for _, row in df.iterrows()
]
df["label"] = df.label.map(lambda x: label_str[x])
# Keep only the relevant columns
df = df[["text", "label", "category"]]
train_size = 512
val_size = 64
test_size = 896
assert len(df) == train_size + val_size + test_size, (
f"Expected {train_size + val_size + test_size:,} samples, got {len(df):,}"
)
# Create splits, stratifiying by category
train_df, valtest_df = train_test_split(
df, train_size=train_size, random_state=4242, stratify=df.category
)
val_df, test_df = train_test_split(
valtest_df, test_size=test_size, random_state=4242, stratify=valtest_df.category
)
assert isinstance(train_df, pd.DataFrame), (
f"Expected DataFrame, got {type(train_df)}"
)
assert isinstance(val_df, pd.DataFrame), f"Expected DataFrame, got {type(val_df)}"
assert isinstance(test_df, pd.DataFrame), f"Expected DataFrame, got {type(test_df)}"
# Shuffle the splits
train_df = train_df.sample(frac=1, random_state=4242).reset_index(drop=True)
val_df = val_df.sample(frac=1, random_state=4242).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=4242).reset_index(drop=True)
# Convert to DatasetDict
dataset = DatasetDict(
{
"train": Dataset.from_pandas(train_df),
"val": Dataset.from_pandas(val_df),
"test": Dataset.from_pandas(test_df),
}
)
api.delete_repo(repo_id=target_repo_id, repo_type="dataset", missing_ok=True)
dataset.push_to_hub(target_repo_id, private=True)
if __name__ == "__main__":
main()