forked from EuroEval/EuroEval
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_danske_talemaader.py
More file actions
117 lines (98 loc) · 3.73 KB
/
create_danske_talemaader.py
File metadata and controls
117 lines (98 loc) · 3.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
# "datasets==3.5.0",
# "huggingface-hub==0.24.0",
# "pandas==2.2.0",
# "requests==2.32.3",
# "scikit-learn<1.6.0",
# ]
# ///
"""Create the the Danske Talemåder dataset and upload it to the HF Hub."""
import io
from zipfile import ZipFile
import pandas as pd
import requests as rq
from datasets import Dataset, DatasetDict, Split
from huggingface_hub import HfApi
from sklearn.model_selection import train_test_split
from .constants import CHOICES_MAPPING
def main() -> None:
    """Create the Danske Talemåder dataset and upload it to the HF Hub.

    Downloads the official ZIP archive from digst.govcloud.dk, builds a
    multiple-choice ``text``/``label`` dataframe, splits it into fixed-size
    train/val/test splits, and pushes the result to the
    ``EuroEval/danske-talemaader`` repository on the Hugging Face Hub.
    """
    # Download the ZIP archive containing the two tab-separated CSV files.
    url = (
        "https://sprogtek-ressources.digst.govcloud.dk/1000%20danske%20talemaader"
        "%20og%20faste%20udtryk/talemaader_csv.zip"
    )
    response = rq.get(url=url)
    response.raise_for_status()

    # Read the two members directly by name: `ZipFile.read` raises a clear
    # KeyError naming the missing member, unlike the previous `namelist()`
    # scan + `[0]` indexing, which failed with an opaque IndexError.
    with ZipFile(file=io.BytesIO(initial_bytes=response.content)) as zip_file:
        no_labels_csv_file = zip_file.read(
            name="talemaader_leverance_2_uden_labels.csv"
        )
        only_labels_csv_file = zip_file.read(
            name="talemaader_leverance_2_kun_labels.csv"
        )

    no_labels_df = pd.read_csv(
        filepath_or_buffer=io.BytesIO(initial_bytes=no_labels_csv_file),
        delimiter="\t",
    )
    only_labels_df = pd.read_csv(
        filepath_or_buffer=io.BytesIO(initial_bytes=only_labels_csv_file),
        delimiter="\t",
    )

    # Join the questions with their gold labels on the shared column(s).
    df = pd.merge(left=no_labels_df, right=only_labels_df)

    # Build the multiple-choice prompt text for every row.
    # NOTE(review): `CHOICES_MAPPING` comes from the top-of-file relative
    # import `from .constants import ...`, which fails when this PEP 723
    # script is run standalone — verify it should be `from constants import`.
    df["text"] = [
        "Hvad betyder udtrykket '"
        + row.talemaade_udtryk.replace("\n", " ").strip()
        + "'?\n"
        f"{CHOICES_MAPPING['da']}:\n"
        "a. " + row.A.replace("\n", " ").strip() + "\n"
        "b. " + row.B.replace("\n", " ").strip() + "\n"
        "c. " + row.C.replace("\n", " ").strip() + "\n"
        "d. " + row.D.replace("\n", " ").strip()
        for _, row in df.iterrows()
    ]
    # Map the gold answer index (0-3) to its option letter.
    df["label"] = df.korrekt_def.map({0: "a", 1: "b", 2: "c", 3: "d"})
    df = df[["text", "label"]]

    # Remove duplicate samples and re-index.
    df = df.drop_duplicates().reset_index(drop=True)

    # Carve out a fixed-size validation split, then a train/test split from
    # the remainder. `train_test_split` on a DataFrame returns DataFrames,
    # so the previous re-wrapping in `pd.DataFrame(..., columns=...)` was
    # redundant and has been dropped.
    val_size = 64
    traintest_df, val_df = train_test_split(
        df, test_size=val_size, random_state=4242
    )
    train_size = 128
    train_df, test_df = train_test_split(
        traintest_df, train_size=train_size, random_state=4242
    )

    # Reset the indices so the pushed splits carry a clean RangeIndex.
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Collect the splits into a dataset dictionary.
    dataset = DatasetDict(
        {
            "train": Dataset.from_pandas(train_df, split=Split.TRAIN),
            "val": Dataset.from_pandas(val_df, split=Split.VALIDATION),
            "test": Dataset.from_pandas(test_df, split=Split.TEST),
        }
    )

    # Replace any existing Hub repo, then push the new dataset.
    dataset_id = "EuroEval/danske-talemaader"
    HfApi().delete_repo(dataset_id, repo_type="dataset", missing_ok=True)
    dataset.push_to_hub(dataset_id, private=True)


if __name__ == "__main__":
    main()