# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
#     "datasets==3.5.0",
#     "huggingface-hub==0.24.0",
#     "pandas==2.2.0",
#     "requests==2.32.3",
# ]
# ///
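#
# The block above is uv's inline script metadata, so (assuming `uv` is
# installed) the script can be run standalone with:
#
#     uv run create_lr_sum.py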
"""Create the LR-Sum summarisation datasets."""
import pandas as pd
from datasets import Dataset, DatasetDict, Split, load_dataset
from huggingface_hub import HfApi
from .constants import MAX_NUM_CHARS_IN_ARTICLE, MIN_NUM_CHARS_IN_ARTICLE
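
# In the mapping below, the keys are the ISO 639-1 codes used in the EuroEval
# dataset IDs, and the values are assumed to be the corresponding ISO 639-3
# codes that the bltlab/lr-sum dataset uses as its config names.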
LANGUAGES = {"uk": "ukr", "sr": "srp", "bs": "bos", "sq": "sqi"}


def main() -> None:
    """Create the LR-Sum summarisation mini datasets and upload to HF Hub."""
    dataset_id = "bltlab/lr-sum"

    for language, subset in LANGUAGES.items():
        # Load the language-specific subset of the dataset
        dataset = load_dataset(dataset_id, subset)
        assert isinstance(dataset, DatasetDict)

        dataset = dataset.map(make_columns)

        train_df = dataset["train"].to_pandas()
        val_df = dataset["validation"].to_pandas()
        test_df = dataset["test"].to_pandas()
        assert isinstance(train_df, pd.DataFrame)
        assert isinstance(val_df, pd.DataFrame)
        assert isinstance(test_df, pd.DataFrame)

        train_df = process_df(df=train_df)
        val_df = process_df(df=val_df)
        test_df = process_df(df=test_df)

        # Define the split sizes
        test_size = 2048
        val_size = 256
        train_size = 1024

        # Calculate how many additional samples are needed for the test split
        additional_test_samples_needed = test_size - len(test_df)
        if additional_test_samples_needed > 0:
            # Take the additional test samples from the training split
            additional_test_samples = train_df.sample(
                n=additional_test_samples_needed, random_state=4242
            )
        else:
            additional_test_samples = pd.DataFrame()

        # Combine the original test split with the additional samples from train
        final_test_df = pd.concat(
            [test_df, additional_test_samples], ignore_index=True
        )

        # Remove the additional samples from the training split
        remaining_train_df = train_df[
            ~train_df.index.isin(additional_test_samples.index)
        ]

        # Sample the final splits
        final_test_df = final_test_df.sample(
            n=test_size, random_state=4242
        ).reset_index(drop=True)
        final_val_df = val_df.sample(n=val_size, random_state=4242).reset_index(
            drop=True
        )
        final_train_df = remaining_train_df.sample(
            n=train_size, random_state=4242
        ).reset_index(drop=True)
        assert isinstance(final_train_df, pd.DataFrame)
        assert isinstance(final_val_df, pd.DataFrame)
        assert isinstance(final_test_df, pd.DataFrame)

        # Collect the datasets in a dataset dictionary
        mini_dataset = DatasetDict(
            {
                "train": Dataset.from_pandas(final_train_df, split=Split.TRAIN),
                "val": Dataset.from_pandas(final_val_df, split=Split.VALIDATION),
                "test": Dataset.from_pandas(final_test_df, split=Split.TEST),
            }
        )

        # Create the dataset ID
        mini_dataset_id = f"EuroEval/lr-sum-{language}-mini"

        # Remove the dataset from the Hugging Face Hub if it already exists
        HfApi().delete_repo(mini_dataset_id, repo_type="dataset", missing_ok=True)

        # Push the dataset to the Hugging Face Hub
        mini_dataset.push_to_hub(mini_dataset_id, private=True)
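
# With the LANGUAGES mapping above, the loop publishes four private datasets:
# EuroEval/lr-sum-uk-mini, EuroEval/lr-sum-sr-mini, EuroEval/lr-sum-bs-mini and
# EuroEval/lr-sum-sq-mini, each with 1,024 / 256 / 2,048 train/val/test samples.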

def make_columns(sample: dict) -> dict:
    """Map the dataset to have the `text` and `target_text` columns.

    Args:
        sample: A sample from the dataset.

    Returns:
        The sample with the `text` and `target_text` columns.
    """
    sample["text"] = f"{sample['title']}\n\n{sample['text']}"
    sample["target_text"] = sample["summary"]
    return sample
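
# For illustration (hypothetical sample values): passing the sample
#     {"title": "Headline", "text": "Body.", "summary": "One line."}
# through make_columns yields text == "Headline\n\nBody." (the title prepended
# to the article, separated by a blank line) and target_text == "One line.".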

def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """Process the dataframe.

    Args:
        df: A dataframe to process.

    Returns:
        The processed dataframe.
    """
    lengths = df.text.str.len()
    lower_bound = MIN_NUM_CHARS_IN_ARTICLE
    upper_bound = MAX_NUM_CHARS_IN_ARTICLE
    df = df.loc[lengths.between(lower_bound, upper_bound)]
    keep_columns = ["text", "target_text"]
    df = df[keep_columns]
    df = df.reset_index(drop=True)
    return df
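
# Note that Series.between is inclusive on both ends by default, so articles
# whose length equals MIN_NUM_CHARS_IN_ARTICLE or MAX_NUM_CHARS_IN_ARTICLE
# (constants assumed to be defined in the local constants module) are kept.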

if __name__ == "__main__":
    main()