EuroEval-MT/src/scripts/create_mms.py at main · alexandrainst/EuroEval-MT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
#     "datasets==3.5.0",
#     "huggingface-hub==0.24.0",
#     "pandas==2.2.0",
#     "requests==2.32.3",
# ]
# ///

"""Create the MMS sentiment datasets and upload to HF Hub."""

import pandas as pd
from datasets import Dataset, DatasetDict, Split, load_dataset
from huggingface_hub import HfApi
from sklearn.utils import resample

LANGUAGES = ["sr", "hr", "bs", "sq"]


def main() -> None:
    """Create the MMS sentiment datasets and upload to HF Hub."""
    # Define the dataset repository
    repo_id = "Brand24/mms"

    # Download the dataset
    dataset = load_dataset(path=repo_id)
    assert isinstance(dataset, DatasetDict)

    # Get the train split (MMS only has train split)
    df_all_languages = dataset["train"].to_pandas()
    assert isinstance(df_all_languages, pd.DataFrame)

    for language in LANGUAGES:
        # Filter based on language
        df = df_all_languages[df_all_languages["language"] == language].reset_index(
            drop=True
        )
        assert isinstance(df, pd.DataFrame)

        # Map numeric labels to string labels
        # 0: negative, 1: neutral, 2: positive
        df["label"] = df["label"].map(
            lambda x: {0: "negative", 1: "neutral", 2: "positive"}[x]
        )

        # Remove duplicates
        df = df.drop_duplicates().reset_index(drop=True)

        # Create a uniform distribution based on the original dataset and label columns
        df_uniform_original_dataset = create_uniform_distribution(
            df=df, column="original_dataset"
        )
        df_uniform_label = create_uniform_distribution(
            df=df_uniform_original_dataset, column="label"
        )
        df_remaining = df[~df.index.isin(df_uniform_label.index)]
        assert isinstance(df_remaining, pd.DataFrame)
        df_remaining_uniform_label = create_uniform_distribution(
            df=df_remaining, column="label"
        )
        df = df_uniform_label

        # Define split sizes
        test_size = 2048
        val_size = 256
        train_size = 1024

        n_missing_samples = test_size + val_size + train_size - len(df)

        if n_missing_samples > 0:
            df_additional = df_remaining_uniform_label.sample(
                n=n_missing_samples, random_state=4242
            )
            df = pd.concat([df, df_additional])

        # Shuffle the dataframe
        df = df.sample(frac=1, random_state=4242).reset_index(drop=True)

        # Keep only text and label columns
        df = df[["text", "label"]]

        # Create splits
        test_df = df.iloc[:test_size].reset_index(drop=True)
        val_df = df.iloc[test_size : test_size + val_size].reset_index(drop=True)
        train_df = df.iloc[
            test_size + val_size : test_size + val_size + train_size
        ].reset_index(drop=True)

        # Create dataset dictionary with custom splits
        dataset_dict = DatasetDict(
            {
                "train": Dataset.from_pandas(train_df, split=Split.TRAIN),
                "val": Dataset.from_pandas(val_df, split=Split.VALIDATION),
                "test": Dataset.from_pandas(test_df, split=Split.TEST),
            }
        )

        # Push the dataset to the Hugging Face Hub
        dataset_id = f"EuroEval/mms-{language}-mini"
        HfApi().delete_repo(dataset_id, repo_type="dataset", missing_ok=True)
        dataset_dict.push_to_hub(dataset_id, private=True)


def create_uniform_distribution(
    df: pd.DataFrame, column: str = "label", random_state: int = 4242
) -> pd.DataFrame:
    """Create a sampled dataset with a uniform distribution for the given column.

    Args:
        df: The input dataframe.
        column: The name of the column to create a uniform label distribution for
        random_state: The random state for reproducibility.

    Returns:
        A dataframe with a uniform label distribution.
    """
    # Separate each class
    classes = df[column].unique()
    class_dfs = [df[df[column] == label] for label in classes]

    # Find the size of the smallest class
    min_size = min(len(class_df) for class_df in class_dfs)

    # Resample each class to the size of the smallest class
    resampled_dfs = [
        resample(class_df, replace=False, n_samples=min_size, random_state=random_state)
        for class_df in class_dfs
    ]

    # Combine the resampled dataframes (keep original indices!)
    balanced_df = pd.concat(resampled_dfs, ignore_index=False)

    return balanced_df


if __name__ == "__main__":
    main()