Skip to content

Commit 9739bc2

Browse files
authored
Add Tatoeba QnA dataset (#3114)
Multilingual Tatoeba Q&A Translation Dataset ============================================ 120K entries This dataset contains a list of instructions to translate or paraphrase in multiple languages. It is available in Parquet format and includes the following columns: * INSTRUCTION: The instruction of text to be translated or paraphrased. * RESPONSE: The corresponding response or answer in target language. * SOURCE (tatoeba): The original source text from the Tatoeba database. * METADATA (json): Additional information about each entry, including the target language, UUID, and pair of languages (source and target): `"{"language": "lang", "length": "length of original text","uuid": "uuid (original text + translated text)", "langs-pair": "from_lang-to_lang"}"` The data in this dataset was collected through crowdsourcing efforts and includes translations of various types of content, such as sentences, phrases, idioms, and proverbs. You can find it here: https://huggingface.co/datasets/0x22almostEvil/tatoeba-mt-qna-oa Original dataset is available here: https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt --- Resolves #3115 --------- Co-authored-by: 0x22almostEvil <0x22almostEvil>
1 parent 354cb6e commit 9739bc2

File tree

6 files changed

+1141
-0
lines changed

6 files changed

+1141
-0
lines changed

data/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"oa_stackexchange": "donfu/oa-stackexchange",
2828
"stable_diffusion_instructional_dataset": "MadVoyager/stable_diffusion_instructional_dataset",
2929
"ru_riddles_337": "0x22almostEvil/ru-riddles-377",
30+
"tatoeba_mt_qna_oa": "0x22almostEvil/tatoeba-mt-qna-oa",
3031
}
3132

3233
SAFETY_DATASETS = {
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Multilingual Tatoeba Q&A Translation Dataset
2+
3+
120K entries
4+
5+
This dataset contains a list of instructions to translate or paraphrase in
6+
multiple languages. It is available in Parquet format and includes the following
7+
columns:
8+
9+
- INSTRUCTION: The instruction containing the text to be translated or paraphrased.
10+
- RESPONSE: The corresponding response or answer in target language.
11+
- SOURCE (tatoeba): The original source text from the Tatoeba database.
12+
- METADATA (json): Additional information about each entry, including the target
13+
language, UUID, and pair of languages (source and target):
14+
`{"language": "lang", "length": "length of original text", "uuid":
15+
"uuid (original text + translated text)", "langs-pair": "from_lang-to_lang"}`
16+
17+
The data in this dataset was collected through crowdsourcing efforts and
18+
includes translations of various types of content, such as sentences, phrases,
19+
idioms, and proverbs.
20+
21+
You can find it here:
22+
https://huggingface.co/datasets/0x22almostEvil/tatoeba-mt-qna-oa Original
23+
dataset is available here:
24+
https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import json
2+
import random
3+
import uuid
4+
from dataclasses import dataclass
5+
6+
import datasets
7+
import iso639
8+
import language_names
9+
import language_paraphrase
10+
import language_translate
11+
import pandas as pd
12+
13+
random.seed(42)
14+
15+
16+
class DataProcess:
    """Helpers for building translation/paraphrase instruction strings.

    Wraps ISO 639 code conversion and template-based instruction
    randomization over the project's template/name tables
    (language_translate, language_paraphrase, language_names).
    """

    # Quote pairs used to randomly wrap the source text inside the instruction.
    random_quote = [("'", "'"), ("“", "”"), ("῎", "῏"), ("`", "´"), ("«", "»"), ('"', '"')]

    def randomize_text(self, text, original_lang=None, target_lang=None):
        """Return a randomized instruction asking to translate *text*.

        original_lang is the language the instruction itself is written in;
        target_lang is the language the text comes from.  When both are
        given and equal, a paraphrase template is used instead of a
        translation template.
        """
        # Paraphrase only when both languages are set and identical
        # (equal non-None values imply both are non-None).
        if original_lang is not None and original_lang == target_lang:
            templates = language_paraphrase.random_templates_paraphrase.get(original_lang, {})
        else:
            templates = language_translate.random_templates_translate.get(original_lang, {})
        template = random.choice(list(templates.values()))
        # was: random.choice(DataProcess().random_quote) — no need to build a
        # throwaway instance, random_quote is a class attribute.
        opening_quote, closing_quote = random.choice(self.random_quote)
        # was: DataProcess.language_name(None, ...) — call through self
        # instead of passing None as the instance.
        original_lang_name = self.language_name(original_lang, original_lang)
        target_lang_name = self.language_name(target_lang, original_lang)
        return template.format(
            text=text,
            lang1=target_lang_name,
            lang2=original_lang_name,
            opening_quote=opening_quote,
            closing_quote=closing_quote,
        )

    def convert_code(self, code):
        """Convert a 3-letter ISO 639 code (e.g. "eng") to ISO 639-1 ("en")."""
        return iso639.to_iso639_1(code)

    def language_name(self, lang1, lang2):
        """Return the name of language *lang1* written in language *lang2*.

        Falls back to the iso639 native name when lang1 == lang2 but the
        pair is missing from the lookup table; returns None otherwise.
        """
        name = language_names.language_names.get(lang1, {}).get(lang2)
        if name is not None:
            return name
        # Just in case: a language's name in itself via iso639.
        if lang1 == lang2:
            return iso639.to_native(lang1)
        return None
56+
57+
58+
# Shared processor instance used by create_qna below.
converter = DataProcess()

# EXAMPLES:
#
#   # get a language name / convert an ISO 639 code
#   converter.language_name('ru', 'en')   # Output: Russian
#   converter.convert_code("eng")         # Output: en
#
#   # build an INSTRUCTION: text; to; from
#   converter.randomize_text("test", "uk", "fr")
#   # Ти можеш перекласти цей вислів: 'test'?
#   converter.randomize_text("test", "uk", "de")
#   # Переклади наступний текст "test" з мови "німецька мова"
72+
73+
74+
@dataclass
class QnA:
    """One row of the exported Q&A parquet file.

    Field names are intentionally upper-case: they become the column
    names of the output dataset.
    """

    INSTRUCTION: str  # randomized translate/paraphrase instruction
    RESPONSE: str  # translated (or paraphrased) text
    SOURCE: str  # dataset origin tag ("tatoeba")
    METADATA: str  # JSON string: language, length, uuid, langs-pair
80+
81+
82+
def create_qna(row):
    """Build one QnA record from a dataframe row.

    Expects the columns "Text", "Translated text", "Original lang" and
    "Target lang" (3-letter ISO 639 codes).  METADATA is a JSON string
    carrying the target language, the original text length, a
    deterministic UUID derived from both texts, and the language pair.
    """
    text = row["Text"]
    translation = row["Translated text"]
    lang_from = converter.convert_code(row["Original lang"])
    lang_to = converter.convert_code(row["Target lang"])
    # uuid3 is deterministic: the same text pair always yields the same id.
    # (was: uuid.uuid3(..., str(text + translation)) — the str() around a
    # string concatenation was redundant.)
    uuid_val = uuid.uuid3(uuid.NAMESPACE_OID, text + translation)
    # json with language, original text length, uuid and langs-pair
    metadata = {
        "language": f"{lang_to}",
        "length": f"{len(text)}",
        "uuid": f"{uuid_val}",
        "langs-pair": f"{lang_from}-{lang_to}",
    }
    # randomize_text's second argument is the language the instruction is
    # written in (the target language of the translation).
    instruction = converter.randomize_text(text, lang_to, lang_from)
    return QnA(instruction, translation, "tatoeba", json.dumps(metadata))
104+
105+
106+
# Load the source dataset from the Hugging Face hub.
hf_dataset = datasets.load_dataset("0x22almostEvil/tatoeba-mt-llama-only", split="train")

# The original split is ~3M rows; keeping 1 of 30 shards leaves ~120K.
hf_dataset = hf_dataset.shard(num_shards=30, index=0)
print(hf_dataset)

# Convert the dataset to a pandas dataframe and build one QnA per row.
df = pd.DataFrame(hf_dataset)
qna_list = df.apply(create_qna, axis=1).tolist()

# Save the QnA objects as a parquet file.
# NOTE(review): fixed the "taboeba" -> "tatoeba" typo in the output filename.
qna_df = pd.DataFrame(qna_list, columns=["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"])
qna_df.to_parquet("translation-tatoeba-qna-120k-oa.parquet", row_group_size=100, engine="pyarrow", index=False)

0 commit comments

Comments
 (0)