Skip to content

Commit 9d8f364

Browse files
Alfonso Castaño authored and The TensorFlow Datasets Authors committed
Add builder for LBPP
PiperOrigin-RevId: 746328101
1 parent 92cbcff commit 9d8f364

File tree

4 files changed

+219
-0
lines changed

4 files changed

+219
-0
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
@inproceedings{matton-etal-2024-leakage,
2+
title = "On Leakage of Code Generation Evaluation Datasets",
3+
author = "Matton, Alexandre and
4+
Sherborne, Tom and
5+
Aumiller, Dennis and
6+
Tommasone, Elena and
7+
Alizadeh, Milad and
8+
He, Jingyi and
9+
Ma, Raymond and
10+
Voisin, Maxime and
11+
Gilsenan-McMahon, Ellen and
12+
Gall{\'e}, Matthias",
13+
editor = "Al-Onaizan, Yaser and
14+
Bansal, Mohit and
15+
Chen, Yun-Nung",
16+
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
17+
month = nov,
18+
year = "2024",
19+
address = "Miami, Florida, USA",
20+
publisher = "Association for Computational Linguistics",
21+
url = "https://aclanthology.org/2024.findings-emnlp.772/",
22+
doi = "10.18653/v1/2024.findings-emnlp.772",
23+
pages = "13215--13223",
24+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
*Less Basic Python Programming* is a collection of 161 programming problems
2+
with accompanying unit tests.
3+
They were created with the aim of being fresh (not leaked at the time of
4+
creation) and more difficult than similar datasets (e.g., HumanEval and MBPP).
5+
The collection can serve as a drop-in replacement for, or enrichment of, those
datasets, as it is structured in an equivalent way.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Copyright 2024 Cohere and the current dataset script contributor.
17+
#
18+
# Licensed under the Apache License, Version 2.0 (the "License");
19+
# you may not use this file except in compliance with the License.
20+
# You may obtain a copy of the License at
21+
#
22+
# http://www.apache.org/licenses/LICENSE-2.0
23+
#
24+
# Unless required by applicable law or agreed to in writing, software
25+
# distributed under the License is distributed on an "AS IS" BASIS,
26+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27+
# See the License for the specific language governing permissions and
28+
# limitations under the License.
29+
# Author Note: Data loader is heavily inspired by the builder in
30+
# https://github.com/google-research/google-research/tree/main/lbpp_dataset
31+
"""Cohere Less Basic Python Problems. All columns decoded."""
32+
33+
import base64
34+
import json
35+
import pickle
36+
import zlib
37+
38+
from tensorflow_datasets.core.utils.lazy_imports_utils import pandas as pd
39+
import tensorflow_datasets.public_api as tfds
40+
41+
42+
# Paper introducing the dataset (EMNLP 2024 Findings).
_HOMEPAGE = "https://aclanthology.org/2024.findings-emnlp.772/"

_VERSION = tfds.core.Version("2.0.0")

# Feature columns present in every example of every language split.
_COLUMNS = [
    "task_id",
    "language",
    "title",
    "instruction",
    "completion",
    "test_file",
    "test_list",
    "signature",
    "categories",
    "test_setup",
]

# Languages with their own subdirectory in the Hugging Face repository.
_LANGUAGES = ["python", "cpp", "go", "java", "js", "rust"]
# Config names that select every language at once.
_ALL_LANGUAGE_ALIASES = ["all", "multilingual"]
# Config-name aliases mapped to the directory name that is actually downloaded.
_LANGUAGE_ALIAS_MAP = {
    "default": "python",
    "javascript": "js",
}
65+
66+
67+
def decode_str(str_to_decode: str):
  """Reverses the dataset's column encoding: base64 -> zlib -> pickle -> JSON.

  Args:
    str_to_decode: base64 text holding a zlib-compressed pickle of a JSON
      string.

  Returns:
    The Python object parsed from the embedded JSON string.
  """
  # NOTE(security): `pickle.loads` runs on downloaded data; only use with
  # files from the trusted dataset repository.
  compressed = base64.b64decode(str_to_decode.encode("utf-8"))
  json_payload = pickle.loads(zlib.decompress(compressed))
  return json.loads(json_payload)
73+
74+
75+
class LBPPConfig(tfds.core.BuilderConfig):
  """BuilderConfig for one LBPP language variant."""

  def __init__(self, name, description, features, **kwargs):
    """Records the config name, its description, and its feature columns."""
    super().__init__(name=name, version=_VERSION, **kwargs)
    self.name = name
    self.description = description
    self.features = features
83+
84+
85+
class Builder(tfds.core.GeneratorBasedBuilder):
  """Builder for the LBPP (Less Basic Python Problems) dataset.

  Each config selects one language split of the dataset; the "all" and
  "multilingual" configs select every language. Parquet files are downloaded
  from the Hugging Face hub and several compressed columns are decoded on the
  fly via `decode_str`.
  """

  VERSION = _VERSION
  LICENSE = "apache-2.0"
  BUILDER_CONFIGS = [
      LBPPConfig(
          name="all", description="Multilingual LBPP", features=_COLUMNS
      ),
      LBPPConfig(
          name="multilingual",
          description="Multilingual LBPP",
          features=_COLUMNS,
      ),
      LBPPConfig(name="default", description="Python LBPP", features=_COLUMNS),
      LBPPConfig(name="python", description="Python LBPP", features=_COLUMNS),
      LBPPConfig(name="cpp", description="C++ LBPP", features=_COLUMNS),
      LBPPConfig(name="go", description="Go LBPP", features=_COLUMNS),
      LBPPConfig(name="java", description="Java LBPP", features=_COLUMNS),
      LBPPConfig(name="js", description="JavaScript LBPP", features=_COLUMNS),
      LBPPConfig(
          name="javascript", description="JavaScript LBPP", features=_COLUMNS
      ),
      # Bug fix: this config was previously described as "JavaScript LBPP".
      LBPPConfig(name="rust", description="Rust LBPP", features=_COLUMNS),
  ]
  DEFAULT_CONFIG_NAME = "python"

  def _info(self):
    """Declares the feature schema shared by every language config."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            "task_id": tfds.features.Text(),
            "language": tfds.features.Text(),
            "title": tfds.features.Text(),
            "instruction": tfds.features.Text(),
            "completion": tfds.features.Text(),
            "test_file": tfds.features.Text(),
            "test_list": tfds.features.Sequence(tfds.features.Text()),
            "signature": tfds.features.Text(),
            "categories": tfds.features.Sequence(tfds.features.Text()),
            "test_setup": tfds.features.Text(),
        }),
        homepage=_HOMEPAGE,
        supervised_keys=None,
    )

  def _split_generators(self, dl_manager):
    """Downloads the parquet file(s) for the selected config.

    Args:
      dl_manager: a `tfds.download.DownloadManager`.

    Returns:
      A single TEST split generator over the downloaded files.
    """
    # Map alias configs (e.g. "default", "javascript") to the directory name
    # used in the Hugging Face repository.
    data_loading_name = _LANGUAGE_ALIAS_MAP.get(
        self.builder_config.name, self.builder_config.name
    )
    hf_url_prefix = (
        "https://huggingface.co/datasets/CohereForAI/lbpp/resolve/main/"
    )
    if data_loading_name in _ALL_LANGUAGE_ALIASES:
      # "all"/"multilingual": download every per-language file.
      download_targets = [
          f"{hf_url_prefix}{lang}/test.parquet" for lang in _LANGUAGES
      ]
    else:
      download_targets = [f"{hf_url_prefix}{data_loading_name}/test.parquet"]

    downloaded_files = dl_manager.download(download_targets)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                "filepaths": downloaded_files,
            },
        )
    ]

  def _generate_examples(self, filepaths: list[str]):
    """Yields `(key, example)` pairs read from the downloaded parquet files.

    Args:
      filepaths: local paths of the parquet files to read.

    Yields:
      A running-integer key and a feature dict matching `_info`. The
      `completion`, `test_file`, `test_list` and `test_setup` columns are
      stored compressed and are decoded with `decode_str`; `signature` may be
      null in the source data and is normalized to an empty string.
    """
    key = 0
    for filepath in filepaths:
      df = pd.read_parquet(filepath)
      for line in df.to_dict(orient="records"):
        yield key, {
            "task_id": line["task_id"],
            "language": line["language"],
            "title": line["title"],
            "instruction": line["instruction"],
            "completion": decode_str(line["completion"]),
            "test_file": decode_str(line["test_file"]),
            "test_list": decode_str(line["test_list"]),
            "signature": line["signature"] or "",
            # NOTE(review): `categories` is passed through undecoded —
            # presumably already a plain sequence of strings in the parquet.
            "categories": line["categories"],
            "test_setup": decode_str(line["test_setup"]),
        }
        key += 1

0 commit comments

Comments
 (0)