Skip to content

Commit 9d8f364

Browse files
Alfonso Castaño authored and The TensorFlow Datasets Authors committed
Add builder for LBPP
PiperOrigin-RevId: 746328101
1 parent 92cbcff commit 9d8f364

File tree

4 files changed

+219
-0
lines changed

4 files changed

+219
-0
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
@inproceedings{matton-etal-2024-leakage,
2+
title = "On Leakage of Code Generation Evaluation Datasets",
3+
author = "Matton, Alexandre and
4+
Sherborne, Tom and
5+
Aumiller, Dennis and
6+
Tommasone, Elena and
7+
Alizadeh, Milad and
8+
He, Jingyi and
9+
Ma, Raymond and
10+
Voisin, Maxime and
11+
Gilsenan-McMahon, Ellen and
12+
Gall{\'e}, Matthias",
13+
editor = "Al-Onaizan, Yaser and
14+
Bansal, Mohit and
15+
Chen, Yun-Nung",
16+
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
17+
month = nov,
18+
year = "2024",
19+
address = "Miami, Florida, USA",
20+
publisher = "Association for Computational Linguistics",
21+
url = "https://aclanthology.org/2024.findings-emnlp.772/",
22+
doi = "10.18653/v1/2024.findings-emnlp.772",
23+
pages = "13215--13223",
24+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
*Less Basic Python Programming* is a collection of 161 programming problems
2+
with accompanying unit tests.
3+
They were created with the aim of being fresh (not leaked at the time of
4+
creation) and more difficult than similar datasets (e.g., HumanEval and MBPP).
5+
The collection can serve as a drop-in replacement for, or enrichment of, those
datasets, as it is structured in an equivalent way.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Copyright 2024 Cohere and the current dataset script contributor.
17+
#
18+
# Licensed under the Apache License, Version 2.0 (the "License");
19+
# you may not use this file except in compliance with the License.
20+
# You may obtain a copy of the License at
21+
#
22+
# http://www.apache.org/licenses/LICENSE-2.0
23+
#
24+
# Unless required by applicable law or agreed to in writing, software
25+
# distributed under the License is distributed on an "AS IS" BASIS,
26+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27+
# See the License for the specific language governing permissions and
28+
# limitations under the License.
29+
# Author Note: Data loader is heavily inspired by the builder in
30+
# https://github.com/google-research/google-research/tree/main/lbpp_dataset
31+
"""Cohere Less Basic Python Problems. All columns decoded."""
32+
33+
import base64
34+
import json
35+
import pickle
36+
import zlib
37+
38+
from tensorflow_datasets.core.utils.lazy_imports_utils import pandas as pd
39+
import tensorflow_datasets.public_api as tfds
40+
41+
42+
# Paper introducing the dataset (EMNLP 2024 Findings).
_HOMEPAGE = "https://aclanthology.org/2024.findings-emnlp.772/"

_VERSION = tfds.core.Version("2.0.0")

# Feature columns present in every example of every language split.
_COLUMNS = [
    "task_id",
    "language",
    "title",
    "instruction",
    "completion",
    "test_file",
    "test_list",
    "signature",
    "categories",
    "test_setup",
]

# Languages with their own subdirectory in the Hugging Face repository.
_LANGUAGES = ["python", "cpp", "go", "java", "js", "rust"]
# Config names that select every language at once.
_ALL_LANGUAGE_ALIASES = ["all", "multilingual"]
# Config-name aliases mapped to the directory name that is actually downloaded.
_LANGUAGE_ALIAS_MAP = {
    "default": "python",
    "javascript": "js",
}
65+
66+
67+
def decode_str(str_to_decode: str):
  """Reverses the dataset's column encoding: base64 -> zlib -> pickle -> JSON.

  Args:
    str_to_decode: base64 text holding a zlib-compressed pickle of a JSON
      string.

  Returns:
    The Python object parsed from the embedded JSON string.
  """
  # NOTE(security): `pickle.loads` runs on downloaded data; only use with
  # files from the trusted dataset repository.
  compressed = base64.b64decode(str_to_decode.encode("utf-8"))
  json_payload = pickle.loads(zlib.decompress(compressed))
  return json.loads(json_payload)
73+
74+
75+
class LBPPConfig(tfds.core.BuilderConfig):
  """BuilderConfig for one LBPP language variant."""

  def __init__(self, name, description, features, **kwargs):
    """Records the config name, its description, and its feature columns."""
    super().__init__(name=name, version=_VERSION, **kwargs)
    self.name = name
    self.description = description
    self.features = features
83+
84+
85+
class Builder(tfds.core.GeneratorBasedBuilder):
  """Builder for the LBPP (Less Basic Python Problems) dataset.

  Each config selects one language split of the dataset; the "all" and
  "multilingual" configs select every language. Parquet files are downloaded
  from the Hugging Face hub and several compressed columns are decoded on the
  fly via `decode_str`.
  """

  VERSION = _VERSION
  LICENSE = "apache-2.0"
  BUILDER_CONFIGS = [
      LBPPConfig(
          name="all", description="Multilingual LBPP", features=_COLUMNS
      ),
      LBPPConfig(
          name="multilingual",
          description="Multilingual LBPP",
          features=_COLUMNS,
      ),
      LBPPConfig(name="default", description="Python LBPP", features=_COLUMNS),
      LBPPConfig(name="python", description="Python LBPP", features=_COLUMNS),
      LBPPConfig(name="cpp", description="C++ LBPP", features=_COLUMNS),
      LBPPConfig(name="go", description="Go LBPP", features=_COLUMNS),
      LBPPConfig(name="java", description="Java LBPP", features=_COLUMNS),
      LBPPConfig(name="js", description="JavaScript LBPP", features=_COLUMNS),
      LBPPConfig(
          name="javascript", description="JavaScript LBPP", features=_COLUMNS
      ),
      # Bug fix: this config was previously described as "JavaScript LBPP".
      LBPPConfig(name="rust", description="Rust LBPP", features=_COLUMNS),
  ]
  DEFAULT_CONFIG_NAME = "python"

  def _info(self):
    """Declares the feature schema shared by every language config."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            "task_id": tfds.features.Text(),
            "language": tfds.features.Text(),
            "title": tfds.features.Text(),
            "instruction": tfds.features.Text(),
            "completion": tfds.features.Text(),
            "test_file": tfds.features.Text(),
            "test_list": tfds.features.Sequence(tfds.features.Text()),
            "signature": tfds.features.Text(),
            "categories": tfds.features.Sequence(tfds.features.Text()),
            "test_setup": tfds.features.Text(),
        }),
        homepage=_HOMEPAGE,
        supervised_keys=None,
    )

  def _split_generators(self, dl_manager):
    """Downloads the parquet file(s) for the selected config.

    Args:
      dl_manager: a `tfds.download.DownloadManager`.

    Returns:
      A single TEST split generator over the downloaded files.
    """
    # Map alias configs (e.g. "default", "javascript") to the directory name
    # used in the Hugging Face repository.
    data_loading_name = _LANGUAGE_ALIAS_MAP.get(
        self.builder_config.name, self.builder_config.name
    )
    hf_url_prefix = (
        "https://huggingface.co/datasets/CohereForAI/lbpp/resolve/main/"
    )
    if data_loading_name in _ALL_LANGUAGE_ALIASES:
      # "all"/"multilingual": download every per-language file.
      download_targets = [
          f"{hf_url_prefix}{lang}/test.parquet" for lang in _LANGUAGES
      ]
    else:
      download_targets = [f"{hf_url_prefix}{data_loading_name}/test.parquet"]

    downloaded_files = dl_manager.download(download_targets)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                "filepaths": downloaded_files,
            },
        )
    ]

  def _generate_examples(self, filepaths: list[str]):
    """Yields `(key, example)` pairs read from the downloaded parquet files.

    Args:
      filepaths: local paths of the parquet files to read.

    Yields:
      A running-integer key and a feature dict matching `_info`. The
      `completion`, `test_file`, `test_list` and `test_setup` columns are
      stored compressed and are decoded with `decode_str`; `signature` may be
      null in the source data and is normalized to an empty string.
    """
    key = 0
    for filepath in filepaths:
      df = pd.read_parquet(filepath)
      for line in df.to_dict(orient="records"):
        yield key, {
            "task_id": line["task_id"],
            "language": line["language"],
            "title": line["title"],
            "instruction": line["instruction"],
            "completion": decode_str(line["completion"]),
            "test_file": decode_str(line["test_file"]),
            "test_list": decode_str(line["test_list"]),
            "signature": line["signature"] or "",
            # NOTE(review): `categories` is passed through undecoded —
            # presumably already a plain sequence of strings in the parquet.
            "categories": line["categories"],
            "test_setup": decode_str(line["test_setup"]),
        }
        key += 1

0 commit comments

Comments
 (0)