
Commit 9ce0597

fineguy authored and The TensorFlow Datasets Authors committed

Add Dolma dataset to TFDS.

PiperOrigin-RevId: 670153829

1 parent 025d315 commit 9ce0597

File tree: 9 files changed, +2566 -0 lines changed
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
@article{dolma,
  title = {{Dolma: An Open Corpus of Three Trillion Tokens for Language Model Pretraining Research}},
  author = {
    Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and
    Russell Authur and Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and
    Valentin Hofmann and Ananya Harsh Jha and Sachin Kumar and Li Lucy and Xinxi Lyu and Ian Magnusson and
    Jacob Morrison and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters and
    Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and Emma Strubell and Nishant Subramani and
    Oyvind Tafjord and Evan Pete Walsh and Hannaneh Hajishirzi and Noah A. Smith and Luke Zettlemoyer and
    Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo
  },
  year = {2024},
  journal = {arXiv preprint},
}
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining
Research
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
content.data-type.text  # Contains text data.
content.language.en  # Contains text in language English / en.
content.monolingual  # Contains text in 1 natural language.
ml.task.language-modelling  # Relates to Language Modelling, a machine learning task.
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

tensorflow_datasets/datasets/dolma/checksums.tsv

Lines changed: 2420 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dolma dataset."""

import gzip
import json

from etils import epath
import tensorflow_datasets.public_api as tfds

_URL = (
    'https://huggingface.co/datasets/allenai/dolma/resolve/main/urls/v1_7.txt'
)


class Builder(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for Dolma dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            'id': tfds.features.Text(),
            'text': tfds.features.Text(),
            'added': tfds.features.Text(),
            'created': tfds.features.Text(),
            'source': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/allenai/dolma',
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    urls_filepath = dl_manager.download_and_extract(_URL)
    urls = epath.Path(urls_filepath).read_text().splitlines()
    filepaths = dl_manager.download(urls)

    return {
        'train': self._generate_examples(filepaths),
    }

  def _generate_examples(self, filepaths):
    """Yields examples."""
    beam = tfds.core.lazy_imports.apache_beam

    def _process_file(file_idx, filepath):
      with epath.Path(filepath).open('rb') as gz_file, gzip.open(gz_file) as f:
        for line_idx, line in enumerate(f):
          row = json.loads(line)
          yield f'{file_idx}_{line_idx}', {
              # Note: there are duplicate ids
              'id': row['id'],
              'text': row['text'],
              'added': str(row.get('added', '')),
              'created': str(row.get('created', '')),
              'source': row.get('source', ''),
          }

    return beam.Create(enumerate(filepaths)) | beam.FlatMapTuple(_process_file)
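
For orientation, a minimal usage sketch (not part of this commit) of how the builder above would typically be exercised once registered as `dolma`; it assumes a standard TFDS setup with Apache Beam installed, since `_generate_examples` returns a Beam transform rather than a plain Python generator.

# Illustrative sketch, not part of the commit; assumes the builder is
# registered as `dolma` and Apache Beam is available for generation.
import tensorflow_datasets as tfds

builder = tfds.builder('dolma')
builder.download_and_prepare()  # fetches the v1_7 URL list, then each gzipped JSONL shard
ds = builder.as_dataset(split='train')
for example in ds.take(1):
  print(example['id'], example['source'])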
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dolma dataset."""

from tensorflow_datasets.datasets.dolma import dolma_dataset_builder
import tensorflow_datasets.public_api as tfds


class DolmaTest(tfds.testing.DatasetBuilderTestCase):
  """Tests for Dolma dataset."""

  DATASET_CLASS = dolma_dataset_builder.Builder
  SPLITS = {'train': 1}

  DL_DOWNLOAD_RESULT = ['texts.json.gz']
  DL_EXTRACT_RESULT = 'urls.txt'


if __name__ == '__main__':
  tfds.testing.test_main()
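
The test mocks the download manager: `DL_EXTRACT_RESULT = 'urls.txt'` points the extracted URL list at the dummy `urls.txt`, and `DL_DOWNLOAD_RESULT = ['texts.json.gz']` makes `dl_manager.download` return the dummy gzipped shard, which must yield exactly one example to satisfy `SPLITS = {'train': 1}`. As a hypothetical illustration (field values invented, not the actual dummy file in this commit), such a one-record shard could be produced like this:

# Hypothetical sketch of building a one-record dummy shard; the field
# values below are invented for illustration only.
import gzip
import json

record = {
    'id': 'dummy-0',
    'text': 'dummy text',
    'added': '2024-01-01',
    'created': '2024-01-01',
    'source': 'dummy',
}
with gzip.open('texts.json.gz', 'wt') as f:
  f.write(json.dumps(record) + '\n')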
Binary file not shown.

tensorflow_datasets/datasets/dolma/dummy_data/urls.txt

Whitespace-only changes.
