
Commit 9ce0597

fineguy authored and The TensorFlow Datasets Authors committed

Add Dolma dataset to TFDS.

PiperOrigin-RevId: 670153829

1 parent 025d315 commit 9ce0597

File tree: 9 files changed, +2566 -0 lines changed
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
@article{dolma,
  title = {{Dolma: An Open Corpus of Three Trillion Tokens for Language Model Pretraining Research}},
  author = {
    Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and
    Russell Authur and Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and
    Valentin Hofmann and Ananya Harsh Jha and Sachin Kumar and Li Lucy and Xinxi Lyu and Ian Magnusson and
    Jacob Morrison and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters and
    Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and Emma Strubell and Nishant Subramani and
    Oyvind Tafjord and Evan Pete Walsh and Hannaneh Hajishirzi and Noah A. Smith and Luke Zettlemoyer and
    Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo
  },
  year = {2024},
  journal = {arXiv preprint},
}
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining
Research
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
content.data-type.text  # Contains text data.
content.language.en  # Contains text in language English / en.
content.monolingual  # Contains text in 1 natural language.
ml.task.language-modelling  # Relates to Language Modelling, a machine learning task.
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

tensorflow_datasets/datasets/dolma/checksums.tsv

Lines changed: 2420 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dolma dataset."""

import gzip
import json

from etils import epath
import tensorflow_datasets.public_api as tfds

_URL = (
    'https://huggingface.co/datasets/allenai/dolma/resolve/main/urls/v1_7.txt'
)


class Builder(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for Dolma dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            'id': tfds.features.Text(),
            'text': tfds.features.Text(),
            'added': tfds.features.Text(),
            'created': tfds.features.Text(),
            'source': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/allenai/dolma',
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    urls_filepath = dl_manager.download_and_extract(_URL)
    urls = epath.Path(urls_filepath).read_text().splitlines()
    filepaths = dl_manager.download(urls)

    return {
        'train': self._generate_examples(filepaths),
    }

  def _generate_examples(self, filepaths):
    """Yields examples."""
    beam = tfds.core.lazy_imports.apache_beam

    def _process_file(file_idx, filepath):
      with epath.Path(filepath).open('rb') as gz_file, gzip.open(gz_file) as f:
        for line_idx, line in enumerate(f):
          row = json.loads(line)
          yield f'{file_idx}_{line_idx}', {
              # Note: there are duplicate ids
              'id': row['id'],
              'text': row['text'],
              'added': str(row.get('added', '')),
              'created': str(row.get('created', '')),
              'source': row.get('source', ''),
          }

    return beam.Create(enumerate(filepaths)) | beam.FlatMapTuple(_process_file)
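
For orientation, a minimal usage sketch (not part of this commit) of how the builder above would typically be exercised once registered as `dolma`; it assumes a standard TFDS setup with Apache Beam installed, since `_generate_examples` returns a Beam transform rather than a plain Python generator.

# Illustrative sketch, not part of the commit; assumes the builder is
# registered as `dolma` and Apache Beam is available for generation.
import tensorflow_datasets as tfds

builder = tfds.builder('dolma')
builder.download_and_prepare()  # fetches the v1_7 URL list, then each gzipped JSONL shard
ds = builder.as_dataset(split='train')
for example in ds.take(1):
  print(example['id'], example['source'])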
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dolma dataset."""

from tensorflow_datasets.datasets.dolma import dolma_dataset_builder
import tensorflow_datasets.public_api as tfds


class DolmaTest(tfds.testing.DatasetBuilderTestCase):
  """Tests for Dolma dataset."""

  DATASET_CLASS = dolma_dataset_builder.Builder
  SPLITS = {'train': 1}

  DL_DOWNLOAD_RESULT = ['texts.json.gz']
  DL_EXTRACT_RESULT = 'urls.txt'


if __name__ == '__main__':
  tfds.testing.test_main()
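
The test mocks the download manager: `DL_EXTRACT_RESULT = 'urls.txt'` points the extracted URL list at the dummy `urls.txt`, and `DL_DOWNLOAD_RESULT = ['texts.json.gz']` makes `dl_manager.download` return the dummy gzipped shard, which must yield exactly one example to satisfy `SPLITS = {'train': 1}`. As a hypothetical illustration (field values invented, not the actual dummy file in this commit), such a one-record shard could be produced like this:

# Hypothetical sketch of building a one-record dummy shard; the field
# values below are invented for illustration only.
import gzip
import json

record = {
    'id': 'dummy-0',
    'text': 'dummy text',
    'added': '2024-01-01',
    'created': '2024-01-01',
    'source': 'dummy',
}
with gzip.open('texts.json.gz', 'wt') as f:
  f.write(json.dumps(record) + '\n')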
Binary file not shown.

tensorflow_datasets/datasets/dolma/dummy_data/urls.txt

Whitespace-only changes.
