add Builder for AI2D-Caption dataset.

pierrot0 · The TensorFlow Datasets Authors · commit bc48d0573f34 · 2024-10-14T08:59:05.000-07:00
PiperOrigin-RevId: 685722963
diff --git a/tensorflow_datasets/datasets/ai2dcaption/CITATIONS.bib b/tensorflow_datasets/datasets/ai2dcaption/CITATIONS.bib
@@ -0,0 +1,6 @@
+@inproceedings{Zala2024DiagrammerGPT,
+        author = {Abhay Zala and Han Lin and Jaemin Cho and Mohit Bansal},
+        title = {DiagrammerGPT: Generating Open-Domain, Open-Platform Diagrams via LLM Planning},
+        year = {2024},
+        booktitle = {COLM},
+}
diff --git a/tensorflow_datasets/datasets/ai2dcaption/README.md b/tensorflow_datasets/datasets/ai2dcaption/README.md
@@ -0,0 +1,5 @@
+This dataset is primarily based on the AI2D Dataset (see [here](
+  https://prior.allenai.org/projects/diagram-understanding)).
+
+See [Section 4.1](https://arxiv.org/pdf/2310.12128) of the DiagrammerGPT
+paper for the AI2D-Caption dataset annotation process.
diff --git a/tensorflow_datasets/datasets/ai2dcaption/TAGS.txt b/tensorflow_datasets/datasets/ai2dcaption/TAGS.txt
@@ -0,0 +1,3 @@
+content.data-type.image # Contains image data.
+content.language.en # Contains text in language English / en.
+content.subject.biology # Relates to biology.
diff --git a/tensorflow_datasets/datasets/ai2dcaption/__init__.py b/tensorflow_datasets/datasets/ai2dcaption/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2024 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/tensorflow_datasets/datasets/ai2dcaption/ai2dcaption_dataset_builder.py b/tensorflow_datasets/datasets/ai2dcaption/ai2dcaption_dataset_builder.py
@@ -0,0 +1,170 @@
+# coding=utf-8
+# Copyright 2024 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AI2DCaption dataset."""
+
+import json
+import os.path
+
+import tensorflow_datasets.public_api as tfds
+
+LAYOUT_NAMES = [  # Class names of the 'layout' feature.
+    'abstract',
+    'circular',
+    'columns',
+    'linear',
+    'rows',
+    'tree',
+    'unspecified',
+]
+
+TOPIC_NAMES = ['astronomy', 'biology', 'engineering', 'unspecified']  # 'topic' class names.
+
+TYPE_NAMES = [  # Class names of the per-entity 'type' feature.
+    'arrow',
+    'image',
+    'object',
+    'relationship',
+    'text',
+]
+
+CATEGORIES = [  # Class names of the per-entity 'cat' feature.
+    'imageCaption',
+    'imageTitle',
+    'interObjectLinkage',
+    'intraObjectLabel',
+    'intraObjectLinkage',
+    'intraObjectRegionLabel',
+    'intraObjectTextLinkage',
+    'misc',
+    'sectionTitle',
+    'unspecified',
+]
+
+SPLITS = [  # Split names; each one is formatted into JSON_URL_TMPL.
+    'auditor_llm_training_examples',
+    'gpt4v',
+    'llava_15',
+    'planner_llm_training_examples',
+    'test',
+]
+
+JSON_URL_TMPL = 'https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_{split}.json?download=true'
+
+IMAGES_URL = 'http://ai2-website.s3.amazonaws.com/data/ai2d-all.zip'  # Zip archive with the images.
+
+
+class Builder(tfds.core.GeneratorBasedBuilder):
+  """DatasetBuilder for AI2DCaption dataset."""
+
+  VERSION = tfds.core.Version('1.0.0')
+  RELEASE_NOTES = {
+      '1.0.0': 'Initial release.',
+  }
+
+  def _info(self) -> tfds.core.DatasetInfo:
+    """Returns the dataset metadata."""
+    return self.dataset_info_from_configs(
+        features=tfds.features.FeaturesDict({
+            'image': tfds.features.Image(
+                shape=(None, None, 3),
+                doc=tfds.features.Documentation(
+                    desc='The image of the diagram.',
+                ),
+            ),
+            'image_filename': tfds.features.Text(
+                doc=tfds.features.Documentation(
+                    desc='Image filename. e.g. "1337.png"',
+                ),
+            ),
+            'topic': tfds.features.ClassLabel(names=TOPIC_NAMES),
+            'layout': tfds.features.ClassLabel(names=LAYOUT_NAMES),
+            'caption': tfds.features.Text(),
+            'relationships': tfds.features.Sequence(tfds.features.Text()),
+            'entities': tfds.features.Sequence(
+                tfds.features.FeaturesDict({
+                    'id': tfds.features.Text(),
+                    'type': tfds.features.ClassLabel(names=TYPE_NAMES),
+                    'label': tfds.features.Text(),
+                    'bounds': tfds.features.BBoxFeature(),
+                    # Not always present in the JSON; defaulted when missing:
+                    'cat': tfds.features.ClassLabel(names=CATEGORIES),
+                    'from': tfds.features.Text(),
+                    'to': tfds.features.Text(),
+                })
+            ),
+        }),
+        supervised_keys=None,
+        homepage='https://huggingface.co/datasets/abhayzala/AI2D-Caption',
+    )
+
+  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+    """Returns SplitGenerators."""
+    paths = {split: JSON_URL_TMPL.format(split=split) for split in SPLITS}
+    paths['images'] = IMAGES_URL
+    dl_paths = dl_manager.download(paths)  # Download only, no extraction.
+
+    return {  # One example generator per split name.
+        split: self._generate_examples(
+            split, dl_paths[split], dl_paths['images']
+        )
+        for split in SPLITS
+    }
+
+  def _generate_examples(self, split, json_path, images_path):
+    """Yields examples."""
+    # Index this split's JSON metadata by image filename:
+    json_data = json.loads(json_path.read_text(encoding='utf-8'))
+    metadata_by_filename = {}  # Maps from image id/filename to image metadata.
+    for image_metadata in json_data:
+      metadata_by_filename[image_metadata['image']] = image_metadata
+    # Iterate over the images, yield the ones present in metadata_by_filename:
+    for image_path, file in tfds.download.iter_archive(
+        images_path, tfds.download.ExtractMethod.ZIP
+    ):
+      if not image_path.startswith('ai2d/images/'):
+        continue  # Skip archive members outside ai2d/images/.
+      image_id = os.path.basename(image_path)
+      if (metadata := metadata_by_filename.get(image_id)) is None:
+        continue  # Image is not listed in this split's JSON.
+      # Convert [xmin, ymin, xmax, ymax] bounds (divided by 100) to TFDS BBox.
+      entities = list(metadata['entities'].values())
+      for entity in entities:
+        # auditor_llm_training_examples split has nonsensical bounds (max<min).
+        if (
+            bounds := entity.get('bounds')
+        ) and split != 'auditor_llm_training_examples':
+          xmin, ymin, xmax, ymax = [c / 100.0 for c in bounds]
+        else:
+          xmin, ymin, xmax, ymax = 0.0, 0.0, 0.0, 0.0  # Missing or ignored.
+        entity['bounds'] = tfds.features.BBox(ymin, xmin, ymax, xmax)
+        entity.setdefault('label', '')  # Fill in fields absent from the JSON.
+        entity.setdefault('cat', 'unspecified')
+        entity.setdefault('from', '')
+        entity.setdefault('to', '')
+      relationships = metadata.get('relationships', [])
+      # ai2d_caption_test.json has a few relationships expressed as a dict.
+      if isinstance(relationships, dict):
+        relationships = list(relationships.values())
+      yield image_id, {
+          'image_filename': image_id,
+          'image': file,
+          'topic': metadata.get('topic', 'unspecified'),
+          # layout may be an empty string, hence the `or` fallback below.
+          'layout': metadata.get('layout', None) or 'unspecified',
+          'caption': metadata.get('caption', ''),
+          'relationships': relationships,
+          'entities': entities,
+      }
diff --git a/tensorflow_datasets/datasets/ai2dcaption/checksums.tsv b/tensorflow_datasets/datasets/ai2dcaption/checksums.tsv
@@ -0,0 +1,6 @@
+http://ai2-website.s3.amazonaws.com/data/ai2d-all.zip	990965374	1a6b77eebb8b7dbdf76a0ba6ca76c2f97ce8f81d8ee33b06593aa722e54c4786	ai2d-all.zip
+https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_auditor_llm_training_examples.json?download=true	77845	3af7ef3f3c9e48183b78c521541ac7097156dc80421a2ee21935c95d76c6221e	ai2d_caption_auditor_llm_training_examples.json
+https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_gpt4v.json?download=true	29115582	10aac517f432f384dad6552a88e660ef5966ade75d1550581cc0ad3985d20b43	ai2d_caption_gpt4v.json
+https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_llava_15.json?download=true	29925810	962c923b80bc4f62621ce5b3d572d3bd52d2daef3d68af0999a5ef01d8170a9f	ai2d_caption_llava_15.json
+https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_planner_llm_training_examples.json?download=true	73533	43f1dd0e449f7bdf5c426ddefc8cb9f663757b22c77a4c305cba8c49a2c2ea8f	ai2d_caption_planner_llm_training_examples.json
+https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_test.json?download=true	192643	54e279ba96177d78c4e9c4e8311c17272b94dc9c5ce5a1a1c701ad84e3a2db48	ai2d_caption_test.json

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+content.data-type.image # Contains image data.`
	`2`	`+content.language.en # Contains text in language English / en.`
	`3`	`+content.subject.biology # Relates to biology.`