Skip to content

Commit bc48d05

Browse files
pierrot0 authored and The TensorFlow Datasets Authors committed
add Builder for AI2D-Caption dataset.
PiperOrigin-RevId: 685722963
1 parent 1b317d2 commit bc48d05

File tree

6 files changed

+205
-0
lines changed

6 files changed

+205
-0
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
@inproceedings{Zala2024DiagrammerGPT,
2+
author = {Abhay Zala and Han Lin and Jaemin Cho and Mohit Bansal},
3+
title = {DiagrammerGPT: Generating Open-Domain, Open-Platform Diagrams via LLM Planning},
4+
year = {2024},
5+
booktitle = {COLM},
6+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
This dataset is primarily based on the AI2D Dataset (see [here](
2+
https://prior.allenai.org/projects/diagram-understanding)).
3+
4+
See [Section 4.1](https://arxiv.org/pdf/2310.12128) of the DiagrammerGPT paper for
5+
the AI2D-Caption dataset annotation process.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
content.data-type.image # Contains image data.
2+
content.language.en # Contains text in language English / en.
3+
content.subject.biology # Relates to biology.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""AI2DCaption dataset."""
17+
18+
import json
19+
import os.path
20+
21+
import tensorflow_datasets.public_api as tfds
22+
23+
# Class names for the `layout` feature. 'unspecified' is the fallback used
# when an annotation has no (or an empty) layout.
LAYOUT_NAMES = [
    'abstract', 'circular', 'columns', 'linear', 'rows', 'tree', 'unspecified',
]

# Class names for the `topic` feature.
TOPIC_NAMES = [
    'astronomy',
    'biology',
    'engineering',
    'unspecified',
]

# Class names for the per-entity `type` feature.
TYPE_NAMES = ['arrow', 'image', 'object', 'relationship', 'text']

# Class names for the per-entity `cat` feature. 'unspecified' is the fallback
# used for entities that carry no category.
CATEGORIES = [
    'imageCaption', 'imageTitle',
    'interObjectLinkage',
    'intraObjectLabel', 'intraObjectLinkage',
    'intraObjectRegionLabel', 'intraObjectTextLinkage',
    'misc',
    'sectionTitle',
    'unspecified',
]

# Names of the generated splits; each one is substituted into JSON_URL_TMPL to
# locate its annotation file.
SPLITS = [
    'auditor_llm_training_examples', 'gpt4v', 'llava_15',
    'planner_llm_training_examples', 'test',
]

# URL template of the per-split annotation JSON; `{split}` is a SPLITS entry.
JSON_URL_TMPL = (
    'https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/'
    'ai2d_caption_{split}.json?download=true'
)

# Zip archive that the diagram images are read from.
IMAGES_URL = 'http://ai2-website.s3.amazonaws.com/data/ai2d-all.zip'
67+
68+
69+
class Builder(tfds.core.GeneratorBasedBuilder):
  """TFDS builder for the AI2D-Caption dataset.

  One split is generated per entry of `SPLITS`. Annotations are read from the
  per-split JSON file (`JSON_URL_TMPL`) and the image bytes from the zip
  archive at `IMAGES_URL`.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'Initial release.'}

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata (feature spec and homepage)."""
    image_feature = tfds.features.Image(
        shape=(None, None, 3),
        doc=tfds.features.Documentation(desc='The image of the diagram.'),
    )
    filename_feature = tfds.features.Text(
        doc=tfds.features.Documentation(
            desc='Image filename. e.g. "1337.png"'
        ),
    )
    # Schema of a single annotated element of the diagram.
    entity_features = tfds.features.FeaturesDict({
        'id': tfds.features.Text(),
        'type': tfds.features.ClassLabel(names=TYPE_NAMES),
        'label': tfds.features.Text(),
        'bounds': tfds.features.BBoxFeature(),
        # The three keys below are not always present in the annotations;
        # _generate_examples fills in defaults for them.
        'cat': tfds.features.ClassLabel(names=CATEGORIES),
        'from': tfds.features.Text(),
        'to': tfds.features.Text(),
    })
    example_features = tfds.features.FeaturesDict({
        'image': image_feature,
        'image_filename': filename_feature,
        'topic': tfds.features.ClassLabel(names=TOPIC_NAMES),
        'layout': tfds.features.ClassLabel(names=LAYOUT_NAMES),
        'caption': tfds.features.Text(),
        'relationships': tfds.features.Sequence(tfds.features.Text()),
        'entities': tfds.features.Sequence(entity_features),
    })
    return self.dataset_info_from_configs(
        features=example_features,
        supervised_keys=None,
        homepage='https://huggingface.co/datasets/abhayzala/AI2D-Caption',
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Downloads the sources and returns one example generator per split.

    Args:
      dl_manager: download manager used to fetch the annotation JSON files and
        the images archive.

    Returns:
      A dict mapping each name in `SPLITS` to its example generator.
    """
    urls = {name: JSON_URL_TMPL.format(split=name) for name in SPLITS}
    urls['images'] = IMAGES_URL
    downloaded = dl_manager.download(urls)

    # The same images archive backs every split.
    archive_path = downloaded['images']
    generators = {}
    for name in SPLITS:
      generators[name] = self._generate_examples(
          name, downloaded[name], archive_path
      )
    return generators

  def _generate_examples(self, split, json_path, images_path):
    """Yields `(key, example)` pairs for one split.

    Args:
      split: name of the split being generated.
      json_path: path object of the split's annotation JSON file.
      images_path: path of the zip archive containing the images.

    Yields:
      Tuples of the image filename (used as key) and the example dict.
    """
    # Index the annotations by image filename.
    records = json.loads(json_path.read_text(encoding='utf-8'))
    annotations = {record['image']: record for record in records}

    # Defaults for entity keys that are not always specified, in the order
    # they are applied.
    optional_entity_defaults = (
        ('label', ''),
        ('cat', 'unspecified'),
        ('from', ''),
        ('to', ''),
    )
    # The auditor_llm_training_examples split has nonsensical bounds
    # (max < min), so its bounds are never used.
    ignore_bounds = split == 'auditor_llm_training_examples'

    # Walk the archive and keep only the images that this split annotates.
    archive = tfds.download.iter_archive(
        images_path, tfds.download.ExtractMethod.ZIP
    )
    for member_path, image_file in archive:
      if not member_path.startswith('ai2d/images/'):
        continue
      filename = os.path.basename(member_path)
      record = annotations.get(filename)
      if record is None:
        continue

      entities = list(record['entities'].values())
      for entity in entities:
        raw_bounds = entity.get('bounds')
        if not raw_bounds or ignore_bounds:
          left = top = right = bottom = 0.0
        else:
          # Source order is (xmin, ymin, xmax, ymax); values are divided by
          # 100 before being handed to BBox.
          left, top, right, bottom = [
              coord / 100.0 for coord in raw_bounds
          ]
        # BBox takes its arguments as (ymin, xmin, ymax, xmax).
        entity['bounds'] = tfds.features.BBox(top, left, bottom, right)
        for key, default in optional_entity_defaults:
          entity.setdefault(key, default)

      relationships = record.get('relationships', [])
      # ai2d_caption_test.json expresses a few relationships as a dict; only
      # its values are kept.
      if isinstance(relationships, dict):
        relationships = list(relationships.values())

      example = {
          'image_filename': filename,
          'image': image_file,
          'topic': record.get('topic', 'unspecified'),
          # The layout may be an empty string, hence `or` rather than a
          # `.get` default.
          'layout': record.get('layout', None) or 'unspecified',
          'caption': record.get('caption', ''),
          'relationships': relationships,
          'entities': entities,
      }
      yield filename, example
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
http://ai2-website.s3.amazonaws.com/data/ai2d-all.zip 990965374 1a6b77eebb8b7dbdf76a0ba6ca76c2f97ce8f81d8ee33b06593aa722e54c4786 ai2d-all.zip
2+
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_auditor_llm_training_examples.json?download=true 77845 3af7ef3f3c9e48183b78c521541ac7097156dc80421a2ee21935c95d76c6221e ai2d_caption_auditor_llm_training_examples.json
3+
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_gpt4v.json?download=true 29115582 10aac517f432f384dad6552a88e660ef5966ade75d1550581cc0ad3985d20b43 ai2d_caption_gpt4v.json
4+
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_llava_15.json?download=true 29925810 962c923b80bc4f62621ce5b3d572d3bd52d2daef3d68af0999a5ef01d8170a9f ai2d_caption_llava_15.json
5+
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_planner_llm_training_examples.json?download=true 73533 43f1dd0e449f7bdf5c426ddefc8cb9f663757b22c77a4c305cba8c49a2c2ea8f ai2d_caption_planner_llm_training_examples.json
6+
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_test.json?download=true 192643 54e279ba96177d78c4e9c4e8311c17272b94dc9c5ce5a1a1c701ad84e3a2db48 ai2d_caption_test.json

0 commit comments

Comments
 (0)