tensorflow
diff --git a/‎tensorflow_datasets/datasets/covr/CITATIONS.bib‎
Lines changed: 46 additions & 0 deletions b/‎tensorflow_datasets/datasets/covr/CITATIONS.bib‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/README.md‎
Lines changed: 1 addition & 0 deletions b/‎tensorflow_datasets/datasets/covr/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/TAGS.txt‎
Lines changed: 8 additions & 0 deletions b/‎tensorflow_datasets/datasets/covr/TAGS.txt‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/__init__.py‎
Lines changed: 15 additions & 0 deletions b/‎tensorflow_datasets/datasets/covr/__init__.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/checksums.tsv‎
Lines changed: 4 additions & 0 deletions b/‎tensorflow_datasets/datasets/covr/checksums.tsv‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/covr_dataset_builder.py‎
Lines changed: 114 additions & 0 deletions b/‎tensorflow_datasets/datasets/covr/covr_dataset_builder.py‎
Lines changed: 114 additions & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/covr_dataset_builder_test.py‎
Lines changed: 41 additions & 0 deletions b/‎tensorflow_datasets/datasets/covr/covr_dataset_builder_test.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/dummy_data/covr/test.jsonl‎
Lines changed: 1 addition & 0 deletions b/‎tensorflow_datasets/datasets/covr/dummy_data/covr/test.jsonl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/dummy_data/covr/train.jsonl‎
Lines changed: 1 addition & 0 deletions b/‎tensorflow_datasets/datasets/covr/dummy_data/covr/train.jsonl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tensorflow_datasets/datasets/covr/dummy_data/covr/val.jsonl‎
Lines changed: 1 addition & 0 deletions b/‎tensorflow_datasets/datasets/covr/dummy_data/covr/val.jsonl‎
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,46 @@
+% Citation for the COVR paper (EMNLP 2021 proceedings).
+@inproceedings{bogin-etal-2021-covr,
+    title     = {{COVR}: A Test-Bed for Visually Grounded Compositional Generalization with Real Images},
+    author    = {Bogin, Ben and Gupta, Shivanshu and Gardner, Matt and Berant, Jonathan},
+    editor    = {Moens, Marie-Francine and Huang, Xuanjing and Specia, Lucia and Yih, Scott Wen-tau},
+    booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
+    month     = nov,
+    year      = {2021},
+    address   = {Online and Punta Cana, Dominican Republic},
+    publisher = {Association for Computational Linguistics},
+    url       = {https://aclanthology.org/2021.emnlp-main.774/},
+    doi       = {10.18653/v1/2021.emnlp-main.774},
+    pages     = {9824--9846},
+    abstract  = {While interest in models that generalize at test time to new compositions has risen in recent years, benchmarks in the visually-grounded domain have thus far been restricted to synthetic images. In this work, we propose COVR, a new test-bed for visually-grounded compositional generalization with real images. To create COVR, we use real images annotated with scene graphs, and propose an almost fully automatic procedure for generating question-answer pairs along with a set of context images. COVR focuses on questions that require complex reasoning, including higher-order operations such as quantification and aggregation. Due to the automatic generation process, COVR facilitates the creation of compositional splits, where models at test time need to generalize to new concepts and compositions in a zero- or few-shot setting. We construct compositional splits using COVR and demonstrate a myriad of cases where state-of-the-art pre-trained language-and-vision models struggle to compositionally generalize.},
+}
+
+% Situation recognition paper (Yatskar et al., 2016).
+@inproceedings{yatskar2016,
+  author    = {Yatskar, Mark and Zettlemoyer, Luke and Farhadi, Ali},
+  title     = {Situation Recognition: Visual Semantic Role Labeling for Image Understanding},
+  booktitle = {Conference on Computer Vision and Pattern Recognition},
+  year      = {2016},
+}
+
+% Visual Genome dataset paper. The entry was originally exported under the
+% reference-manager placeholder key "cite-key"; that key is kept as an `ids`
+% alias (honoured by biblatex/Biber, ignored by classic BibTeX).
+@article{krishna2017visualgenome,
+  author   = {Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A. and Bernstein, Michael S. and Fei-Fei, Li},
+  title    = {{Visual Genome}: Connecting Language and Vision Using Crowdsourced Dense Image Annotations},
+  journal  = {International Journal of Computer Vision},
+  volume   = {123},
+  number   = {1},
+  pages    = {32--73},
+  month    = may,
+  year     = {2017},
+  issn     = {1573-1405},
+  doi      = {10.1007/s11263-016-0981-7},
+  url      = {https://doi.org/10.1007/s11263-016-0981-7},
+  ids      = {cite-key},
+  abstract = {Despite progress in perceptual tasks such as image classification, computers still perform poorly on cognitive tasks such as image description and question answering. Cognition is core to tasks that involve not just recognizing, but reasoning about our visual world. However, models used to tackle the rich content in images for cognitive tasks are still being trained using the same datasets designed for perceptual tasks. To achieve success at cognitive tasks, models need to understand the interactions and relationships between objects in an image. When asked ``What vehicle is the person riding?'', computers will need to identify the objects in an image as well as the relationships riding(man, carriage) and pulling(horse, carriage) to answer correctly that ``the person is riding a horse-drawn carriage.'' In this paper, we present the Visual Genome dataset to enable the modeling of such relationships. We collect dense annotations of objects, attributes, and relationships within each image to learn these models. Specifically, our dataset contains over 108K images where each image has an average of 35 objects, 26 attributes, and 21 pairwise relationships between objects. We canonicalize the objects, attributes, relationships, and noun phrases in region descriptions and questions answer pairs to WordNet synsets. Together, these annotations represent the densest and largest dataset of image descriptions, objects, attributes, relationships, and question answer pairs.},
+}
+
@@ -0,0 +1 @@
+[COVR](https://covr-dataset.github.io/) dataset with [imSitu](https://github.com/my89/imSitu) and [Visual Genome](https://homes.cs.washington.edu/~ranjay/visualgenome/index.html) images.
@@ -0,0 +1,8 @@
+content.data-type.image # Contains image data.
+content.data-type.text # Contains text data.
+content.language.en # Contains text in language English / en.
+content.monolingual # Contains text in 1 natural language.
+ml.task.common-sense-reasoning # Relates to Common Sense Reasoning, a machine learning task.
+ml.task.natural-language-inference # Relates to Natural Language Inference, a machine learning task.
+ml.task.natural-language-understanding # Relates to Natural Language Understanding, a machine learning task.
+ml.task.object-detection # Relates to Object Detection, a machine learning task.
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2025 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
@@ -0,0 +1,4 @@
+https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip	9731705982	51c682d2721f880150720bb416e0346a4c787e4c55d7f80dfd1bd3f73ba81646	images.zip
+https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip	5471658058	99da1a0ddf87011319ff3b05cf9176ffee2731cc3c52951162d9ef0d68e3cfb5	images2.zip
+https://drive.google.com/uc?export=download&id=10xlQ6isRdGX94BypoqN6klniGeqdLBJA	21964401	83443ffd6493cdc807aaab8c559a38ad757d47e40d4d6f27b8c65efd4d889091	covr_v1_0.zip
+https://s3.amazonaws.com/my89-frame-annotation/public/of500_images.tar	36690524160	94dee93095d0325fb9aef1e8d956b6be297ab13bf2e62d6027fd5dcc782e8f61	of500_images.tar
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2025 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""covr dataset."""
+
+import json
+
+from etils import epath
+import tensorflow_datasets.public_api as tfds
+
+
+class Builder(tfds.core.GeneratorBasedBuilder):
+  """DatasetBuilder for covr dataset."""
+
+  VERSION = tfds.core.Version('1.0.0')
+  RELEASE_NOTES = {
+      '1.0.0': 'Initial release.',
+  }
+
+  def _info(self) -> tfds.core.DatasetInfo:
+    """Returns the dataset metadata."""
+    return self.dataset_info_from_configs(
+        features=tfds.features.FeaturesDict({
+            'utterance': tfds.features.Text(),
+            'scenes': tfds.features.Sequence(
+                feature=tfds.features.Text(),
+            ),
+            'properties': tfds.features.Sequence(
+                feature=tfds.features.Text(),
+            ),
+            'pattern_name': tfds.features.Text(),
+            'program': tfds.features.Text(),
+            'label': tfds.features.Text(),
+            'images': tfds.features.Sequence(
+                feature=tfds.features.Image(),
+            ),
+        }),
+        supervised_keys=None,
+        homepage='https://covr-dataset.github.io/',
+    )
+
+  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+    """Returns SplitGenerators."""
+    extracted_dirs = dl_manager.download_and_extract({
+        'covr_dir': (
+            'https://drive.google.com/uc?export=download&'
+            'id=10xlQ6isRdGX94BypoqN6klniGeqdLBJA'
+        ),
+        'imsitu_dir': (
+            'https://s3.amazonaws.com/my89-frame-annotation'
+            '/public/of500_images.tar'
+        ),
+        'vg1_dir': 'https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip',
+        'vg2_dir': (
+            'https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip'
+        ),
+    })
+
+    # Each name is the image file name without the ".jpg" extension, which is
+    # also used as the scene id in COVR.
+    image_path_by_scene_id: dict[str, epath.Path] = {}
+    image_globs = [
+        extracted_dirs['vg1_dir'].glob('*/*.jpg'),
+        extracted_dirs['vg2_dir'].glob('*/*.jpg'),
+        extracted_dirs['imsitu_dir'].glob('of500_images/*/*.jpg'),
+    ]
+    for image_glob in image_globs:
+      for image_path in image_glob:
+        name = image_path.stem
+        image_path_by_scene_id[name] = image_path
+    path = extracted_dirs['covr_dir']
+    return {
+        'train': self._generate_examples(
+            path / 'train.jsonl', image_path_by_scene_id
+        ),
+        'test': self._generate_examples(
+            path / 'test.jsonl', image_path_by_scene_id
+        ),
+        'validation': self._generate_examples(
+            path / 'val.jsonl', image_path_by_scene_id
+        ),
+    }
+
+  def _generate_examples(
+      self, path: epath.Path, image_path_by_scene_id: dict[str, epath.Path]
+  ):
+    """Yields examples."""
+    with path.open() as f:
+      for line in f:
+        item = json.loads(line)
+        images = [
+            image_path_by_scene_id[scene_id] for scene_id in item['scenes']
+        ]
+        yield item['qid'], {
+            'utterance': item['utterance'],
+            'scenes': item['scenes'],
+            'properties': item['properties'],
+            'pattern_name': item['pattern_name'],
+            'program': str(item['program']),
+            'label': str(item.get('answer')),
+            'images': images,
+        }
@@ -0,0 +1,41 @@
+# coding=utf-8
+# Copyright 2025 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""covr dataset."""
+
+from tensorflow_datasets.datasets.covr import covr_dataset_builder
+import tensorflow_datasets.public_api as tfds
+
+
+class CovrTest(tfds.testing.DatasetBuilderTestCase):
+  """Tests for covr dataset."""
+
+  DATASET_CLASS = covr_dataset_builder.Builder
+  SPLITS = {
+      'train': 1,
+      'test': 1,
+      'validation': 1,
+  }
+
+  DL_EXTRACT_RESULT = {
+      'covr_dir': 'covr',
+      'imsitu_dir': 'imsitu',
+      'vg1_dir': 'vg1',
+      'vg2_dir': 'vg2',
+  }
+
+
+# Run the test suite when this file is executed directly.
+if __name__ == '__main__':
+  tfds.testing.test_main()
@@ -0,0 +1 @@
+{"qid": "val_test_299485", "utterance": "There are more people that are wearing hat than cats that are wearing hat", "scenes": ["3", "4", "5", "6", "7"], "properties": ["has_compare", "has_compare_more", "lexical_3", "program_3", "program_2", "has_count"], "pattern_name": "compare_count", "program": [{"operation": "find", "arguments": ["hat"]}, {"operation": "find", "arguments": ["person"]}, {"operation": "with_relation", "arguments": ["wearing"], "dependencies": [1, 0]}, {"operation": "count", "dependencies": [2]}, {"operation": "find", "arguments": ["hat"]}, {"operation": "find", "arguments": ["cat"]}, {"operation": "with_relation", "arguments": ["wearing"], "dependencies": [5, 4]}, {"operation": "count", "dependencies": [6]}, {"operation": "gt", "dependencies": [3, 7]}]}
@@ -0,0 +1 @@
+{"qid": "train_648732", "utterance": "Is the tree that is next to a fence narrow or large?", "scenes": ["1", "2", "3", "4", "5"], "properties": ["program_2"], "pattern_name": "choose_attr", "program": [{"operation": "find", "arguments": ["fence"]}, {"operation": "find", "arguments": ["tree"]}, {"operation": "with_relation", "arguments": ["next to"], "dependencies": [1, 0]}, {"operation": "unique", "dependencies": [2]}, {"operation": "choose_attr", "dependencies": [3], "arguments": ["narrow", "large"]}], "answer": "narrow"}
@@ -0,0 +1 @@
+{"qid": "val_test_747468", "utterance": "Do all phones that are on a table have the same color?", "scenes": ["7", "8"], "properties": ["program_1", "has_same_attribute_color", "has_quantifier", "has_complex_quantifier_scope", "has_quantifier_all", "tpl_verify_quantifier_attribute"], "pattern_name": "quantifier_same_attr", "program": [{"operation": "find", "arguments": ["table"]}, {"operation": "find", "arguments": ["phone"]}, {"operation": "with_relation", "arguments": ["on"], "dependencies": [1, 0]}, {"operation": "all_same", "dependencies": [2], "arguments": ["color"]}], "answer": true}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+[COVR](https://covr-dataset.github.io/) dataset with [imSitu](https://github.com/my89/imSitu) and [Visual Genome](https://homes.cs.washington.edu/~ranjay/visualgenome/index.html) images.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"qid": "val_test_299485", "utterance": "There are more people that are wearing hat than cats that are wearing hat", "scenes": ["3", "4", "5", "6", "7"], "properties": ["has_compare", "has_compare_more", "lexical_3", "program_3", "program_2", "has_count"], "pattern_name": "compare_count", "program": [{"operation": "find", "arguments": ["hat"]}, {"operation": "find", "arguments": ["person"]}, {"operation": "with_relation", "arguments": ["wearing"], "dependencies": [1, 0]}, {"operation": "count", "dependencies": [2]}, {"operation": "find", "arguments": ["hat"]}, {"operation": "find", "arguments": ["cat"]}, {"operation": "with_relation", "arguments": ["wearing"], "dependencies": [5, 4]}, {"operation": "count", "dependencies": [6]}, {"operation": "gt", "dependencies": [3, 7]}]}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"qid": "train_648732", "utterance": "Is the tree that is next to a fence narrow or large?", "scenes": ["1", "2", "3", "4", "5"], "properties": ["program_2"], "pattern_name": "choose_attr", "program": [{"operation": "find", "arguments": ["fence"]}, {"operation": "find", "arguments": ["tree"]}, {"operation": "with_relation", "arguments": ["next to"], "dependencies": [1, 0]}, {"operation": "unique", "dependencies": [2]}, {"operation": "choose_attr", "dependencies": [3], "arguments": ["narrow", "large"]}], "answer": "narrow"}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"qid": "val_test_747468", "utterance": "Do all phones that are on a table have the same color?", "scenes": ["7", "8"], "properties": ["program_1", "has_same_attribute_color", "has_quantifier", "has_complex_quantifier_scope", "has_quantifier_all", "tpl_verify_quantifier_attribute"], "pattern_name": "quantifier_same_attr", "program": [{"operation": "find", "arguments": ["table"]}, {"operation": "find", "arguments": ["phone"]}, {"operation": "with_relation", "arguments": ["on"], "dependencies": [1, 0]}, {"operation": "all_same", "dependencies": [2], "arguments": ["color"]}], "answer": true}