<div itemscope itemtype="http://schema.org/Dataset">
  <div itemscope itemprop="includedInDataCatalog" itemtype="http://schema.org/DataCatalog">
    <meta itemprop="name" content="TensorFlow Datasets" />
  </div>
  <meta itemprop="name" content="covr" />
  <meta itemprop="description" content="[COVR](https://covr-dataset.github.io/) dataset with [imSitu](https://github.com/my89/imSitu) and [Visual Genome](https://homes.cs.washington.edu/~ranjay/visualgenome/index.html) images. To use this dataset: ```python import tensorflow_datasets as tfds ds = tfds.load('covr', split='train') for ex in ds.take(4): print(ex) ``` See [the guide](https://www.tensorflow.org/datasets/overview) for more information on [tensorflow_datasets](https://www.tensorflow.org/datasets)." />
  <meta itemprop="url" content="https://www.tensorflow.org/datasets/catalog/covr" />
  <meta itemprop="sameAs" content="https://covr-dataset.github.io/" />
  <meta itemprop="citation" content="@inproceedings{bogin-etal-2021-covr, title = "{COVR}: A Test-Bed for Visually Grounded Compositional Generalization with Real Images", author = "Bogin, Ben and Gupta, Shivanshu and Gardner, Matt and Berant, Jonathan", editor = "Moens, Marie-Francine and Huang, Xuanjing and Specia, Lucia and Yih, Scott Wen-tau", booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", month = nov, year = "2021", address = "Online and Punta Cana, Dominican Republic", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2021.emnlp-main.774/", doi = "10.18653/v1/2021.emnlp-main.774", pages = "9824--9846", abstract = "While interest in models that generalize at test time to new compositions has risen in recent years, benchmarks in the visually-grounded domain have thus far been restricted to synthetic images. In this work, we propose COVR, a new test-bed for visually-grounded compositional generalization with real images. To create COVR, we use real images annotated with scene graphs, and propose an almost fully automatic procedure for generating question-answer pairs along with a set of context images. COVR focuses on questions that require complex reasoning, including higher-order operations such as quantification and aggregation. Due to the automatic generation process, COVR facilitates the creation of compositional splits, where models at test time need to generalize to new concepts and compositions in a zero- or few-shot setting. We construct compositional splits using COVR and demonstrate a myriad of cases where state-of-the-art pre-trained language-and-vision models struggle to compositionally generalize." } @inproceedings{yatskar2016, title={Situation Recognition: Visual Semantic Role Labeling for Image Understanding}, author={Yatskar, Mark and Zettlemoyer, Luke and Farhadi, Ali}, booktitle={Conference on Computer Vision and Pattern Recognition}, year={2016} } @article{cite-key, abstract = {Despite progress in perceptual tasks such as image classification, computers still perform poorly on cognitive tasks such as image description and question answering. Cognition is core to tasks that involve not just recognizing, but reasoning about our visual world. However, models used to tackle the rich content in images for cognitive tasks are still being trained using the same datasets designed for perceptual tasks. To achieve success at cognitive tasks, models need to understand the interactions and relationships between objects in an image. When asked ``What vehicle is the person riding?'', computers will need to identify the objects in an image as well as the relationships riding(man, carriage) and pulling(horse, carriage) to answer correctly that ``the person is riding a horse-drawn carriage.''In this paper, we present the Visual Genome dataset to enable the modeling of such relationships. We collect dense annotations of objects, attributes, and relationships within each image to learn these models. Specifically, our dataset contains over 108K images where each image has an average of {\$}{\$}35{\$}{\$}objects, {\$}{\$}26{\$}{\$}attributes, and {\$}{\$}21{\$}{\$}pairwise relationships between objects. We canonicalize the objects, attributes, relationships, and noun phrases in region descriptions and questions answer pairs to WordNet synsets. Together, these annotations represent the densest and largest dataset of image descriptions, objects, attributes, relationships, and question answer pairs.}, author = {Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A. and Bernstein, Michael S. and Fei-Fei, Li}, date = {2017/05/01}, date-added = {2025-07-10 08:32:03 -0700}, date-modified = {2025-07-10 08:32:03 -0700}, doi = {10.1007/s11263-016-0981-7}, id = {Krishna2017}, isbn = {1573-1405}, journal = {International Journal of Computer Vision}, number = {1}, pages = {32--73}, title = {Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations}, url = {https://doi.org/10.1007/s11263-016-0981-7}, volume = {123}, year = {2017}, bdsk-url-1 = {https://doi.org/10.1007/s11263-016-0981-7}}" />
</div>

# `covr`

Note: This dataset was added recently and is only available in our
`tfds-nightly` package
<span class="material-icons" title="Available only in the tfds-nightly package">nights_stay</span>.

* **Description**:

[COVR](https://covr-dataset.github.io/) dataset with
[imSitu](https://github.com/my89/imSitu) and
[Visual Genome](https://homes.cs.washington.edu/~ranjay/visualgenome/index.html)
images.

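The dataset is loaded through the standard `tfds.load` entry point. A minimal
sketch, mirroring the snippet in the page metadata (it assumes the
`tfds-nightly` package noted above is installed and the data has been
downloaded and prepared):

```python
import tensorflow_datasets as tfds

# Load the training split and peek at a few examples.
ds = tfds.load('covr', split='train')
for ex in ds.take(4):
  print(ex)
```
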
* **Homepage**:
    [https://covr-dataset.github.io/](https://covr-dataset.github.io/)

* **Source code**:
    [`tfds.datasets.covr.Builder`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/datasets/covr/covr_dataset_builder.py)

* **Versions**:

    * **`1.0.0`** (default): Initial release.

* **Download size**: `48.35 GiB`

* **Dataset size**: `173.96 GiB`

* **Auto-cached**
    ([documentation](https://www.tensorflow.org/datasets/performances#auto-caching)):
    No

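Because the prepared dataset is large (roughly 174 GiB) and is not
auto-cached, it is worth pointing TFDS at a drive with enough free space and
relying on prefetching rather than in-memory caching. A rough sketch; the
`data_dir` path below is only a placeholder for this example:

```python
import tensorflow as tf
import tensorflow_datasets as tfds

# Placeholder path: put the prepared data wherever there is ~174 GiB free.
ds = tfds.load('covr', split='train', data_dir='/data/tensorflow_datasets')
# Prefetch instead of caching; the dataset is far too large for memory.
ds = ds.prefetch(tf.data.AUTOTUNE)
```
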
* **Splits**:

Split          | Examples
:------------- | -------:
`'test'`       | 7,024
`'train'`      | 248,154
`'validation'` | 6,891

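Individual splits, or slices of them, can be requested with the usual TFDS
split syntax. A small sketch that cross-checks the table above via
`DatasetInfo`:

```python
import tensorflow_datasets as tfds

# Load the validation split and confirm its size against the table above.
ds, info = tfds.load('covr', split='validation', with_info=True)
print(info.splits['validation'].num_examples)  # expected: 6891

# Slicing also works, e.g. a small chunk of train for quick experiments.
train_head = tfds.load('covr', split='train[:1000]')
```
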
* **Feature structure**:

```python
FeaturesDict({
    'images': Sequence(Image(shape=(None, None, 3), dtype=uint8)),
    'label': Text(shape=(), dtype=string),
    'pattern_name': Text(shape=(), dtype=string),
    'program': Text(shape=(), dtype=string),
    'properties': Sequence(Text(shape=(), dtype=string)),
    'scenes': Sequence(Text(shape=(), dtype=string)),
    'utterance': Text(shape=(), dtype=string),
})
```

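At runtime each example is a dictionary keyed by the feature names above. A
rough sketch of inspecting one example; it assumes the scalar `Text` features
decode as UTF-8, while the exact container type of the variable-size image
sequence is left unexamined:

```python
import tensorflow_datasets as tfds

ds = tfds.load('covr', split='validation')
for ex in ds.take(1):
  # Scalar Text features come back as byte-string tensors.
  print('utterance:', ex['utterance'].numpy().decode('utf-8'))
  print('label:', ex['label'].numpy().decode('utf-8'))
  print('pattern_name:', ex['pattern_name'].numpy().decode('utf-8'))
  # 'images' is a variable-length sequence of variable-size RGB images, so it
  # may be a ragged rather than a dense tensor; inspect before assuming a shape.
  print('images container:', type(ex['images']))
```
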
* **Feature documentation**:

Feature      | Class           | Shape                 | Dtype  | Description
:----------- | :-------------- | :-------------------- | :----- | :----------
             | FeaturesDict    |                       |        |
images       | Sequence(Image) | (None, None, None, 3) | uint8  |
label        | Text            |                       | string |
pattern_name | Text            |                       | string |
program      | Text            |                       | string |
properties   | Sequence(Text)  | (None,)               | string |
scenes       | Sequence(Text)  | (None,)               | string |
utterance    | Text            |                       | string |

* **Supervised keys** (See
    [`as_supervised` doc](https://www.tensorflow.org/datasets/api_docs/python/tfds/load#args)):
    `None`

* **Figure**
    ([tfds.show_examples](https://www.tensorflow.org/datasets/api_docs/python/tfds/visualization/show_examples)):
    Not supported.

* **Examples**
    ([tfds.as_dataframe](https://www.tensorflow.org/datasets/api_docs/python/tfds/as_dataframe)):
    Missing.

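Although a pre-rendered example table is missing here, a small preview can be
built locally with `tfds.as_dataframe`. A sketch under the assumption that
skipping image decoding (via `tfds.decode.SkipDecoding`) keeps the preview
lightweight:

```python
import tensorflow_datasets as tfds

# Skip image decoding so only the text features are materialised.
ds, info = tfds.load(
    'covr',
    split='validation',
    decoders={'images': tfds.decode.SkipDecoding()},
    with_info=True,
)
df = tfds.as_dataframe(ds.take(3), info)
print(df[['utterance', 'label', 'pattern_name']])
```
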
* **Citation**:

```
@inproceedings{bogin-etal-2021-covr,
    title = "{COVR}: A Test-Bed for Visually Grounded Compositional Generalization with Real Images",
    author = "Bogin, Ben and
      Gupta, Shivanshu and
      Gardner, Matt and
      Berant, Jonathan",
    editor = "Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau",
    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2021",
    address = "Online and Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.emnlp-main.774/",
    doi = "10.18653/v1/2021.emnlp-main.774",
    pages = "9824--9846",
    abstract = "While interest in models that generalize at test time to new compositions has risen in recent years, benchmarks in the visually-grounded domain have thus far been restricted to synthetic images. In this work, we propose COVR, a new test-bed for visually-grounded compositional generalization with real images. To create COVR, we use real images annotated with scene graphs, and propose an almost fully automatic procedure for generating question-answer pairs along with a set of context images. COVR focuses on questions that require complex reasoning, including higher-order operations such as quantification and aggregation. Due to the automatic generation process, COVR facilitates the creation of compositional splits, where models at test time need to generalize to new concepts and compositions in a zero- or few-shot setting. We construct compositional splits using COVR and demonstrate a myriad of cases where state-of-the-art pre-trained language-and-vision models struggle to compositionally generalize."
}

@inproceedings{yatskar2016,
    title={Situation Recognition: Visual Semantic Role Labeling for Image Understanding},
    author={Yatskar, Mark and Zettlemoyer, Luke and Farhadi, Ali},
    booktitle={Conference on Computer Vision and Pattern Recognition},
    year={2016}
}

@article{cite-key,
    abstract = {Despite progress in perceptual tasks such as image classification, computers still perform poorly on cognitive tasks such as image description and question answering. Cognition is core to tasks that involve not just recognizing, but reasoning about our visual world. However, models used to tackle the rich content in images for cognitive tasks are still being trained using the same datasets designed for perceptual tasks. To achieve success at cognitive tasks, models need to understand the interactions and relationships between objects in an image. When asked ``What vehicle is the person riding?'', computers will need to identify the objects in an image as well as the relationships riding(man, carriage) and pulling(horse, carriage) to answer correctly that ``the person is riding a horse-drawn carriage.''In this paper, we present the Visual Genome dataset to enable the modeling of such relationships. We collect dense annotations of objects, attributes, and relationships within each image to learn these models. Specifically, our dataset contains over 108K images where each image has an average of {\$}{\$}35{\$}{\$}objects, {\$}{\$}26{\$}{\$}attributes, and {\$}{\$}21{\$}{\$}pairwise relationships between objects. We canonicalize the objects, attributes, relationships, and noun phrases in region descriptions and questions answer pairs to WordNet synsets. Together, these annotations represent the densest and largest dataset of image descriptions, objects, attributes, relationships, and question answer pairs.},
    author = {Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A. and Bernstein, Michael S. and Fei-Fei, Li},
    date = {2017/05/01},
    date-added = {2025-07-10 08:32:03 -0700},
    date-modified = {2025-07-10 08:32:03 -0700},
    doi = {10.1007/s11263-016-0981-7},
    id = {Krishna2017},
    isbn = {1573-1405},
    journal = {International Journal of Computer Vision},
    number = {1},
    pages = {32--73},
    title = {Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations},
    url = {https://doi.org/10.1007/s11263-016-0981-7},
    volume = {123},
    year = {2017},
    bdsk-url-1 = {https://doi.org/10.1007/s11263-016-0981-7}}
```