<!DOCTYPE html>
<html data-theme="light">
  <head>
    <meta charset="utf-8" />
    <meta
      name="description"
      content="Vision foundation models (VFMs) trained on large-scale image datasets provide high-quality features that have significantly advanced 2D visual recognition. However, their potential in 3D vision remains largely untapped, despite the common availability of 2D images alongside 3D point cloud datasets. While significant research has been dedicated to 2D-3D fusion, recent state-of-the-art 3D methods predominantly focus on 3D data, leaving the integration of VFMs into 3D models underexplored. In this work, we challenge this trend by introducing DITR, a simple yet effective approach that extracts 2D foundation model features, projects them to 3D, and finally injects them into a 3D point cloud segmentation model. DITR achieves state-of-the-art results on both indoor and outdoor 3D semantic segmentation benchmarks. To enable the use of VFMs even when images are unavailable during inference, we further propose to distill 2D foundation models into a 3D backbone as a pretraining task. By initializing the 3D backbone with knowledge distilled from 2D VFMs, we create a strong basis for downstream 3D segmentation tasks, ultimately boosting performance across various datasets."
    />
    <meta
      name="keywords"
      content="DINOv2, PTv3, 3D Computer Vision, Semantic Segmentation, ScanNet, ScanNet++, S3DIS, nuScenes, SemanticKITTI, Waymo"
    />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>
      DINO in the Room: Leveraging 2D Foundation Models for 3D Segmentation
    </title>

    <link
      href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
      rel="stylesheet"
    />

    <link rel="stylesheet" href="./static/css/bulma.min.css" />
    <link rel="stylesheet" href="./static/css/fontawesome.all.min.css" />
    <link
      rel="stylesheet"
      href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"
    />
    <link rel="stylesheet" href="./static/css/index.css" />
    <link rel="icon" href="./static/images/favicon.svg" />

    <script defer src="./static/js/fontawesome.all.min.js"></script>

    <!-- MathJax, assumed to be needed for the inline LaTeX in the method figure caption. -->
    <script>
      MathJax = { tex: { inlineMath: [["$", "$"], ["\\(", "\\)"]] } };
    </script>
    <script
      defer
      src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"
    ></script>
  </head>
  <body>
    <section class="hero">
      <div class="hero-body">
        <div class="container is-max-desktop">
          <div class="columns is-centered">
            <div class="column has-text-centered">
              <h1 class="title is-1 publication-title">
                DINO in the Room: Leveraging 2D Foundation Models for 3D
                Segmentation
              </h1>
              <!-- <div class="title is-4 publication-title">
                VENUE</span>
              </div> -->
              <div class="is-size-5 publication-authors">
                <span class="author-block">
                  <a href="https://ka.codes">Karim Abou Zeid</a
                  ><sup>1</sup></span
                >
                <span class="author-block">
                  <a href="https://github.com/YilmazKadir/">Kadir Yilmaz</a
                  ><sup>1</sup>
                </span>
                <span class="author-block">
                  <a href="https://daandegeus.com">Daan de Geus</a
                  ><sup>1,2</sup>
                </span>
                <span class="author-block">
                  <a
                    href="https://scholar.google.com/citations?user=V0iMeYsAAAAJ"
                    >Alexander Hermans</a
                  ><sup>1</sup>
                </span>
                <span class="author-block">
                  <a
                    href="https://scholar.google.com/citations?user=vpn6QN0AAAAJ"
                    >David Adrian</a
                  ><sup>3</sup>
                </span>
                <span class="author-block">
                  <a
                    href="https://scholar.google.com/citations?user=s3_VpQYAAAAJ"
                    >Timm Linder</a
                  ><sup>3</sup>
                </span>
                <span class="author-block">
                  <a
                    href="https://scholar.google.com/citations?user=ZcULDB0AAAAJ"
                    >Bastian Leibe</a
                  ><sup>1</sup>
                </span>
              </div>

              <div class="is-size-5 publication-authors">
                <span class="author-block"
                  ><sup>1</sup>RWTH Aachen University</span
                >
                <span class="author-block"
                  ><sup>2</sup>Eindhoven University of Technology</span
                >
                <span class="author-block"
                  ><sup>3</sup>Bosch Center for AI</span
                >
              </div>

              <div class="column has-text-centered">
                <div class="publication-links">
                  <!-- arXiv PDF. -->
                  <span class="link-block">
                    <a
                      href="https://arxiv.org/pdf/xxxx"
                      class="external-link button is-normal is-rounded is-dark"
                    >
                      <span class="icon">
                        <i class="fas fa-file-pdf"></i>
                      </span>
                      <span>Paper</span>
                    </a>
                  </span>

                  <!-- arXiv abstract. -->
                  <span class="link-block">
                    <a
                      href="https://arxiv.org/abs/xxxx"
                      class="external-link button is-normal is-rounded is-dark"
                    >
                      <span class="icon">
                        <i class="ai ai-arxiv"></i>
                      </span>
                      <span>arXiv</span>
                    </a>
                  </span>

                  <!-- GitHub. -->
                  <span class="link-block">
                    <a
                      href="https://github.com/VisualComputingInstitute/DITR"
                      class="external-link button is-normal is-rounded is-dark"
                    >
                      <span class="icon">
                        <i class="fab fa-github"></i>
                      </span>
                      <span>Code (soon)</span>
                    </a>
                  </span>

                  <!-- Hugging Face Space. -->
                  <span class="link-block">
                    <a
                      href="https://huggingface.co/xxxx"
                      class="external-link button is-normal is-rounded is-dark"
                    >
                      <span class="icon">
                        <img
                          src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
                          alt="Hugging Face Logo"
                          style="
                            max-height: 24px;
                            max-width: 24px;
                            vertical-align: middle;
                            background-color: transparent;
                          "
                        />
                      </span>
                      <span>Weights (soon)</span>
                    </a>
                  </span>
                </div>
              </div>

              <div class="image-container has-text-centered">
                <img
                  src="static/img/method.webp"
                  alt="DITR architecture overview"
                  style="max-width: 100%; height: auto"
                />
                <figcaption
                  style="font-size: 0.9em; margin-top: 10px; text-align: left"
                >
                  <strong>DITR architecture overview.</strong>
                  We extract 2D image features from a frozen DINOv2 model (blue)
                  and unproject them (2D-to-3D) onto the 3D point cloud. The
                  unprojected features are subsequently max-pooled to create a
                  multi-scale feature hierarchy. The raw point cloud is fed
                  through a 3D backbone (yellow), and the unprojected image
                  features are added to the skip connection between the encoder
                  block $\mathcal{E}_l$ and the decoder block $\mathcal{D}_l$ at
                  each level. The model is then trained with the regular
                  segmentation loss.
                </figcaption>
              </div>
            </div>
          </div>
        </div>
      </div>
    </section>
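
    <section class="section">
      <div class="container is-max-desktop content">
        <h2 class="title is-4">DITR Injection: Code Sketch</h2>
        <p>
          The snippet below is a minimal PyTorch-style sketch of the injection
          step shown in the figure above: DINOv2 features are unprojected onto
          the point cloud via a pinhole camera model and then added to an
          encoder-decoder skip connection. The names
          <code>unproject_features</code> and <code>SkipInjection</code>, the
          tensor layouts, and the use of a linear projection are illustrative
          assumptions rather than the released implementation.
        </p>
        <pre><code>import torch
import torch.nn as nn


def unproject_features(points, feats_2d, K, world_to_cam):
    """Assign each 3D point the 2D feature of the pixel it projects to.

    points:       (N, 3) world-space coordinates
    feats_2d:     (C, H, W) DINOv2 feature map, upsampled to image resolution
    K:            (3, 3) camera intrinsics
    world_to_cam: (4, 4) camera extrinsics
    Points outside the image or behind the camera receive zero features.
    """
    n = points.shape[0]
    c, h, w = feats_2d.shape
    pts_h = torch.cat([points, torch.ones_like(points[:, :1])], dim=1)  # homogeneous coords
    cam = (world_to_cam @ pts_h.T).T[:, :3]                             # camera frame
    uvw = (K @ cam.T).T                                                 # pinhole projection
    z = uvw[:, 2]
    u = (uvw[:, 0] / z.clamp(min=1e-6)).round().long()
    v = (uvw[:, 1] / z.clamp(min=1e-6)).round().long()
    valid = z.gt(0).logical_and(u.ge(0)).logical_and(u.lt(w))
    valid = valid.logical_and(v.ge(0)).logical_and(v.lt(h))
    out = feats_2d.new_zeros(n, c)
    out[valid] = feats_2d[:, v[valid], u[valid]].T                      # per-point features
    return out


class SkipInjection(nn.Module):
    """Add unprojected, max-pooled image features to one skip connection."""

    def __init__(self, dino_dim, skip_dim):
        super().__init__()
        self.proj = nn.Linear(dino_dim, skip_dim)

    def forward(self, skip_feats, point_img_feats):
        # skip_feats:      (M, skip_dim) encoder features at this level
        # point_img_feats: (M, dino_dim) pooled, unprojected DINOv2 features
        return skip_feats + self.proj(point_img_feats)</code></pre>
      </div>
    </section>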

    <section class="section">
      <div class="container is-max-desktop">
        <div class="columns is-centered has-text-centered is-6">
          <div class="column">
            <h2 class="title is-3">Abstract</h2>
            <div class="content has-text-justified">
              <p>
                Vision foundation models (VFMs) trained on large-scale image
                datasets provide high-quality features that have significantly
                advanced 2D visual recognition. However, their potential in 3D
                vision remains largely untapped, despite the common availability
                of 2D images alongside 3D point cloud datasets. While
                significant research has been dedicated to 2D-3D fusion, recent
                state-of-the-art 3D methods predominantly focus on 3D data,
                leaving the integration of VFMs into 3D models underexplored. In
                this work, we challenge this trend by introducing DITR, a simple
                yet effective approach that extracts 2D foundation model
                features, projects them to 3D, and finally injects them into a
                3D point cloud segmentation model. DITR achieves
                state-of-the-art results on both indoor and outdoor 3D semantic
                segmentation benchmarks. To enable the use of VFMs even when
                images are unavailable during inference, we further propose to
                distill 2D foundation models into a 3D backbone as a pretraining
                task. By initializing the 3D backbone with knowledge distilled
                from 2D VFMs, we create a strong basis for downstream 3D
                segmentation tasks, ultimately boosting performance across
                various datasets.
              </p>
            </div>
          </div>
          <div class="column has-text-centered">
            <figure style="max-width: 100%; margin: 0 auto">
              <img
                src="static/img/teaser.webp"
                alt="Inference Pipeline"
                style="width: 100%; height: auto"
              />
              <figcaption
                style="font-size: 0.9em; margin-top: 10px; text-align: left"
              >
                <strong>DITR (a) and D-DITR (b).</strong> In addition to our
                DITR injection approach, we also present D-DITR, which distills
                DINOv2 features into 3D semantic segmentation models and yields
                state-of-the-art results across indoor and outdoor 3D
                benchmarks.
              </figcaption>
            </figure>
          </div>
        </div>
      </div>
    </section>
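
    <section class="section">
      <div class="container is-max-desktop content">
        <h2 class="title is-4">D-DITR Pretraining: Code Sketch</h2>
        <p>
          As a rough illustration of the D-DITR idea, the sketch below
          pretrains a 3D backbone by regressing the unprojected DINOv2
          features before the backbone is fine-tuned for segmentation. The
          function name <code>distillation_step</code>, the extra
          <code>head</code> projection, and the cosine-distance loss are
          assumptions made for the example, not the exact recipe from the
          paper.
        </p>
        <pre><code>import torch
import torch.nn as nn
import torch.nn.functional as F


def distillation_step(backbone, head, optimizer, points, target_feats, covered):
    """One pretraining step: match per-point 3D features to 2D VFM features.

    backbone:     3D network returning per-point features of shape (N, D)
    head:         small projection mapping D to the DINOv2 feature dimension C
    points:       (N, 3) input point cloud (plus whatever inputs the backbone uses)
    target_feats: (N, C) unprojected DINOv2 features, zeros where unseen
    covered:      (N,) bool mask of points visible in at least one image
    """
    optimizer.zero_grad()
    pred = head(backbone(points))
    # Regress the teacher features on covered points only; cosine distance
    # is used here as one reasonable choice of distillation loss.
    loss = 1.0 - F.cosine_similarity(pred[covered], target_feats[covered], dim=-1).mean()
    loss.backward()
    optimizer.step()
    return loss.item()


# After pretraining, the backbone weights initialize the 3D segmentation
# model, which is then fine-tuned with the regular segmentation loss.</code></pre>
      </div>
    </section>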

    <section class="section">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h2 class="title is-4">DITR Quantitative Results</h2>
          </div>
        </div>
        <div class="columns is-centered is-6">
          <div class="column has-text-centered">
            <figure style="max-width: 100%; margin: 0 auto">
              <img src="static/img/table1.webp" alt="DITR quantitative results" />
            </figure>
          </div>
          <div class="column has-text-centered">
            <figure style="max-width: 100%; margin: 0 auto">
              <img src="static/img/table2.webp" alt="DITR quantitative results" />
            </figure>
          </div>
        </div>
      </div>
    </section>

    <section class="section">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h2 class="title is-4">D-DITR Quantitative Results</h2>
          </div>
        </div>
        <div class="columns is-centered is-6">
          <div class="column has-text-centered">
            <figure style="max-width: 100%; margin: 0 auto">
              <img src="static/img/table3.webp" alt="D-DITR quantitative results" />
            </figure>
          </div>
          <div class="column has-text-centered">
            <figure style="max-width: 100%; margin: 0 auto">
              <img src="static/img/table4.webp" alt="D-DITR quantitative results" />
            </figure>
          </div>
        </div>
      </div>
    </section>

    <section class="section" id="BibTeX">
      <div class="container is-max-desktop content">
        <h2 class="title">BibTeX</h2>
        <pre><code>@article{abouzeid2025ditr,
  title   = {DINO in the Room: Leveraging 2D Foundation Models for 3D Segmentation},
  author  = {Abou Zeid, Karim and Yilmaz, Kadir and de Geus, Daan and Hermans, Alexander and Adrian, David and Linder, Timm and Leibe, Bastian},
  journal = {arXiv preprint arXiv:xxxx},
  year    = {2025}
}</code></pre>
      </div>
    </section>

    <footer class="footer">
      <div class="container">
        <div class="columns is-centered">
          <div class="column is-8">
            <div class="content has-text-centered">
              <p>
                This website is licensed under
                <a
                  rel="license"
                  href="http://creativecommons.org/licenses/by-sa/4.0/"
                  >CC BY-SA 4.0</a
                >. It is based on the
                <a href="https://nerfies.github.io/">Nerfies website</a>.
              </p>
            </div>
          </div>
        </div>
      </div>
    </footer>
  </body>
</html>