<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="DeBias-CLIP improves long-context understanding in CLIP-style models by training with long captions and simple text augmentations.">
<meta name="keywords" content="CLIP, Vision-Language Models, Long-Context Retrieval, Computer Vision">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>CLIP Is Shortsighted: Paying Attention Beyond the First Sentence [CVPR 2026]</title>
<!-- Google Tag Manager -->
<script>(function (w, d, s, l, i) {
w[l] = w[l] || []; w[l].push({
'gtm.start':
new Date().getTime(), event: 'gtm.js'
}); var f = d.getElementsByTagName(s)[0],
j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : ''; j.async = true; j.src =
'https://www.googletagmanager.com/gtm.js?id=' + i + dl; f.parentNode.insertBefore(j, f);
})(window, document, 'script', 'dataLayer', 'GTM-TSPQB2LZ');</script>
<!-- End Google Tag Manager -->
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<link rel="icon" href="./static/images/TRAIL_BLACK_ICON.png">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<!-- Google Tag Manager (noscript) -->
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-TSPQB2LZ" height="0" width="0"
style="display:none;visibility:hidden"></iframe></noscript>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">CLIP Is Shortsighted: Paying Attention Beyond the First Sentence [CVPR 2026]</h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<!-- <a href="https://www.trailab.utias.utoronto.ca/marc-antoine-lavoie">Marc-Antoine Lavoie</a> -->
Marc-Antoine Lavoie<sup>1</sup>,</span>
<span class="author-block">Anas Mahmoud<sup>2</sup>,</span>
<span class="author-block">Aldo Zaimi<sup>2</sup>,</span>
<span class="author-block">Arsene Fansi Tchango<sup>2</sup>,</span>
<span class="author-block">Steven L. Waslander<sup>1</sup>,</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>University of Toronto Robotics Institute,</span>
<span class="author-block"><sup>2</sup>Mila - Quebec AI Institute</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<!-- <span class="link-block">
<a href="ADD CVPR LINK"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span> -->
<span class="link-block">
<a href="https://arxiv.org/abs/2602.22419" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Video Link. -->
<!-- <span class="link-block">
<a href="ADD VIDEO LINK"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>
</span> -->
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/TRAILab/DeBias-CLIP" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero teaser">
<div class="container is-max-desktop">
<div class="hero-body">
<div class="has-text-centered">
<img src="./static/images/first_figure.svg" style="width: 90%;" alt="DeBias-CLIP schematic">
</div>
<h2 class="subtitle has-text-centered">
<span class="dnerf">DeBias-CLIP</span> improves long-context understanding in CLIP-style models by
training with long captions with simple text augmentations. Our model has a significantly
flatter text token attention distribution across the context window compared to the baseline.
</h2>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Summary</h2>
<div class="content has-text-justified">
<ul>
<li><strong>CLIP-style Models Are Biased:</strong> We empirically demonstrate that CLIP and its variants (e.g., SigLIP) inherently favor early text tokens, a bias that persists even when training on longer captions (e.g., Long-CLIP).</li>
<li><strong>Long-Caption Datasets Are Biased:</strong> Training and evaluation datasets used in long-context retrieval (e.g., ShareGPT4V) share a common structure: paragraph-length texts almost always start with a summary sentence, creating a shortcut that is sufficient for image-text matching.</li>
<li><strong>DeBias-CLIP - Sentence-Level Augmentations:</strong> To resolve this issue, we introduce simple sentence-level augmentations: (1) dropping the initial summary sentence, (2) randomly permuting and subsampling the rest, and (3) padding the token sequence.</li>
<li><strong>A Simple Drop-In Replacement:</strong> Our DeBias-CLIP method introduces zero new trainable parameters beyond extended positional embeddings, and our text augmentations add negligible computational overhead.</li>
<!-- <li><strong>Better Performance, Better Robustness:</strong> We significantly improve on existing methods on long-caption retrieval benchmarks.</li> -->
</ul>
</div>
</div>
</div>
<!--/ Abstract. -->
<!-- Paper video. -->
<!-- <div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Video</h2>
<div class="publication-video">
<iframe width="560" height="315" src="VIDEO LINK"
title="YouTube video player" frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
</div>
</div>
</div> -->
<!--/ Paper video. -->
</div>
</section>
<section class="" section>
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Uncovering the Biases of CLIP Text Encoders</h2>
<div class="content has-text-centered">
<img src="./static/images/padding_effects_clip_test.svg" style="width: 60%;" alt="Effect of the number of padding sentences on retrieval performance">
</div>
<div class="content has-text-justified">
<p>
<strong>Highlighting the Bias via Padding:</strong> First, we demonstrate that CLIP text encoders are heavily biased toward early tokens by evaluating retrieval on the first 2 sentences of the DOCCI dataset. When we prepend uninformative filler sentences ("This is a photo.") to the beginning of the caption, performance drops significantly. Simply pushing the informative text further back in the token sequence is enough to affect the model's retrieval accuracy.
</p>
</div>
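The padding probe above can be sketched in a few lines; the filler string matches the one in the text, but the `clip_score` call is a hypothetical stand-in for the actual retrieval evaluation, not the paper's released code:

```python
# Minimal sketch of the padding probe: prepend uninformative filler
# sentences so the informative text starts later in the token sequence.

FILLER = "This is a photo."

def pad_caption(caption: str, n_fillers: int) -> str:
    """Prepend n_fillers copies of an uninformative sentence."""
    return " ".join([FILLER] * n_fillers + [caption])

if __name__ == "__main__":
    caption = "A red bicycle leans against an ivy-covered brick wall."
    for n in range(4):
        padded = pad_caption(caption, n)
        # score = clip_score(image, padded)  # hypothetical; drops as n grows
        print(f"{n} fillers: {padded}")
```

The caption's content is unchanged; only its position in the token sequence shifts, which is what isolates the positional bias.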
<div class="content has-text-centered">
<img src="./static/images/docci_i2t_baseclip_all_test.svg" style="width: 60%;" alt="Effect of sentence removal and permutation on CLIP retrieval performance">
</div>
<div class="content has-text-justified">
<p>
<strong>The Impact of Sentence Order:</strong> Next, we show that even among early tokens, sentence order strictly dictates performance. Evaluating retrieval on the first 2 sentences of DOCCI, we compare the original text (First 2) against a permuted version (Swap 2) and a baseline using only the first sentence (First only). For all CLIP models, pushing the highly informative first sentence to the second position causes a measurable drop in accuracy. Notably, this structural reliance is so pronounced in SigLIP models that the performance drop from displacing the summary sentence entirely negates any informational gain provided by the second sentence.
</p>
</div>
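The three text variants compared above ("First 2", "Swap 2", "First only") can be constructed with a simple sentence split; this sketch uses a naive period-based splitter, which may differ from the actual evaluation pipeline:

```python
# Sketch of the three evaluation variants: original order ("First 2"),
# swapped order ("Swap 2"), and the summary alone ("First only").

def split_sentences(text: str) -> list[str]:
    # Naive period split; assumes no abbreviations or decimals.
    return [s.strip() + "." for s in text.split(".") if s.strip()]

def first_two(text: str) -> str:
    """Original order, first two sentences."""
    return " ".join(split_sentences(text)[:2])

def swap_two(text: str) -> str:
    """First two sentences with their order swapped."""
    s = split_sentences(text)
    return " ".join([s[1], s[0]]) if len(s) >= 2 else text

def first_only(text: str) -> str:
    """Only the (highly informative) first sentence."""
    return split_sentences(text)[0]
```

All three variants carry the same or strictly less information than "First 2", so any score gap between "First 2" and "Swap 2" is attributable to position rather than content.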
<div class="content has-text-centered">
<img src="./static/images/docci_t2i_lc_vs_ours_all_test.svg" style="width: 60%;" alt="Effect of sentence removal and permutation for retrieval with long-context CLIP models">
</div>
<div class="content has-text-justified">
<p>
<strong>Long-Context Models Fail Similarly:</strong> Finally, we extend this evaluation to models explicitly trained on long captions, using Long-CLIP as our baseline. Testing on full-length captions, we move the crucial first sentence to the fourth position. Just like standard CLIP, these long-context baselines suffer a sharp drop in performance when the text is reordered. In contrast, our proposed DeBias-CLIP not only achieves consistently higher overall performance but also proves significantly more robust to this sentence permutation.
</p>
</div>
<h2 class="title is-3">DeBias-CLIP: 3 Simple Text Augmentations</h2>
<div class="content has-text-justified">
<!-- <embed src="./static/images/TrackSampling_Vis_63_with_caption.pdf" style="width: 100%;" alt="Track_Sampling_Augmentation"> -->
<img src="./static/images/text_augmentations.svg" style="width: 100%;" alt="Track_Sampling_Augmentation">
<p>
DeBias-CLIP builds upon the Long-CLIP baseline by training on long-caption datasets using two different text inputs: the full original caption and an augmented caption. Instead of reinforcing the early-token bias by training on the first sentence alone, our method proposes <strong>three simple sentence-level augmentations</strong>.
</p>
<p>
First, we <strong>remove the opening summary sentence</strong>. Because this initial sentence is typically highly information-dense, it provides a training shortcut that easily satisfies the text-image contrastive loss. By removing it, we force the model to stop relying on summaries and actually learn from the fine-grained details in the rest of the caption.
</p>
<p>
While this simple trick greatly improves long-text retrieval, it can also negatively impact short-text performance. To counteract this and maintain versatility, we <strong>randomly subsample the remaining sentences</strong>. This dynamically generates shorter training captions, increasing the difficulty of the task by deliberately reducing the available information.
</p>
<p>
However, subsampling alone can cause the model to drift away from the original pretrained CLIP weights, as it forces less informative tokens into the early positions. To prevent this drift, and to properly train the later positional embeddings, we <strong>add padding tokens</strong> to the beginning of the text. This pushes the meaningful information deeper into the context window. Because we simultaneously train on the original full caption, the early positions still learn from the dense summary, while the later positions are finally forced to align with deep caption details.
</p>
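The three augmentations can be sketched as one function over a pre-split caption. The `"&lt;pad&gt;"` placeholder and `max_pad` name are illustrative only (real padding is applied at the token level), not the released implementation:

```python
import random

# Sketch of DeBias-CLIP's three sentence-level augmentations, assuming
# captions arrive pre-split into sentences.

def augment_caption(sentences: list[str], rng: random.Random,
                    max_pad: int = 20) -> str:
    # 1. Drop the opening summary sentence (the training shortcut).
    body = sentences[1:] if len(sentences) > 1 else list(sentences)
    # 2. Randomly permute and subsample the remaining sentences;
    #    random.sample returns k items in random order.
    k = rng.randint(1, len(body))
    body = rng.sample(body, k)
    # 3. Prepend padding to push content deeper into the context window
    #    (placeholder string here; real padding uses pad token ids).
    pad = ["<pad>"] * rng.randint(0, max_pad)
    return " ".join(pad + body)
```

During training, each image is paired with both its full original caption and an augmented caption, so early positions still learn from the dense summary while later positions are forced to align with fine-grained details.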
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Improved Long-Context Retrieval - Visualizations</h2>
<div class="content has-text-justified">
<img src="./static/images/text_matching.svg" style="width: 100%;" alt="Long-Context Retrieval Visualization">
<!-- <div class="publication-video">
<iframe width="560" height="315" src="VIDEO LINK"
title="YouTube video player" frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
</div> -->
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Improved Image Generation with Stable Diffusion XL</h2>
<div class="content has-text-justified">
<img src="./static/images/sdxl_viz.svg" style="width: 100%;" alt="SDXL Generation Comparison">
</div>
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@article{lavoie2026clip,
author = {Lavoie, Marc-Antoine and Mahmoud, Anas and Zaimi, Aldo and Tchango, Arsene Fansi and Waslander, Steven L.},
title = {CLIP Is Shortsighted: Paying Attention Beyond the First Sentence},
journal = {arXiv preprint arXiv:2602.22419},
year = {2026},
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content has-text-centered">
<a class="icon-link" href="https://github.com/TRAILab" class="external-link" disabled>
<i class="fab fa-github"></i>
</a>
</div>
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This website is licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
<p>
Thank you to the authors of <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a> for the
website template.
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>