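% The entries below abbreviate venue names with BibTeX @string macros (NIPS, ICLR,
% ICML, CVPR, ICCV, ECCV, JMLR, ACL21/ACL22/ACL23, EMNLP21, EMNLP22, ACMMM23).
% Their actual definitions are assumed to live elsewhere (e.g. in the LaTeX
% preamble or a companion .bib file); the sketch below supplies the standard venue
% names as plausible expansions so the entries compile on their own.
@string{NIPS = {Advances in Neural Information Processing Systems}}
@string{ICLR = {International Conference on Learning Representations}}
@string{ICML = {International Conference on Machine Learning}}
@string{CVPR = {IEEE/CVF Conference on Computer Vision and Pattern Recognition}}
@string{ICCV = {IEEE/CVF International Conference on Computer Vision}}
@string{ECCV = {European Conference on Computer Vision}}
@string{JMLR = {Journal of Machine Learning Research}}
@string{ACL21 = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics}}
@string{ACL22 = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics}}
@string{ACL23 = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics}}
@string{EMNLP21 = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}}
@string{EMNLP22 = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}}
@string{ACMMM23 = {Proceedings of the 31st ACM International Conference on Multimedia}}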
% transformer
@inproceedings{vaswani2017attention,
title={Attention Is All You Need},
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
booktitle=NIPS,
pages={6000--6010},
year={2017}
}
@inproceedings{carion2020end,
title={End-to-End Object Detection With Transformers},
author={Carion, Nicolas and Massa, Francisco and Synnaeve, Gabriel and Usunier, Nicolas and Kirillov, Alexander and Zagoruyko, Sergey},
booktitle=ECCV,
pages={213--229},
year={2020}
}
@inproceedings{dosovitskiy2021image,
title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
booktitle=ICLR,
year={2021}
}
@inproceedings{fan2021multiscale,
title={Multiscale Vision Transformers},
author={Fan, Haoqi and Xiong, Bo and Mangalam, Karttikeya and Li, Yanghao and Yan, Zhicheng and Malik, Jitendra and Feichtenhofer, Christoph},
booktitle=ICCV,
pages={6824--6835},
year={2021}
}
@inproceedings{liu2021swin,
title={Swin Transformer: Hierarchical Vision Transformer Using Shifted Windows},
author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
booktitle=ICCV,
pages={10012--10022},
year={2021}
}
@article{bommasani2021opportunities,
title={On the Opportunities and Risks of Foundation Models},
author={Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
journal={arXiv preprint arXiv:2108.07258},
year={2021}
}
@article{fedus2022switch,
title={Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
author={Fedus, William and Zoph, Barret and Shazeer, Noam},
journal=JMLR,
volume={23},
number={1},
pages={5232--5270},
year={2022}
}
% language model
%% GPT
@article{radford2018improving,
title={Improving Language Understanding by Generative Pre-Training},
author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
year={2018}
}
@article{devlin2018bert,
title={{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
journal={arXiv preprint arXiv:1810.04805},
year={2018}
}
%% GPT-2
@article{radford2019language,
title={Language Models are Unsupervised Multitask Learners},
author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
year={2019}
}
@inproceedings{yang2019xlnet,
title={{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
author={Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Russ R and Le, Quoc V},
booktitle=NIPS,
pages={5753--5763},
year={2019}
}
@inproceedings{lan2020albert,
title={{ALBERT}: A Lite {BERT} for Self-supervised Learning of Language Representations},
author={Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu},
booktitle=ICLR,
year={2020}
}
%% T5
@article{raffel2020exploring,
title={Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
journal=JMLR,
volume={21},
number={1},
pages={5485--5551},
year={2020}
}
%% GPT-3
@inproceedings{brown2020language,
title={Language Models are Few-Shot Learners},
author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
booktitle=NIPS,
pages={1877--1901},
year={2020}
}
%% FLAN
@inproceedings{wei2022finetuned,
title={Finetuned Language Models are Zero-Shot Learners},
author={Wei, Jason and Bosma, Maarten and Zhao, Vincent and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
booktitle=ICLR,
year={2022}
}
%% T0
@inproceedings{sanh2021multitask,
title={Multitask Prompted Training Enables Zero-Shot Task Generalization},
author={Sanh, Victor and Webson, Albert and Raffel, Colin and Bach, Stephen and Sutawika, Lintang and Alyafeai, Zaid and Chaffin, Antoine and Stiegler, Arnaud and Raja, Arun and Dey, Manan and others},
booktitle=ICLR,
year={2022}
}
@inproceedings{du2022glm,
title={{GLM}: General Language Model Pretraining with Autoregressive Blank Infilling},
author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
booktitle=ACL22,
pages={320--335},
year={2022}
}
%% InstructGPT
@inproceedings{ouyang2022training,
title={Training Language Models to Follow Instructions with Human Feedback},
author={Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
booktitle=NIPS,
pages={27730--27744},
year={2022}
}
%% Chinchilla
@inproceedings{hoffmann2022empirical,
title={An Empirical Analysis of Compute-Optimal Large Language Model Training},
author={Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and de Las Casas, Diego and Hendricks, Lisa Anne and Welbl, Johannes and Clark, Aidan and others},
booktitle=NIPS,
pages={30016--30030},
year={2022}
}
@article{chowdhery2023palm,
title={{PaLM}: Scaling Language Modeling with Pathways},
author={Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and others},
journal=JMLR,
volume={24},
number={240},
pages={1--113},
year={2023}
}
@article{zhang2022opt,
title={{OPT}: Open Pre-trained Transformer Language Models},
author={Zhang, Susan and Roller, Stephen and Goyal, Naman and Artetxe, Mikel and Chen, Moya and Chen, Shuohui and Dewan, Christopher and Diab, Mona and Li, Xian and Lin, Xi Victoria and others},
journal={arXiv preprint arXiv:2205.01068},
year={2022}
}
%% Flan-T5
@article{chung2022scaling,
title={Scaling Instruction-Finetuned Language Models},
author={Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and others},
journal={arXiv preprint arXiv:2210.11416},
year={2022}
}
@inproceedings{wang2023self,
title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},
author={Wang, Yizhong and Kordi, Yeganeh and Mishra, Swaroop and Liu, Alisa and Smith, Noah A and Khashabi, Daniel and Hajishirzi, Hannaneh},
booktitle=ACL23,
pages={13484--13508},
year={2023}
}
@article{touvron2023llama,
title={{LLaMA}: Open and Efficient Foundation Language Models},
author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
journal={arXiv preprint arXiv:2302.13971},
year={2023}
}
@article{openai2023gpt,
title={{GPT-4} Technical Report},
author={OpenAI},
journal={arXiv preprint arXiv:2303.08774},
year={2023}
}
@misc{taori2023stanford,
title={Stanford Alpaca: An Instruction-following {LLaMA} model},
author={Taori, Rohan and Gulrajani, Ishaan and Zhang, Tianyi and Dubois, Yann and Li, Xuechen and Guestrin, Carlos and Liang, Percy and Hashimoto, Tatsunori B.},
howpublished={\url{https://github.com/tatsu-lab/stanford_alpaca}},
year={2023}
}
@misc{chiang2023vicuna,
title={Vicuna: An Open-Source Chatbot Impressing {GPT}-4 with 90\%* {ChatGPT} Quality},
author={Chiang, Wei-Lin and Li, Zhuohan and Lin, Zi and Sheng, Ying and Wu, Zhanghao and Zhang, Hao and Zheng, Lianmin and Zhuang, Siyuan and Zhuang, Yonghao and Gonzalez, Joseph E. and Stoica, Ion and Xing, Eric P.},
howpublished={\url{https://lmsys.org/blog/2023-03-30-vicuna}},
year={2023}
}
@article{zhang2023llama,
title={{LLaMA-Adapter}: Efficient Fine-tuning of Language Models with Zero-init Attention},
author={Zhang, Renrui and Han, Jiaming and Zhou, Aojun and Hu, Xiangfei and Yan, Shilin and Lu, Pan and Li, Hongsheng and Gao, Peng and Qiao, Yu},
journal={arXiv preprint arXiv:2303.16199},
year={2023}
}
@article{touvron2023llama2,
title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
author={Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and others},
journal={arXiv preprint arXiv:2307.09288},
year={2023}
}
% parameter-efficient finetuning
@inproceedings{houlsby2019parameter,
title={Parameter-Efficient Transfer Learning for {NLP}},
author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
booktitle=ICML,
pages={2790--2799},
year={2019}
}
@inproceedings{zhang2020side,
title={Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks},
author={Zhang, Jeffrey O and Sax, Alexander and Zamir, Amir and Guibas, Leonidas and Malik, Jitendra},
booktitle=ECCV,
pages={698--714},
year={2020}
}
@inproceedings{li2021prefix,
title={Prefix-Tuning: Optimizing Continuous Prompts for Generation},
author={Li, Xiang Lisa and Liang, Percy},
booktitle=ACL21,
pages={4582--4597},
year={2021}
}
@inproceedings{lester2021power,
title={The Power of Scale for Parameter-Efficient Prompt Tuning},
author={Lester, Brian and Al-Rfou, Rami and Constant, Noah},
booktitle=EMNLP21,
pages={3045--3059},
year={2021}
}
@inproceedings{hu2022lora,
title={{LoRA}: Low-Rank Adaptation of Large Language Models},
author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
booktitle=ICLR,
year={2022}
}
@article{liu2021pre,
title={Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing},
author={Liu, Pengfei and Yuan, Weizhe and Fu, Jinlan and Jiang, Zhengbao and Hayashi, Hiroaki and Neubig, Graham},
journal={ACM Computing Surveys},
volume={55},
number={9},
pages={1--35},
year={2023}
}
@article{ding2023parameter,
title={Parameter-Efficient Fine-Tuning of Large-Scale Pre-trained Language Models},
author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others},
journal={Nature Machine Intelligence},
volume={5},
number={3},
pages={220--235},
year={2023}
}
@inproceedings{dettmers2023qlora,
title={{QLoRA}: Efficient Finetuning of Quantized {LLM}s},
author={Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke},
booktitle=NIPS,
year={2023}
}
% multimodal learning
@inproceedings{antol2015vqa,
title={{VQA}: Visual Question Answering},
author={Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C Lawrence and Parikh, Devi},
booktitle=ICCV,
pages={2425--2433},
year={2015}
}
@inproceedings{lu2019vilbert,
title={{ViLBERT}: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},
author={Lu, Jiasen and Batra, Dhruv and Parikh, Devi and Lee, Stefan},
booktitle=NIPS,
pages={13--23},
year={2019}
}
@inproceedings{su2020vl,
title={{VL-BERT}: Pre-training of Generic Visual-Linguistic Representations},
author={Su, Weijie and Zhu, Xizhou and Cao, Yue and Li, Bin and Lu, Lewei and Wei, Furu and Dai, Jifeng},
booktitle=ICLR,
year={2020}
}
@inproceedings{lee2021parameter,
title={Parameter Efficient Multimodal Transformers for Video Representation Learning},
author={Lee, Sangho and Yu, Youngjae and Kim, Gunhee and Breuel, Thomas and Kautz, Jan and Song, Yale},
booktitle=ICLR,
year={2021}
}
%% VL-T5
@inproceedings{cho2021unifying,
title={Unifying Vision-and-Language Tasks via Text Generation},
author={Cho, Jaemin and Lei, Jie and Tan, Hao and Bansal, Mohit},
booktitle=ICML,
pages={1931--1942},
year={2021}
}
%% ALIGN
@inproceedings{jia2021scaling,
title={Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision},
author={Jia, Chao and Yang, Yinfei and Xia, Ye and Chen, Yi-Ting and Parekh, Zarana and Pham, Hieu and Le, Quoc and Sung, Yun-Hsuan and Li, Zhen and Duerig, Tom},
booktitle=ICML,
pages={4904--4916},
year={2021}
}
@inproceedings{kim2021vilt,
title={{ViLT}: Vision-and-Language Transformer Without Convolution or Region Supervision},
author={Kim, Wonjae and Son, Bokyung and Kim, Ildoo},
booktitle=ICML,
pages={5583--5594},
year={2021}
}
%% CLIP
@inproceedings{radford2021learning,
title={Learning Transferable Visual Models From Natural Language Supervision},
author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others},
booktitle=ICML,
pages={8748--8763},
year={2021}
}
@inproceedings{li2021align,
title={Align before Fuse: Vision and Language Representation Learning with Momentum Distillation},
author={Li, Junnan and Selvaraju, Ramprasaath and Gotmare, Akhilesh and Joty, Shafiq and Xiong, Caiming and Hoi, Steven Chu Hong},
booktitle=NIPS,
pages={9694--9705},
year={2021}
}
@inproceedings{bao2022beit,
title={{BEiT}: {BERT} Pre-Training of Image Transformers},
author={Bao, Hangbo and Dong, Li and Piao, Songhao and Wei, Furu},
booktitle=ICLR,
year={2022}
}
@inproceedings{chang2022maskgit,
title={{MaskGIT}: Masked Generative Image Transformer},
author={Chang, Huiwen and Zhang, Han and Jiang, Lu and Liu, Ce and Freeman, William T},
booktitle=CVPR,
pages={11315--11325},
year={2022}
}
%% MAE
@inproceedings{he2022masked,
title={Masked Autoencoders Are Scalable Vision Learners},
author={He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross},
booktitle=CVPR,
pages={16000--16009},
year={2022}
}
@inproceedings{li2022blip,
title={{BLIP}: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
author={Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
booktitle=ICML,
pages={12888--12900},
year={2022}
}
@inproceedings{wang2022ofa,
title={{OFA}: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework},
author={Wang, Peng and Yang, An and Men, Rui and Lin, Junyang and Bai, Shuai and Li, Zhikang and Ma, Jianxin and Zhou, Chang and Zhou, Jingren and Yang, Hongxia},
booktitle=ICML,
pages={23318--23340},
year={2022}
}
%% X-VLM
@inproceedings{zeng2022multi,
title={Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts},
author={Zeng, Yan and Zhang, Xinsong and Li, Hang},
booktitle=ICML,
pages={25994--26009},
year={2022}
}
@inproceedings{li2022mplug,
title={{mPLUG}: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections},
author={Li, Chenliang and Xu, Haiyang and Tian, Junfeng and Wang, Wei and Yan, Ming and Bi, Bin and Ye, Jiabo and Chen, He and Xu, Guohai and Cao, Zheng and others},
booktitle=EMNLP22,
pages={7241--7259},
year={2022}
}
%% LiMoE
@inproceedings{mustafa2022multimodal,
title={Multimodal Contrastive Learning with {LIMoE}: the Language-Image Mixture of Experts},
author={Mustafa, Basil and Riquelme, Carlos and Puigcerver, Joan and Jenatton, Rodolphe and Houlsby, Neil},
booktitle=NIPS,
pages={9564--9576},
year={2022}
}
@inproceedings{alayrac2022flamingo,
title={Flamingo: A Visual Language Model for Few-Shot Learning},
author={Alayrac, Jean-Baptiste and Donahue, Jeff and Luc, Pauline and Miech, Antoine and Barr, Iain and Hasson, Yana and Lenc, Karel and Mensch, Arthur and Millican, Katherine and Reynolds, Malcolm and others},
booktitle=NIPS,
pages={23716--23736},
year={2022}
}
@inproceedings{chen2022unified,
title={A Unified Sequence Interface for Vision Tasks},
author={Chen, Ting and Saxena, Saurabh and Li, Lala and Lin, Tsung-Yi and Fleet, David J and Hinton, Geoffrey E},
booktitle=NIPS,
pages={31333--31346},
year={2022}
}
@inproceedings{bao2022vlmo,
title={{VLMo}: Unified Vision-Language Pre-Training with Mixture-of-Modality-Experts},
author={Bao, Hangbo and Wang, Wenhui and Dong, Li and Liu, Qiang and Mohammed, Owais Khan and Aggarwal, Kriti and Som, Subhojit and Piao, Songhao and Wei, Furu},
booktitle=NIPS,
pages={32897--32912},
year={2022}
}
%% X2-VLM
@article{zeng2022x,
title={{X$^2$-VLM}: All-In-One Pre-trained Model For Vision-Language Tasks},
author={Zeng, Yan and Zhang, Xinsong and Li, Hang and Wang, Jiawei and Zhang, Jipeng and Zhou, Wangchunshu},
journal={arXiv preprint arXiv:2211.12402},
year={2022}
}
@article{xu2022multiinstruct,
title={{MultiInstruct}: Improving Multi-Modal Zero-Shot Learning via Instruction Tuning},
author={Xu, Zhiyang and Shen, Ying and Huang, Lifu},
journal={arXiv preprint arXiv:2212.10773},
year={2022}
}
@inproceedings{chen2023pali,
title={{PaLI}: A Jointly-Scaled Multilingual Language-Image Model},
author={Chen, Xi and Wang, Xiao and Changpinyo, Soravit and Piergiovanni, AJ and Padlewski, Piotr and Salz, Daniel and Goodman, Sebastian and Grycner, Adam and Mustafa, Basil and Beyer, Lucas and others},
booktitle=ICLR,
year={2023}
}
@inproceedings{lu2023unified,
title={{Unified-IO}: A Unified Model for Vision, Language, and Multi-modal Tasks},
author={Lu, Jiasen and Clark, Christopher and Zellers, Rowan and Mottaghi, Roozbeh and Kembhavi, Aniruddha},
booktitle=ICLR,
year={2023}
}
@inproceedings{li2023mage,
title={{MAGE}: MAsked Generative Encoder to Unify Representation Learning and Image Synthesis},
author={Li, Tianhong and Chang, Huiwen and Mishra, Shlok and Zhang, Han and Katabi, Dina and Krishnan, Dilip},
booktitle=CVPR,
pages={2142--2152},
year={2023}
}
@inproceedings{yu2023magvit,
title={{MAGVIT}: Masked Generative Video Transformer},
author={Yu, Lijun and Cheng, Yong and Sohn, Kihyuk and Lezama, Jos{\'e} and Zhang, Han and Chang, Huiwen and Hauptmann, Alexander G and Yang, Ming-Hsuan and Hao, Yuan and Essa, Irfan and others},
booktitle=CVPR,
pages={10459--10469},
year={2023}
}
%% X-Decoder
@inproceedings{zou2023generalized,
title={Generalized Decoding for Pixel, Image, and Language},
author={Zou, Xueyan and Dou, Zi-Yi and Yang, Jianwei and Gan, Zhe and Li, Linjie and Li, Chunyuan and Dai, Xiyang and Behl, Harkirat and Wang, Jianfeng and Yuan, Lu and others},
booktitle=CVPR,
pages={15116--15127},
year={2023}
}
%% BEiT-3
@inproceedings{wang2023image,
title={Image as a Foreign Language: {BEiT} Pretraining for Vision and Vision-Language Tasks},
author={Wang, Wenhui and Bao, Hangbo and Dong, Li and Bjorck, Johan and Peng, Zhiliang and Liu, Qiang and Aggarwal, Kriti and Mohammed, Owais Khan and Singhal, Saksham and Som, Subhojit and others},
booktitle=CVPR,
pages={19175--19186},
year={2023}
}
@inproceedings{fang2023eva,
title={{EVA}: Exploring the Limits of Masked Visual Representation Learning at Scale},
author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},
booktitle=CVPR,
pages={19358--19369},
year={2023}
}
@inproceedings{chang2023muse,
title={Muse: Text-To-Image Generation via Masked Generative Transformers},
author={Chang, Huiwen and Zhang, Han and Barber, Jarred and Maschinot, AJ and Lezama, Jose and Jiang, Lu and Yang, Ming-Hsuan and Murphy, Kevin and Freeman, William T and Rubinstein, Michael and others},
booktitle=ICML,
pages={4055--4075},
year={2023}
}
@inproceedings{driess2023palm,
title={{PaLM-E}: An Embodied Multimodal Language Model},
author={Driess, Danny and Xia, Fei and Sajjadi, Mehdi SM and Lynch, Corey and Chowdhery, Aakanksha and Ichter, Brian and Wahid, Ayzaan and Tompson, Jonathan and Vuong, Quan and Yu, Tianhe and others},
booktitle=ICML,
pages={8469--8488},
year={2023}
}
@inproceedings{li2023blip2,
title={{BLIP-2}: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
author={Li, Junnan and Li, Dongxu and Savarese, Silvio and Hoi, Steven},
booktitle=ICML,
pages={19730--19742},
year={2023}
}
%% SigLIP
@inproceedings{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
booktitle=ICCV,
pages={11975--11986},
year={2023}
}
@inproceedings{shukor2023ep,
title={{eP-ALM}: Efficient Perceptual Augmentation of Language Models},
author={Shukor, Mustafa and Dancette, Corentin and Cord, Matthieu},
booktitle=ICCV,
pages={22056--22069},
year={2023}
}
%% Kosmos-1
@inproceedings{huang2023language,
title={Language Is Not All You Need: Aligning Perception with Language Models},
author={Huang, Shaohan and Dong, Li and Wang, Wenhui and Hao, Yaru and Singhal, Saksham and Ma, Shuming and Lv, Tengchao and Cui, Lei and Mohammed, Owais Khan and Liu, Qiang and others},
booktitle=NIPS,
year={2023}
}
%% LLaVA
@inproceedings{liu2023visual,
title={Visual Instruction Tuning},
author={Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
booktitle=NIPS,
year={2023}
}
@inproceedings{zhang2023transfer,
title={{VPGTrans}: Transfer Visual Prompt Generator across {LLM}s},
author={Zhang, Ao and Fei, Hao and Yao, Yuan and Ji, Wei and Li, Li and Liu, Zhiyuan and Chua, Tat-Seng},
booktitle=NIPS,
year={2023}
}
@inproceedings{dai2023instructblip,
title={{InstructBLIP}: Towards General-purpose Vision-Language Models with Instruction Tuning},
author={Dai, Wenliang and Li, Junnan and Li, Dongxu and Tiong, Anthony Meng Huat and Zhao, Junqi and Wang, Weisheng and Li, Boyang and Fung, Pascale and Hoi, Steven},
booktitle=NIPS,
year={2023}
}
@inproceedings{li2023blip,
title={{BLIP-Diffusion}: Pre-trained Subject Representation for Controllable Text-to-Image Generation and Editing},
author={Li, Dongxu and Li, Junnan and Hoi, Steven},
booktitle=NIPS,
year={2023}
}
@inproceedings{wang2023visionllm,
title={{VisionLLM}: Large Language Model is also an Open-Ended Decoder for Vision-Centric Tasks},
author={Wang, Wenhai and Chen, Zhe and Chen, Xiaokang and Wu, Jiannan and Zhu, Xizhou and Zeng, Gang and Luo, Ping and Lu, Tong and Zhou, Jie and Qiao, Yu and others},
booktitle=NIPS,
year={2023}
}
%% LaVIN
@inproceedings{luo2023cheap,
title={Cheap and Quick: Efficient Vision-Language Instruction Tuning for Large Language Models},
author={Luo, Gen and Zhou, Yiyi and Ren, Tianhe and Chen, Shengxin and Sun, Xiaoshuai and Ji, Rongrong},
booktitle=NIPS,
year={2023}
}
@inproceedings{yu2023spae,
title={{SPAE}: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen {LLM}s},
author={Yu, Lijun and Cheng, Yong and Wang, Zhiruo and Kumar, Vivek and Macherey, Wolfgang and Huang, Yanping and Ross, David A and Essa, Irfan and Bisk, Yonatan and Yang, Ming-Hsuan and others},
booktitle=NIPS,
year={2023}
}
@inproceedings{mizrahi20234m,
title={{4M}: Massively Multimodal Masked Modeling},
author={Mizrahi, David and Bachmann, Roman and Kar, Oguzhan Fatih and Yeo, Teresa and Gao, Mingfei and Dehghan, Afshin and Zamir, Amir},
booktitle=NIPS,
year={2023}
}
@inproceedings{hu2023avis,
title={{AVIS}: Autonomous Visual Information Seeking with Large Language Models},
author={Hu, Ziniu and Iscen, Ahmet and Sun, Chen and Chang, Kai-Wei and Sun, Yizhou and Ross, David A and Schmid, Cordelia and Fathi, Alireza},
booktitle=NIPS,
year={2023}
}
@article{zhu2023minigpt,
title={{MiniGPT-4}: Enhancing Vision-Language Understanding with Advanced Large Language Models},
author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
journal={arXiv preprint arXiv:2304.10592},
year={2023}
}
@article{ye2023mplug,
title={{mPLUG-Owl}: Modularization Empowers Large Language Models with Multimodality},
author={Ye, Qinghao and Xu, Haiyang and Xu, Guohai and Ye, Jiabo and Yan, Ming and Zhou, Yiyang and Wang, Junyang and Hu, Anwen and Shi, Pengcheng and Shi, Yaya and others},
journal={arXiv preprint arXiv:2304.14178},
year={2023}
}
@article{gao2023llama,
title={{LLaMA-Adapter} V2: Parameter-Efficient Visual Instruction Model},
author={Gao, Peng and Han, Jiaming and Zhang, Renrui and Lin, Ziyi and Geng, Shijie and Zhou, Aojun and Zhang, Wei and Lu, Pan and He, Conghui and Yue, Xiangyu and others},
journal={arXiv preprint arXiv:2304.15010},
year={2023}
}
@article{li2023otter,
title={Otter: A Multi-Modal Model with In-Context Instruction Tuning},
author={Li, Bo and Zhang, Yuanhan and Chen, Liangyu and Wang, Jinghao and Yang, Jingkang and Liu, Ziwei},
journal={arXiv preprint arXiv:2305.03726},
year={2023}
}
@article{chen2023x,
title={{X-LLM}: Bootstrapping Advanced Large Language Models by Treating Multi-Modalities as Foreign Languages},
author={Chen, Feilong and Han, Minglun and Zhao, Haozhi and Zhang, Qingyang and Shi, Jing and Xu, Shuang and Xu, Bo},
journal={arXiv preprint arXiv:2305.04160},
year={2023}
}
@article{gong2023multimodal,
title={{MultiModal-GPT}: A Vision and Language Model for Dialogue with Humans},
author={Gong, Tao and Lyu, Chengqi and Zhang, Shilong and Wang, Yudong and Zheng, Miao and Zhao, Qian and Liu, Kuikun and Zhang, Wenwei and Luo, Ping and Chen, Kai},
journal={arXiv preprint arXiv:2305.04790},
year={2023}
}
@article{su2023pandagpt,
title={{PandaGPT}: One Model To Instruction-Follow Them All},
author={Su, Yixuan and Lan, Tian and Li, Huayang and Xu, Jialu and Wang, Yan and Cai, Deng},
journal={arXiv preprint arXiv:2305.16355},
year={2023}
}
@article{chen2023palix,
title={{PaLI-X}: On Scaling up a Multilingual Vision and Language Model},
author={Chen, Xi and Djolonga, Josip and Padlewski, Piotr and Mustafa, Basil and Changpinyo, Soravit and Wu, Jialin and Ruiz, Carlos Riquelme and Goodman, Sebastian and Wang, Xiao and Tay, Yi and others},
journal={arXiv preprint arXiv:2305.18565},
year={2023}
}
@article{peng2023kosmos,
title={Kosmos-2: Grounding Multimodal Large Language Models to the World},
author={Peng, Zhiliang and Wang, Wenhui and Dong, Li and Hao, Yaru and Huang, Shaohan and Ma, Shuming and Wei, Furu},
journal={arXiv preprint arXiv:2306.14824},
year={2023}
}
@article{chen2023shikra,
title={Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic},
author={Chen, Keqin and Zhang, Zhao and Zeng, Weili and Zhang, Richong and Zhu, Feng and Zhao, Rui},
journal={arXiv preprint arXiv:2306.15195},
year={2023}
}
%% Lynx
@article{zeng2023matters,
title={What Matters in Training a GPT4-Style Language Model with Multimodal Inputs?},
author={Zeng, Yan and Zhang, Hanbo and Zheng, Jiani and Xia, Jiangnan and Wei, Guoqiang and Wei, Yang and Zhang, Yuchen and Kong, Tao},
journal={arXiv preprint arXiv:2307.02469},
year={2023}
}
%% Emu
@article{sun2023generative,
title={Generative Pretraining in Multimodality},
author={Sun, Quan and Yu, Qiying and Cui, Yufeng and Zhang, Fan and Zhang, Xiaosong and Wang, Yueze and Gao, Hongcheng and Liu, Jingjing and Huang, Tiejun and Wang, Xinlong},
journal={arXiv preprint arXiv:2307.05222},
year={2023}
}
@article{zhao2023bubogpt,
title={{BuboGPT}: Enabling Visual Grounding in Multi-Modal {LLM}s},
author={Zhao, Yang and Lin, Zhijie and Zhou, Daquan and Huang, Zilong and Feng, Jiashi and Kang, Bingyi},
journal={arXiv preprint arXiv:2307.08581},
year={2023}
}
@article{awadalla2023openflamingo,
title={{OpenFlamingo}: An Open-Source Framework for Training Large Autoregressive Vision-Language Models},
author={Awadalla, Anas and Gao, Irena and Gardner, Josh and Hessel, Jack and Hanafy, Yusuf and Zhu, Wanrong and Marathe, Kalyani and Bitton, Yonatan and Gadre, Samir and Sagawa, Shiori and others},
journal={arXiv preprint arXiv:2308.01390},
year={2023}
}
@article{bai2023qwen,
title={{Qwen-VL}: A Frontier Large Vision-Language Model with Versatile Abilities},
author={Bai, Jinze and Bai, Shuai and Yang, Shusheng and Wang, Shijie and Tan, Sinan and Wang, Peng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
journal={arXiv preprint arXiv:2308.12966},
year={2023}
}
%% CM3leon
@article{yu2023scaling,
title={Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction Tuning},
author={Yu, Lili and Shi, Bowen and Pasunuru, Ramakanth and Muller, Benjamin and Golovneva, Olga and Wang, Tianlu and Babu, Arun and Tang, Binh and Karrer, Brian and Sheynin, Shelly and others},
journal={arXiv preprint arXiv:2309.02591},
year={2023}
}
@article{jin2023unified,
title={Unified Language-Vision Pretraining in {LLM} with Dynamic Discrete Visual Tokenization},
author={Jin, Yang and Xu, Kun and Chen, Liwei and Liao, Chao and Tan, Jianchao and Chen, Bin and Lei, Chenyi and Liu, An and Song, Chengru and Lei, Xiaoqiang and others},
journal={arXiv preprint arXiv:2309.04669},
year={2023}
}
%% September 25
@misc{openai2023gptv,
title={{GPT-4V}(ision) System Card},
author={OpenAI},
howpublished={\url{https://openai.com/research/gpt-4v-system-card}},
year={2023}
}
@article{pan2023kosmosg,
title={{Kosmos-G}: Generating Images in Context with Multimodal Large Language Models},
author={Pan, Xichen and Dong, Li and Huang, Shaohan and Peng, Zhiliang and Chen, Wenhu and Wei, Furu},
journal={arXiv preprint arXiv:2310.02992},
year={2023}
}
@article{li2023leveraging,
title={Leveraging Unpaired Data for Vision-Language Generative Models via Cycle Consistency},
author={Li, Tianhong and Bhardwaj, Sangnie and Tian, Yonglong and Zhang, Han and Barber, Jarred and Katabi, Dina and Lajoie, Guillaume and Chang, Huiwen and Krishnan, Dilip},
journal={arXiv preprint arXiv:2310.03734},
year={2023}
}
%% MAGVIT-v2
@article{yu2023language,
title={Language Model Beats Diffusion--Tokenizer is Key to Visual Generation},
author={Yu, Lijun and Lezama, Jos{\'e} and Gundavarapu, Nitesh B and Versari, Luca and Sohn, Kihyuk and Minnen, David and Cheng, Yong and Gupta, Agrim and Gu, Xiuye and Hauptmann, Alexander G and others},
journal={arXiv preprint arXiv:2310.05737},
year={2023}
}
@article{chen2023pali3,
title={{PaLI-3} Vision Language Models: Smaller, Faster, Stronger},
author={Chen, Xi and Wang, Xiao and Beyer, Lucas and Kolesnikov, Alexander and Wu, Jialin and Voigtlaender, Paul and Mustafa, Basil and Goodman, Sebastian and Alabdulmohsin, Ibrahim and Padlewski, Piotr and others},
journal={arXiv preprint arXiv:2310.09199},
year={2023}
}
%% LLaVA-1.5
@article{liu2023improved,
title={Improved Baselines with Visual Instruction Tuning},
author={Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae},
journal={arXiv preprint arXiv:2310.03744},
year={2023}
}
@article{ye2023mplug2,
title={{mPLUG-Owl2}: Revolutionizing Multi-modal Large Language Model with Modality Collaboration},
author={Ye, Qinghao and Xu, Haiyang and Ye, Jiabo and Yan, Ming and Liu, Haowei and Qian, Qi and Zhang, Ji and Huang, Fei and Zhou, Jingren},
journal={arXiv preprint arXiv:2311.04257},
year={2023}
}
@article{bai2023sequential,
title={Sequential Modeling Enables Scalable Learning for Large Vision Models},
author={Bai, Yutong and Geng, Xinyang and Mangalam, Karttikeya and Bar, Amir and Yuille, Alan and Darrell, Trevor and Malik, Jitendra and Efros, Alexei A},
journal={arXiv preprint arXiv:2312.00785},
year={2023}
}
@article{gemini2023gemini,
title={Gemini: A Family of Highly Capable Multimodal Models},
author={{Gemini Team, Google}},
journal={arXiv preprint arXiv:2312.11805},
year={2023}
}
% retrieval
@inproceedings{lewis2020retrieval,
title={Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks},
author={Lewis, Patrick and Perez, Ethan and Piktus, Aleksandra and Petroni, Fabio and Karpukhin, Vladimir and Goyal, Naman and K{\"u}ttler, Heinrich and Lewis, Mike and Yih, Wen-tau and Rockt{\"a}schel, Tim and others},
booktitle=NIPS,
pages={9459--9474},
year={2020}
}
@inproceedings{long2022retrieval,
title={Retrieval Augmented Classification for Long-Tail Visual Recognition},
author={Long, Alexander and Yin, Wei and Ajanthan, Thalaiyasingam and Nguyen, Vu and Purkait, Pulak and Garg, Ravi and Blair, Alan and Shen, Chunhua and van den Hengel, Anton},
booktitle=CVPR,
pages={6959--6969},
year={2022}
}
@inproceedings{borgeaud2022improving,
title={Improving Language Models by Retrieving from Trillions of Tokens},
author={Borgeaud, Sebastian and Mensch, Arthur and Hoffmann, Jordan and Cai, Trevor and Rutherford, Eliza and Millican, Katie and Van Den Driessche, George Bm and Lespiau, Jean-Baptiste and Damoc, Bogdan and Clark, Aidan and others},
booktitle=ICML,
pages={2206--2240},
year={2022}
}
@inproceedings{blattmann2022retrieval,
title={Retrieval-Augmented Diffusion Models},
author={Blattmann, Andreas and Rombach, Robin and Oktay, Kaan and M{\"u}ller, Jonas and Ommer, Bj{\"o}rn},
booktitle=NIPS,
pages={15309--15324},
year={2022}
}
@inproceedings{chen2022decoupling,
title={Decoupling Knowledge from Memorization: Retrieval-augmented Prompt Learning},
author={Chen, Xiang and Li, Lei and Zhang, Ningyu and Liang, Xiaozhuan and Deng, Shumin and Tan, Chuanqi and Huang, Fei and Si, Luo and Chen, Huajun},
booktitle=NIPS,
pages={23908--23922},
year={2022}
}
@inproceedings{wang2023visually,
title={Visually-Augmented Language Modeling},
author={Wang, Weizhi and Dong, Li and Cheng, Hao and Song, Haoyu and Liu, Xiaodong and Yan, Xifeng and Gao, Jianfeng and Wei, Furu},
booktitle=ICLR,
year={2023}
}
@inproceedings{chen2023re,
title={{Re-Imagen}: Retrieval-Augmented Text-to-Image Generator},
author={Chen, Wenhu and Hu, Hexiang and Saharia, Chitwan and Cohen, William W},
booktitle=ICLR,
year={2023}
}
@inproceedings{liu2023learning,
title={Learning Customized Visual Models with Retrieval-Augmented Knowledge},
author={Liu, Haotian and Son, Kilho and Yang, Jianwei and Liu, Ce and Gao, Jianfeng and Lee, Yong Jae and Li, Chunyuan},
booktitle=CVPR,
pages={15148--15158},
year={2023}
}
@inproceedings{iscen2023improving,
title={Improving Image Recognition by Retrieving from Web-Scale Image-Text Data},
author={Iscen, Ahmet and Fathi, Alireza and Schmid, Cordelia},
booktitle=CVPR,
pages={19295--19304},
year={2023}
}
@inproceedings{hu2023reveal,
title={{REVEAL}: Retrieval-Augmented Visual-Language Pre-Training with Multi-Source Multimodal Knowledge Memory},
author={Hu, Ziniu and Iscen, Ahmet and Sun, Chen and Wang, Zirui and Chang, Kai-Wei and Sun, Yizhou and Schmid, Cordelia and Ross, David A and Fathi, Alireza},
booktitle=CVPR,
pages={23369--23379},
year={2023}
}
@inproceedings{yasunaga2023retrieval,
title={Retrieval-Augmented Multimodal Language Modeling},
author={Yasunaga, Michihiro and Aghajanyan, Armen and Shi, Weijia and James, Rich and Leskovec, Jure and Liang, Percy and Lewis, Mike and Zettlemoyer, Luke and Yih, Wen-tau},
booktitle=ICML,
pages={39755--39769},
year={2023}
}
@inproceedings{udandarao2023sus,
title={{SuS-X}: Training-Free Name-Only Transfer of Vision-Language Models},
author={Udandarao, Vishaal and Gupta, Ankush and Albanie, Samuel},
booktitle=ICCV,
pages={2725--2736},
year={2023}
}
@inproceedings{rao2023retrieval,
title={Retrieval-based Knowledge Augmented Vision Language Pre-training},
author={Rao, Jiahua and Shan, Zifei and Liu, Longpo and Zhou, Yao and Yang, Yuedong},
booktitle=ACMMM23,
year={2023}
}
@inproceedings{lin2023fine,
title={Fine-grained Late-interaction Multi-modal Retrieval for Retrieval Augmented Visual Question Answering},
author={Lin, Weizhe and Chen, Jinghong and Mei, Jingbiao and Coca, Alexandru and Byrne, Bill},
booktitle=NIPS,
year={2023}
}
% reasoning
@inproceedings{wei2022chain,
title={Chain-of-Thought Prompting Elicits Reasoning in Large Language Models},
author={Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Xia, Fei and Chi, Ed and Le, Quoc V and Zhou, Denny and others},
booktitle=NIPS,
pages={24824--24837},
year={2022}
}
@inproceedings{yao2023react,
title={{ReAct}: Synergizing Reasoning and Acting in Language Models},
author={Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik R and Cao, Yuan},
booktitle=ICLR,
year={2023}
}
@inproceedings{yao2023tree,
title={Tree of Thoughts: Deliberate Problem Solving with Large Language Models},
author={Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Thomas L and Cao, Yuan and Narasimhan, Karthik},
booktitle=NIPS,
year={2023}
}
@inproceedings{dziri2023faith,
title={Faith and Fate: Limits of Transformers on Compositionality},
author={Dziri, Nouha and Lu, Ximing and Sclar, Melanie and Li, Xiang Lorraine and Jiang, Liwei and Lin, Bill Yuchen and West, Peter and Bhagavatula, Chandra and Bras, Ronan Le and Hwang, Jena D and others},
booktitle=NIPS,
year={2023}
}
% tool
@inproceedings{gupta2023visual,
title={Visual Programming: Compositional Visual Reasoning without Training},
author={Gupta, Tanmay and Kembhavi, Aniruddha},
booktitle=CVPR,
pages={14953--14962},
year={2023}
}
@inproceedings{schick2023toolformer,
title={Toolformer: Language Models Can Teach Themselves to Use Tools},
author={Schick, Timo and Dwivedi-Yu, Jane and Dess{\`\i}, Roberto and Raileanu, Roberta and Lomeli, Maria and Zettlemoyer, Luke and Cancedda, Nicola and Scialom, Thomas},
booktitle=NIPS,
year={2023}
}
@inproceedings{hao2023toolkengpt,
title={{ToolkenGPT}: Augmenting Frozen Language Models with Massive Tools via Tool Embeddings},
author={Hao, Shibo and Liu, Tianyang and Wang, Zhen and Hu, Zhiting},
booktitle=NIPS,
year={2023}
}
@article{wu2023visual,
title={Visual {ChatGPT}: Talking, Drawing and Editing with Visual Foundation Models},
author={Wu, Chenfei and Yin, Shengming and Qi, Weizhen and Wang, Xiaodong and Tang, Zecheng and Duan, Nan},