% main.bib -- BibTeX bibliography database.
% NOTE(review): the original export carried web-viewer chrome ("Notifications",
% "Expand file tree", file-size banners) and a column of gutter line numbers
% (1..1000) above the first entry. That residue held no bibliographic data and
% was removed as an extraction artifact; BibTeX ignores text outside entries,
% but the stray lines made the file unreadable for maintainers.
% arXiv preprint; corporate author kept as one double-braced unit so BibTeX
% treats it as a single indivisible surname.
% NOTE(review): month=aug postdates the arXiv id (2503 = March 2025) --
% presumably taken from a later revision's date; confirm against the arXiv
% version history.
@misc{agibot-world-contributorsAgiBotWorldColosseo2025,
title = {{{AgiBot World Colosseo}}: {{A Large-scale Manipulation Platform}} for {{Scalable}} and {{Intelligent Embodied Systems}}},
shorttitle = {{{AgiBot World Colosseo}}},
author = {{AgiBot-World-Contributors} and Bu, Qingwen and Cai, Jisong and Chen, Li and Cui, Xiuqi and Ding, Yan and Feng, Siyuan and Gao, Shenyuan and He, Xindong and Hu, Xuan and Huang, Xu and Jiang, Shu and Jiang, Yuxin and Jing, Cheng and Li, Hongyang and Li, Jialu and Liu, Chiming and Liu, Yi and Lu, Yuxiang and Luo, Jianlan and Luo, Ping and Mu, Yao and Niu, Yuehan and Pan, Yixuan and Pang, Jiangmiao and Qiao, Yu and Ren, Guanghui and Ruan, Cheng and Shan, Jiaqi and Shen, Yongjian and Shi, Chengshi and Shi, Mingkang and Shi, Modi and Sima, Chonghao and Song, Jianheng and Wang, Huijie and Wang, Wenhao and Wei, Dafeng and Xie, Chengen and Xu, Guo and Yan, Junchi and Yang, Cunbiao and Yang, Lei and Yang, Shukai and Yao, Maoqing and Zeng, Jia and Zhang, Chi and Zhang, Qinglin and Zhao, Bin and Zhao, Chengyue and Zhao, Jiaqi and Zhu, Jianchao},
year = {2025},
month = aug,
number = {arXiv:2503.06669},
eprint = {2503.06669},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2503.06669},
urldate = {2025-08-27},
abstract = {We explore how scalable robot data can address real-world challenges for generalized robotic manipulation. Introducing AgiBot World, a large-scale platform comprising over 1 million trajectories across 217 tasks in five deployment scenarios, we achieve an order-of-magnitude increase in data scale compared to existing datasets. Accelerated by a standardized collection pipeline with human-in-the-loop verification, AgiBot World guarantees high-quality and diverse data distribution. It is extensible from grippers to dexterous hands and visuo-tactile sensors for fine-grained skill acquisition. Building on top of data, we introduce Genie Operator-1 (GO-1), a novel generalist policy that leverages latent action representations to maximize data utilization, demonstrating predictable performance scaling with increased data volume. Policies pre-trained on our dataset achieve an average performance improvement of 30\% over those trained on Open X-Embodiment, both in in-domain and out-of-distribution scenarios. GO-1 exhibits exceptional capability in real-world dexterous and long-horizon tasks, achieving over 60\% success rate on complex tasks and outperforming prior RDT approach by 32\%. By open-sourcing the dataset, tools, and models, we aim to democratize access to large-scale, high-quality robot data, advancing the pursuit of scalable and general-purpose intelligence.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/TGP4C7GA/AgiBot-World-Contributors et al. - 2025 - AgiBot World Colosseo A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Sys.pdf;/Users/fracapuano/Zotero/storage/IC7BUHWR/2503.html}
}
% NOTE(review): the record has no journal/year, so @article emitted "missing
% required field" warnings and broke standard styles. Downgraded to @misc
% (no required fields); citation key unchanged so existing \cite calls work.
% The PDF looks like course/lecture material -- verify and add year +
% howpublished once the provenance is known.
@misc{agrawalComputationalSensorimotorLearning,
  title = {Computational {{Sensorimotor Learning}}},
  author = {Agrawal, Pulkit},
  langid = {english},
  internal-note = {No year or venue in the source record; likely lecture notes -- TODO confirm and complete},
  file = {/Users/fracapuano/Zotero/storage/KSDX9GA2/Agrawal - Computational Sensorimotor Learning.pdf}
}
% OK (reviewed): complete arXiv @misc record -- unquoted month macro, bare
% DOI, eprint/primaryclass/archiveprefix triplet, braced proper nouns in title.
@misc{akkayaSolvingRubiksCube2019,
title = {Solving {{Rubik}}'s {{Cube}} with a {{Robot Hand}}},
author = {Akkaya, Ilge and Andrychowicz, Marcin and Chociej, Maciek and Litwin, Mateusz and McGrew, Bob and Petron, Arthur and Paino, Alex and Plappert, Matthias and Powell, Glenn and Ribas, Raphael and Schneider, Jonas and Tezak, Nikolas and Tworek, Jerry and Welinder, Peter and Weng, Lilian and Yuan, Qiming and Zaremba, Wojciech and Zhang, Lei},
year = {2019},
month = oct,
number = {arXiv:1910.07113},
eprint = {1910.07113},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1910.07113},
urldate = {2025-08-26},
abstract = {We demonstrate that models trained only in simulation can be used to solve a manipulation problem of unprecedented complexity on a real robot. This is made possible by two key components: a novel algorithm, which we call automatic domain randomization (ADR) and a robot platform built for machine learning. ADR automatically generates a distribution over randomized environments of ever-increasing difficulty. Control policies and vision state estimators trained with ADR exhibit vastly improved sim2real transfer. For control policies, memory-augmented models trained on an ADR-generated distribution of environments show clear signs of emergent meta-learning at test time. The combination of ADR with our custom robot platform allows us to solve a Rubik's cube with a humanoid robot hand, which involves both control and state estimation problems. Videos summarizing our results are available: https://openai.com/blog/solving-rubiks-cube/},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics,Statistics - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/5HNZLG9D/OpenAI et al. - 2019 - Solving Rubik's Cube with a Robot Hand.pdf;/Users/fracapuano/Zotero/storage/WSM7BJ4I/1910.html}
}
% OK (reviewed): complete arXiv @misc record following this file's convention
% (number/eprint/primaryclass/publisher/doi/urldate). Names in "Last, First"
% form with ` and ` separators.
@misc{alayracFlamingoVisualLanguage2022,
title = {Flamingo: A {{Visual Language Model}} for {{Few-Shot Learning}}},
shorttitle = {Flamingo},
author = {Alayrac, Jean-Baptiste and Donahue, Jeff and Luc, Pauline and Miech, Antoine and Barr, Iain and Hasson, Yana and Lenc, Karel and Mensch, Arthur and Millican, Katie and Reynolds, Malcolm and Ring, Roman and Rutherford, Eliza and Cabi, Serkan and Han, Tengda and Gong, Zhitao and Samangooei, Sina and Monteiro, Marianne and Menick, Jacob and Borgeaud, Sebastian and Brock, Andrew and Nematzadeh, Aida and Sharifzadeh, Sahand and Binkowski, Mikolaj and Barreira, Ricardo and Vinyals, Oriol and Zisserman, Andrew and Simonyan, Karen},
year = {2022},
month = nov,
number = {arXiv:2204.14198},
eprint = {2204.14198},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2204.14198},
urldate = {2025-08-27},
abstract = {Building models that can be rapidly adapted to novel tasks using only a handful of annotated examples is an open challenge for multimodal machine learning research. We introduce Flamingo, a family of Visual Language Models (VLM) with this ability. We propose key architectural innovations to: (i) bridge powerful pretrained vision-only and language-only models, (ii) handle sequences of arbitrarily interleaved visual and textual data, and (iii) seamlessly ingest images or videos as inputs. Thanks to their flexibility, Flamingo models can be trained on large-scale multimodal web corpora containing arbitrarily interleaved text and images, which is key to endow them with in-context few-shot learning capabilities. We perform a thorough evaluation of our models, exploring and measuring their ability to rapidly adapt to a variety of image and video tasks. These include open-ended tasks such as visual question-answering, where the model is prompted with a question which it has to answer; captioning tasks, which evaluate the ability to describe a scene or an event; and close-ended tasks such as multiple-choice visual question-answering. For tasks lying anywhere on this spectrum, a single Flamingo model can achieve a new state of the art with few-shot learning, simply by prompting the model with task-specific examples. On numerous benchmarks, Flamingo outperforms models fine-tuned on thousands of times more task-specific data.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/QZ69HN5K/Alayrac et al. - 2022 - Flamingo a Visual Language Model for Few-Shot Learning.pdf;/Users/fracapuano/Zotero/storage/JMAD5HJY/2204.html}
}
% NOTE(review): record had no journal/year, so @article triggered BibTeX
% "missing required field" warnings. Downgraded to @misc (no required fields);
% citation key unchanged so existing \cite calls still resolve.
@misc{aldacoALOHA2Enhanced,
  title = {{{ALOHA}} 2: {{An Enhanced Low-Cost Hardware}} for {{Bimanual Teleoperation}}},
  author = {Aldaco, Jorge and Armstrong, Travis and Baruch, Robert and Bingham, Jeff and Chan, Sanky and Dwibedi, Debidatta and Finn, Chelsea and Florence, Pete and Goodrich, Spencer and Gramlich, Wayne and Herzog, Alexander and Hoech, Jonathan and Nguyen, Thinh and Storz, Ian and Tabanpour, Baruch and Tompson, Jonathan and Wahid, Ayzaan and Wahrburg, Ted and Xu, Sichun and Yaroshenko, Sergey and Zhao, Tony Z},
  langid = {english},
  internal-note = {Presumably the 2024 arXiv technical report -- verify, then add year/eprint/doi},
  file = {/Users/fracapuano/Zotero/storage/LDEJG62Q/Aldaco et al. - ALOHA 2 An Enhanced Low-Cost Hardware for Bimanual Teleoperation.pdf}
}
% OK (reviewed): complete @article for a Frontiers journal.
% NOTE(review): no pages/eid; Frontiers uses article numbers -- the DOI suffix
% suggests eid 1470950; confirm before adding.
@article{alizadehComprehensiveSurveySpace2024,
title = {A Comprehensive Survey of Space Robotic Manipulators for On-Orbit Servicing},
author = {Alizadeh, Mohammad and Zhu, Zheng H.},
year = {2024},
month = oct,
journal = {Frontiers in Robotics and AI},
volume = {11},
publisher = {Frontiers},
issn = {2296-9144},
doi = {10.3389/frobt.2024.1470950},
urldate = {2025-08-26},
abstract = {On-Orbit Servicing (OOS) robots are transforming space exploration by enabling vital maintenance and repair of spacecraft directly in space. However, achieving precise and safe manipulation in microgravity necessitates overcoming significant challenges. This survey delves into four crucial areas essential for successful OOS manipulation: object state estimation, motion planning, and feedback control. Techniques from traditional vision to advanced X-ray and neural network methods are explored for object state estimation. Strategies for fuel-optimized trajectories, docking maneuvers, and collision avoidance are examined in motion planning. The survey also explores control methods for various scenarios, including cooperative manipulation and handling uncertainties, in feedback control. Additionally, this survey examines how Machine learning techniques can further propel OOS robots towards more complex and delicate tasks in space.},
langid = {english},
keywords = {control,machine learning,motion planning,on-orbit servicing,pose estimation,robotic manipulator,space robots},
file = {/Users/fracapuano/Zotero/storage/VA36KZYY/Alizadeh and Zhu - 2024 - A comprehensive survey of space robotic manipulators for on-orbit servicing.pdf}
}
% OK (reviewed): complete arXiv @misc record; accented names use BibTeX
% special-character brace groups ({\'a}, {\v c}) so sorting stays correct
% under classic 8-bit BibTeX.
@misc{allalSmolLM2WhenSmol2025,
title = {{{SmolLM2}}: {{When Smol Goes Big}} -- {{Data-Centric Training}} of a {{Small Language Model}}},
shorttitle = {{{SmolLM2}}},
author = {Allal, Loubna Ben and Lozhkov, Anton and Bakouch, Elie and Bl{\'a}zquez, Gabriel Mart{\'i}n and Penedo, Guilherme and Tunstall, Lewis and Marafioti, Andr{\'e}s and Kydl{\'i}{\v c}ek, Hynek and Lajar{\'i}n, Agust{\'i}n Piqueres and Srivastav, Vaibhav and Lochner, Joshua and Fahlgren, Caleb and Nguyen, Xuan-Son and Fourrier, Cl{\'e}mentine and Burtenshaw, Ben and Larcher, Hugo and Zhao, Haojun and Zakka, Cyril and Morlon, Mathieu and Raffel, Colin and von Werra, Leandro and Wolf, Thomas},
year = {2025},
month = feb,
number = {arXiv:2502.02737},
eprint = {2502.02737},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2502.02737},
urldate = {2025-09-09},
abstract = {While large language models have facilitated breakthroughs in many applications of artificial intelligence, their inherent largeness makes them computationally expensive and challenging to deploy in resource-constrained settings. In this paper, we document the development of SmolLM2, a state-of-the-art "small" (1.7 billion parameter) language model (LM). To attain strong performance, we overtrain SmolLM2 on {\textasciitilde}11 trillion tokens of data using a multi-stage training process that mixes web text with specialized math, code, and instruction-following data. We additionally introduce new specialized datasets (FineMath, Stack-Edu, and SmolTalk) at stages where we found existing datasets to be problematically small or low-quality. To inform our design decisions, we perform both small-scale ablations as well as a manual refinement process that updates the dataset mixing rates at each stage based on the performance at the previous stage. Ultimately, we demonstrate that SmolLM2 outperforms other recent small LMs including Qwen2.5-1.5B and Llama3.2-1B. To facilitate future research on LM development as well as applications of small LMs, we release both SmolLM2 as well as all of the datasets we prepared in the course of this project.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language},
file = {/Users/fracapuano/Zotero/storage/I7XDMSV7/Allal et al. - 2025 - SmolLM2 When Smol Goes Big -- Data-Centric Training of a Small Language Model.pdf;/Users/fracapuano/Zotero/storage/6MLZI84T/2502.html}
}
% OK (reviewed): complete arXiv @misc record, consistent with this file's
% arXiv-entry convention.
@misc{antonovaReinforcementLearningPivoting2017,
title = {Reinforcement {{Learning}} for {{Pivoting Task}}},
author = {Antonova, Rika and Cruciani, Silvia and Smith, Christian and Kragic, Danica},
year = {2017},
month = mar,
number = {arXiv:1703.00472},
eprint = {1703.00472},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1703.00472},
urldate = {2025-08-25},
abstract = {In this work we propose an approach to learn a robust policy for solving the pivoting task. Recently, several model-free continuous control algorithms were shown to learn successful policies without prior knowledge of the dynamics of the task. However, obtaining successful policies required thousands to millions of training episodes, limiting the applicability of these approaches to real hardware. We developed a training procedure that allows us to use a simple custom simulator to learn policies robust to the mismatch of simulation vs robot. In our experiments, we demonstrate that the policy learned in the simulator is able to pivot the object to the desired target angle on the real robot. We also show generalization to an object with different inertia, shape, mass and friction properties than those used during training. This result is a step towards making model-free reinforcement learning available for solving robotics tasks via pre-training in simulators that offer only an imprecise match to the real-world dynamics.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/WRZCHVGB/Antonova et al. - 2017 - Reinforcement Learning for Pivoting Task.pdf;/Users/fracapuano/Zotero/storage/WJEJ2VGU/1703.html}
}
% OK (reviewed): complete @article (journal, volume, number, pages, DOI);
% accent commands correctly wrapped as BibTeX special characters.
@article{aractingiControllingSolo12Quadruped2023,
title = {Controlling the {{Solo12}} Quadruped Robot with Deep Reinforcement Learning},
author = {Aractingi, Michel and L{\'e}ziart, Pierre-Alexandre and Flayols, Thomas and Perez, Julien and Silander, Tomi and Sou{\`e}res, Philippe},
year = {2023},
month = jul,
journal = {Scientific Reports},
volume = {13},
number = {1},
pages = {11945},
publisher = {Nature Publishing Group},
issn = {2045-2322},
doi = {10.1038/s41598-023-38259-7},
urldate = {2025-08-27},
abstract = {Quadruped robots require robust and general locomotion skills to exploit their mobility potential in complex and challenging environments. In this work, we present an implementation of a robust end-to-end learning-based controller on the Solo12 quadruped. Our method is based on deep reinforcement learning of joint impedance references. The resulting control policies follow a commanded velocity reference while being efficient in its energy consumption and easy to deploy. We detail the learning procedure and method for transfer on the real robot. We show elaborate experiments. Finally, we present experimental results of the learned locomotion on various grounds indoors and outdoors. These results show that the Solo12 robot is a suitable open-source platform for research combining learning and control because of the easiness in transferring and deploying learned controllers.},
copyright = {2023 The Author(s)},
langid = {english},
keywords = {Computer science,Information technology},
file = {/Users/fracapuano/Zotero/storage/84ZFT7RP/Aractingi et al. - 2023 - Controlling the Solo12 quadruped robot with deep reinforcement learning.pdf}
}
% NOTE(review): normalized to this file's arXiv-entry convention
% (number/publisher/doi/month were missing). The DOI follows arXiv's
% deterministic DataCite scheme 10.48550/arXiv.<id>; month=feb matches the
% 2502 identifier. Citation key left as-is to avoid breaking \cite callers.
@misc{bai2025qwen25vl,
  title = {Qwen2.5-{{VL}} Technical Report},
  author = {Bai, Shuai and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Song, Sibo and Dang, Kai and Wang, Peng and Wang, Shijie and Tang, Jun and Zhong, Humen and Zhu, Yuanzhi and Yang, Mingkun and Li, Zhaohai and Wan, Jianqiang and Wang, Pengfei and Ding, Wei and Fu, Zheren and Xu, Yiheng and Ye, Jiabo and Zhang, Xi and Xie, Tianbao and Cheng, Zesen and Zhang, Hang and Yang, Zhibo and Xu, Haiyang and Lin, Junyang},
  year = {2025},
  month = feb,
  number = {arXiv:2502.13923},
  eprint = {2502.13923},
  primaryclass = {cs.CV},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2502.13923},
  archiveprefix = {arXiv}
}
% OK (reviewed): complete arXiv @misc record.
% NOTE(review): the abstract contains Zotero-escaped math
% (\${\textbackslash}mathbf...) that renders as literal backslash text rather
% than $\mathbf{2.5\times}$; harmless to compilation, left untouched -- clean
% up only if abstracts are ever typeset.
@misc{ballEfficientOnlineReinforcement2023,
title = {Efficient {{Online Reinforcement Learning}} with {{Offline Data}}},
author = {Ball, Philip J. and Smith, Laura and Kostrikov, Ilya and Levine, Sergey},
year = {2023},
month = may,
number = {arXiv:2302.02948},
eprint = {2302.02948},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2302.02948},
urldate = {2025-08-30},
abstract = {Sample efficiency and exploration remain major challenges in online reinforcement learning (RL). A powerful approach that can be applied to address these issues is the inclusion of offline data, such as prior trajectories from a human expert or a sub-optimal exploration policy. Previous methods have relied on extensive modifications and additional complexity to ensure the effective use of this data. Instead, we ask: can we simply apply existing off-policy methods to leverage offline data when learning online? In this work, we demonstrate that the answer is yes; however, a set of minimal but important changes to existing off-policy RL algorithms are required to achieve reliable performance. We extensively ablate these design choices, demonstrating the key factors that most affect performance, and arrive at a set of recommendations that practitioners can readily apply, whether their data comprise a small number of expert demonstrations or large volumes of sub-optimal trajectories. We see that correct application of these simple recommendations can provide a \${\textbackslash}mathbf\{2.5{\textbackslash}times\}\$ improvement over existing approaches across a diverse set of competitive benchmarks, with no additional computational overhead. We have released our code at https://github.com/ikostrikov/rlpd.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/MUKA5D2V/Ball et al. - 2023 - Efficient Online Reinforcement Learning with Offline Data.pdf;/Users/fracapuano/Zotero/storage/IKURHC3D/2302.html}
}
% OK (reviewed): complete arXiv @misc record, consistent with this file's
% arXiv-entry convention.
@misc{bekrisStateRobotMotion2024,
title = {The {{State}} of {{Robot Motion Generation}}},
author = {Bekris, Kostas E. and Doerr, Joe and Meng, Patrick and Tangirala, Sumanth},
year = {2024},
month = oct,
number = {arXiv:2410.12172},
eprint = {2410.12172},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2410.12172},
urldate = {2025-08-26},
abstract = {This paper reviews the large spectrum of methods for generating robot motion proposed over the 50 years of robotics research culminating in recent developments. It crosses the boundaries of methodologies, typically not surveyed together, from those that operate over explicit models to those that learn implicit ones. The paper discusses the current state-of-the-art as well as properties of varying methodologies, highlighting opportunities for integration.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/DMJJZFDZ/Bekris et al. - 2024 - The State of Robot Motion Generation.pdf;/Users/fracapuano/Zotero/storage/TL42IRAN/2410.html}
}
% OK (reviewed): complete @article (Nature; volume/number/pages/DOI present).
% NOTE(review): inline digits in the abstract ("stratosphere1", "learning4,5")
% are flattened superscript citation markers from the publisher's HTML;
% harmless, left as exported.
@article{bellemareAutonomousNavigationStratospheric2020,
title = {Autonomous Navigation of Stratospheric Balloons Using Reinforcement Learning},
author = {Bellemare, Marc G. and Candido, Salvatore and Castro, Pablo Samuel and Gong, Jun and Machado, Marlos C. and Moitra, Subhodeep and Ponda, Sameera S. and Wang, Ziyu},
year = {2020},
month = dec,
journal = {Nature},
volume = {588},
number = {7836},
pages = {77--82},
publisher = {Nature Publishing Group},
issn = {1476-4687},
doi = {10.1038/s41586-020-2939-8},
urldate = {2025-08-31},
abstract = {Efficiently navigating a superpressure balloon in the stratosphere1 requires the integration of a multitude of cues, such as wind speed and solar elevation, and the process is complicated by forecast errors and sparse wind measurements. Coupled with the need to make decisions in real time, these factors rule out the use of conventional control techniques2,3. Here we describe the use of reinforcement learning4,5 to create a high-performing flight controller. Our algorithm uses data augmentation6,7 and a self-correcting design to overcome the key technical challenge of reinforcement learning from imperfect data, which has proved to be a major obstacle to its application to physical systems8. We deployed our controller to station Loon superpressure balloons at multiple locations across the globe, including a 39-day controlled experiment over the Pacific Ocean. Analyses show that the controller outperforms Loon's previous algorithm and is robust to the natural diversity in stratospheric winds. These results demonstrate that reinforcement learning is an effective solution to real-world autonomous control problems in which neither conventional methods nor human intervention suffice, offering clues about what may be needed to create artificially intelligent agents that continuously interact with real, dynamic environments.},
copyright = {2020 The Author(s), under exclusive licence to Springer Nature Limited},
langid = {english},
keywords = {Aerospace engineering,Computer science}
}
% Classic @article; the eprint/eprinttype pair records the JSTOR stable id.
% Field values unchanged; fields aligned and grouped (bibliographic data
% first, identifiers after) -- order is irrelevant to BibTeX.
@article{bellmanMarkovianDecisionProcess1957,
  title      = {A {{Markovian Decision Process}}},
  author     = {Bellman, Richard},
  year       = {1957},
  journal    = {Journal of Mathematics and Mechanics},
  volume     = {6},
  number     = {5},
  pages      = {679--684},
  publisher  = {Indiana University Mathematics Department},
  eprint     = {24900506},
  eprinttype = {jstor},
  issn       = {0095-9057},
  urldate    = {2025-08-30}
}
% OK (reviewed): complete arXiv @misc record; accents and {\v s} correctly
% wrapped as BibTeX special characters.
@misc{beyerPaliGemmaVersatile3B2024,
title = {{{PaliGemma}}: {{A}} Versatile {{3B VLM}} for Transfer},
shorttitle = {{{PaliGemma}}},
author = {Beyer, Lucas and Steiner, Andreas and Pinto, Andr{\'e} Susano and Kolesnikov, Alexander and Wang, Xiao and Salz, Daniel and Neumann, Maxim and Alabdulmohsin, Ibrahim and Tschannen, Michael and Bugliarello, Emanuele and Unterthiner, Thomas and Keysers, Daniel and Koppula, Skanda and Liu, Fangyu and Grycner, Adam and Gritsenko, Alexey and Houlsby, Neil and Kumar, Manoj and Rong, Keran and Eisenschlos, Julian and Kabra, Rishabh and Bauer, Matthias and Bo{\v s}njak, Matko and Chen, Xi and Minderer, Matthias and Voigtlaender, Paul and Bica, Ioana and Balazevic, Ivana and Puigcerver, Joan and Papalampidi, Pinelopi and Henaff, Olivier and Xiong, Xi and Soricut, Radu and Harmsen, Jeremiah and Zhai, Xiaohua},
year = {2024},
month = oct,
number = {arXiv:2407.07726},
eprint = {2407.07726},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2407.07726},
urldate = {2025-09-08},
abstract = {PaliGemma is an open Vision-Language Model (VLM) that is based on the SigLIP-So400m vision encoder and the Gemma-2B language model. It is trained to be a versatile and broadly knowledgeable base model that is effective to transfer. It achieves strong performance on a wide variety of open-world tasks. We evaluate PaliGemma on almost 40 diverse tasks including standard VLM benchmarks, but also more specialized tasks such as remote-sensing and segmentation.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/IPDYNWC4/Beyer et al. - 2024 - PaliGemma A versatile 3B VLM for transfer.pdf;/Users/fracapuano/Zotero/storage/R7UVD9WC/2407.html}
}
% OK (reviewed): complete arXiv @misc record; the quoted nickname in
% {Fan, Linxi "Jim"} is inside braces, so the double quotes are literal and
% safe for BibTeX parsing.
@misc{bjorckGR00TN1Open2025,
title = {{{GR00T N1}}: {{An Open Foundation Model}} for {{Generalist Humanoid Robots}}},
shorttitle = {{{GR00T N1}}},
author = {Bjorck, Johan and Casta{\~n}eda, Fernando and Cherniadev, Nikita and Da, Xingye and Ding, Runyu and Fan, Linxi "Jim" and Fang, Yu and Fox, Dieter and Hu, Fengyuan and Huang, Spencer and Jang, Joel and Jiang, Zhenyu and Kautz, Jan and Kundalia, Kaushil and Lao, Lawrence and Li, Zhiqi and Lin, Zongyu and Lin, Kevin and Liu, Guilin and Llontop, Edith and Magne, Loic and Mandlekar, Ajay and Narayan, Avnish and Nasiriany, Soroush and Reed, Scott and Tan, You Liang and Wang, Guanzhi and Wang, Zu and Wang, Jing and Wang, Qi and Xiang, Jiannan and Xie, Yuqi and Xu, Yinzhen and Xu, Zhenjia and Ye, Seonghyeon and Yu, Zhiding and Zhang, Ao and Zhang, Hao and Zhao, Yizhou and Zheng, Ruijie and Zhu, Yuke},
year = {2025},
month = mar,
number = {arXiv:2503.14734},
eprint = {2503.14734},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2503.14734},
urldate = {2025-08-26},
abstract = {General-purpose robots need a versatile body and an intelligent mind. Recent advancements in humanoid robots have shown great promise as a hardware platform for building generalist autonomy in the human world. A robot foundation model, trained on massive and diverse data sources, is essential for enabling the robots to reason about novel situations, robustly handle real-world variability, and rapidly learn new tasks. To this end, we introduce GR00T N1, an open foundation model for humanoid robots. GR00T N1 is a Vision-Language-Action (VLA) model with a dual-system architecture. The vision-language module (System 2) interprets the environment through vision and language instructions. The subsequent diffusion transformer module (System 1) generates fluid motor actions in real time. Both modules are tightly coupled and jointly trained end-to-end. We train GR00T N1 with a heterogeneous mixture of real-robot trajectories, human videos, and synthetically generated datasets. We show that our generalist robot model GR00T N1 outperforms the state-of-the-art imitation learning baselines on standard simulation benchmarks across multiple robot embodiments. Furthermore, we deploy our model on the Fourier GR-1 humanoid robot for language-conditioned bimanual manipulation tasks, achieving strong performance with high data efficiency.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/BDNSKFA6/NVIDIA et al. - 2025 - GR00T N1 An Open Foundation Model for Generalist Humanoid Robots.pdf;/Users/fracapuano/Zotero/storage/FENU9PQR/2503.html}
}
% NOTE(review): title/shorttitle were garbled by the exporter
% (\${$\pi\_$}0\$ typeset as a literal dollar sign, pi followed by an
% underscore character, then "0$"). Rewritten as protected math {$\pi_0$},
% which renders the intended subscripted pi-zero under any style.
% The citation key itself contains "$" -- fragile in some toolchains, but
% kept unchanged because renaming would break existing \cite calls.
@misc{black$p_0$VisionLanguageActionFlow2024,
title = {{$\pi_0$}: {{A Vision-Language-Action Flow Model}} for {{General Robot Control}}},
shorttitle = {{$\pi_0$}},
author = {Black, Kevin and Brown, Noah and Driess, Danny and Esmail, Adnan and Equi, Michael and Finn, Chelsea and Fusai, Niccolo and Groom, Lachy and Hausman, Karol and Ichter, Brian and Jakubczak, Szymon and Jones, Tim and Ke, Liyiming and Levine, Sergey and {Li-Bell}, Adrian and Mothukuri, Mohith and Nair, Suraj and Pertsch, Karl and Shi, Lucy Xiaoyang and Tanner, James and Vuong, Quan and Walling, Anna and Wang, Haohuan and Zhilinsky, Ury},
year = {2024},
month = oct,
number = {arXiv:2410.24164},
eprint = {2410.24164},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2410.24164},
urldate = {2025-08-28},
abstract = {Robot learning holds tremendous promise to unlock the full potential of flexible, general, and dexterous robot systems, as well as to address some of the deepest questions in artificial intelligence. However, bringing robot learning to the level of generality required for effective real-world systems faces major obstacles in terms of data, generalization, and robustness. In this paper, we discuss how generalist robot policies (i.e., robot foundation models) can address these challenges, and how we can design effective generalist robot policies for complex and highly dexterous tasks. We propose a novel flow matching architecture built on top of a pre-trained vision-language model (VLM) to inherit Internet-scale semantic knowledge. We then discuss how this model can be trained on a large and diverse dataset from multiple dexterous robot platforms, including single-arm robots, dual-arm robots, and mobile manipulators. We evaluate our model in terms of its ability to perform tasks in zero shot after pre-training, follow language instructions from people and from a high-level VLM policy, and its ability to acquire new skills via fine-tuning. Our results cover a wide variety of tasks, such as laundry folding, table cleaning, and assembling boxes.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/GUEM37NZ/Black et al. - 2024 - $π_0$ A Vision-Language-Action Flow Model for General Robot Control.pdf;/Users/fracapuano/Zotero/storage/FHYXZWF8/2410.html}
}
@inproceedings{BLIP-2,
title = {{{BLIP-2}}: Bootstrapping Language-Image Pre-Training with Frozen Image Encoders and Large Language Models},
booktitle = {Proceedings of the 40th International Conference on Machine Learning},
author = {Li, Junnan and Li, Dongxu and Savarese, Silvio and Hoi, Steven},
year = {2023},
series = {{{ICML}}'23},
publisher = {JMLR.org},
address = {Honolulu, Hawaii, USA},
abstract = {The cost of vision-and-language pre-training has become increasingly prohibitive due to end-to-end training of large-scale models. This paper proposes BLIP-2, a generic and efficient pretraining strategy that bootstraps vision-language pre-training from off-the-shelf frozen pretrained image encoders and frozen large language models. BLIP-2 bridges the modality gap with a lightweight Querying Transformer, which is pretrained in two stages. The first stage bootstraps vision-language representation learning from a frozen image encoder. The second stage bootstraps vision-to-language generative learning from a frozen language model. BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. For example, our model outperforms Flamingo80B by 8.7\% on zero-shot VQAv2 with 54x fewer trainable parameters. We also demonstrate the model's capabilities of zero-shot image-to-text generation that can follow natural language instructions.},
articleno = {814}
}
@misc{brohanRT1RoboticsTransformer2023,
title = {{{RT-1}}: {{Robotics Transformer}} for {{Real-World Control}} at {{Scale}}},
shorttitle = {{{RT-1}}},
author = {Brohan, Anthony and Brown, Noah and Carbajal, Justice and Chebotar, Yevgen and Dabis, Joseph and Finn, Chelsea and Gopalakrishnan, Keerthana and Hausman, Karol and Herzog, Alex and Hsu, Jasmine and Ibarz, Julian and Ichter, Brian and Irpan, Alex and Jackson, Tomas and Jesmonth, Sally and Joshi, Nikhil J. and Julian, Ryan and Kalashnikov, Dmitry and Kuang, Yuheng and Leal, Isabel and Lee, Kuang-Huei and Levine, Sergey and Lu, Yao and Malla, Utsav and Manjunath, Deeksha and Mordatch, Igor and Nachum, Ofir and Parada, Carolina and Peralta, Jodilyn and Perez, Emily and Pertsch, Karl and Quiambao, Jornell and Rao, Kanishka and Ryoo, Michael and Salazar, Grecia and Sanketi, Pannag and Sayed, Kevin and Singh, Jaspiar and Sontakke, Sumedh and Stone, Austin and Tan, Clayton and Tran, Huong and Vanhoucke, Vincent and Vega, Steve and Vuong, Quan and Xia, Fei and Xiao, Ted and Xu, Peng and Xu, Sichun and Yu, Tianhe and Zitkovich, Brianna},
year = {2023},
month = aug,
number = {arXiv:2212.06817},
eprint = {2212.06817},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2212.06817},
urldate = {2025-09-07},
abstract = {By transferring knowledge from large, diverse, task-agnostic datasets, modern machine learning models can solve specific downstream tasks either zero-shot or with small task-specific datasets to a high level of performance. While this capability has been demonstrated in other fields such as computer vision, natural language processing or speech recognition, it remains to be shown in robotics, where the generalization capabilities of the models are particularly critical due to the difficulty of collecting real-world robotic data. We argue that one of the keys to the success of such general robotic models lies with open-ended task-agnostic training, combined with high-capacity architectures that can absorb all of the diverse, robotic data. In this paper, we present a model class, dubbed Robotics Transformer, that exhibits promising scalable model properties. We verify our conclusions in a study of different model classes and their ability to generalize as a function of the data size, model size, and data diversity based on a large-scale data collection on real robots performing real-world tasks. The project's website and videos can be found at robotics-transformer1.github.io},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/TTBN3M5Y/Brohan et al. - 2023 - RT-1 Robotics Transformer for Real-World Control at Scale.pdf;/Users/fracapuano/Zotero/storage/DK3D593W/2212.html}
}
@misc{brohanRT2VisionLanguageActionModels2023,
title = {{{RT-2}}: {{Vision-Language-Action Models Transfer Web Knowledge}} to {{Robotic Control}}},
shorttitle = {{{RT-2}}},
author = {Brohan, Anthony and Brown, Noah and Carbajal, Justice and Chebotar, Yevgen and Chen, Xi and Choromanski, Krzysztof and Ding, Tianli and Driess, Danny and Dubey, Avinava and Finn, Chelsea and Florence, Pete and Fu, Chuyuan and Arenas, Montse Gonzalez and Gopalakrishnan, Keerthana and Han, Kehang and Hausman, Karol and Herzog, Alexander and Hsu, Jasmine and Ichter, Brian and Irpan, Alex and Joshi, Nikhil and Julian, Ryan and Kalashnikov, Dmitry and Kuang, Yuheng and Leal, Isabel and Lee, Lisa and Lee, Tsang-Wei Edward and Levine, Sergey and Lu, Yao and Michalewski, Henryk and Mordatch, Igor and Pertsch, Karl and Rao, Kanishka and Reymann, Krista and Ryoo, Michael and Salazar, Grecia and Sanketi, Pannag and Sermanet, Pierre and Singh, Jaspiar and Singh, Anikait and Soricut, Radu and Tran, Huong and Vanhoucke, Vincent and Vuong, Quan and Wahid, Ayzaan and Welker, Stefan and Wohlhart, Paul and Wu, Jialin and Xia, Fei and Xiao, Ted and Xu, Peng and Xu, Sichun and Yu, Tianhe and Zitkovich, Brianna},
year = {2023},
month = jul,
number = {arXiv:2307.15818},
eprint = {2307.15818},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2307.15818},
urldate = {2025-09-07},
abstract = {We study how vision-language models trained on Internet-scale data can be incorporated directly into end-to-end robotic control to boost generalization and enable emergent semantic reasoning. Our goal is to enable a single end-to-end trained model to both learn to map robot observations to actions and enjoy the benefits of large-scale pretraining on language and vision-language data from the web. To this end, we propose to co-fine-tune state-of-the-art vision-language models on both robotic trajectory data and Internet-scale vision-language tasks, such as visual question answering. In contrast to other approaches, we propose a simple, general recipe to achieve this goal: in order to fit both natural language responses and robotic actions into the same format, we express the actions as text tokens and incorporate them directly into the training set of the model in the same way as natural language tokens. We refer to such category of models as vision-language-action models (VLA) and instantiate an example of such a model, which we call RT-2. Our extensive evaluation (6k evaluation trials) shows that our approach leads to performant robotic policies and enables RT-2 to obtain a range of emergent capabilities from Internet-scale training. This includes significantly improved generalization to novel objects, the ability to interpret commands not present in the robot training data (such as placing an object onto a particular number or icon), and the ability to perform rudimentary reasoning in response to user commands (such as picking up the smallest or largest object, or the one closest to another object). We further show that incorporating chain of thought reasoning allows RT-2 to perform multi-stage semantic reasoning, for example figuring out which object to pick up for use as an improvised hammer (a rock), or which type of drink is best suited for someone who is tired (an energy drink).},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/CZHMNYPG/Brohan et al. - 2023 - RT-2 Vision-Language-Action Models Transfer Web Knowledge to Robotic Control.pdf;/Users/fracapuano/Zotero/storage/WN2E7AZH/2307.html}
}
@misc{brownLanguageModelsAre2020,
title = {Language {{Models}} Are {{Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and {Herbert-Voss}, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
year = {2020},
month = jul,
number = {arXiv:2005.14165},
eprint = {2005.14165},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2005.14165},
urldate = {2025-08-28},
abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language},
file = {/Users/fracapuano/Zotero/storage/L6J45ZW7/Brown et al. - 2020 - Language Models are Few-Shot Learners.pdf;/Users/fracapuano/Zotero/storage/52DC5AT2/2005.html}
}
@article{burridgeSequentialCompositionDynamically1999b,
title = {Sequential {{Composition}} of {{Dynamically Dexterous Robot Behaviors}}},
author = {Burridge, R. R. and Rizzi, A. A. and Koditschek, D. E.},
year = {1999},
month = jun,
journal = {The International Journal of Robotics Research},
volume = {18},
number = {6},
pages = {534--555},
issn = {0278-3649, 1741-3176},
doi = {10.1177/02783649922066385},
urldate = {2025-08-26},
abstract = {We report on our efforts to develop a sequential robot controllercomposition technique in the context of dexterous ``batting'' maneuvers. A robot with a flat paddle is required to strike repeatedly at a thrown ball until the ball is brought to rest on the paddle at a specified location. The robot's reachable workspace is blocked by an obstacle that disconnects the free space formed when the ball and paddle remain in contact, forcing the machine to ``let go'' for a time to bring the ball to the desired state. The controller compositions we create guarantee that a ball introduced in the ``safe workspace'' remains there and is ultimately brought to the goal. We report on experimental results from an implementation of these formal composition methods, and present descriptive statistics characterizing the experiments.},
copyright = {https://journals.sagepub.com/page/policies/text-and-data-mining-license},
langid = {english},
file = {/Users/fracapuano/Zotero/storage/TFZQ6EHJ/Burridge et al. - 1999 - Sequential Composition of Dynamically Dexterous Robot Behaviors.pdf}
}
@misc{cadeneLeRobotStateoftheartMachine2024,
title = {{{LeRobot}}: {{State-of-the-art Machine Learning}} for {{Real-World Robotics}} in {{Pytorch}}},
author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Galloudec, Quentin and Zouitine, Adil and Palma, Steven and Kooijmans, Pepijn and Aractingi, Michel and Shukor, Mustafa and Aubakirova, Dana and Russi, Martino and Capuano, Francesco and Pascal, Caroline and Chogari, Jade and Moss, Jess and Wolf, Thomas},
year = {2024}
}
@misc{caronEmergingPropertiesSelfSupervised2021,
title = {Emerging {{Properties}} in {{Self-Supervised Vision Transformers}}},
author = {Caron, Mathilde and Touvron, Hugo and Misra, Ishan and J{\'e}gou, Herv{\'e} and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand},
year = {2021},
month = may,
number = {arXiv:2104.14294},
eprint = {2104.14294},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2104.14294},
urldate = {2025-09-07},
abstract = {In this paper, we question if self-supervised learning provides new properties to Vision Transformer (ViT) that stand out compared to convolutional networks (convnets). Beyond the fact that adapting self-supervised methods to this architecture works particularly well, we make the following observations: first, self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets. Second, these features are also excellent k-NN classifiers, reaching 78.3\% top-1 on ImageNet with a small ViT. Our study also underlines the importance of momentum encoder, multi-crop training, and the use of small patches with ViTs. We implement our findings into a simple self-supervised method, called DINO, which we interpret as a form of self-distillation with no labels. We show the synergy between DINO and ViTs by achieving 80.1\% top-1 on ImageNet in linear evaluation with ViT-Base.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/Users/fracapuano/Zotero/storage/AYIY6DTF/Caron et al. - 2021 - Emerging Properties in Self-Supervised Vision Transformers.pdf;/Users/fracapuano/Zotero/storage/EKA7ZN2P/2104.html}
}
@inproceedings{chebotarClosingSimtorealLoop2019,
title = {Closing the Sim-to-Real Loop: {{Adapting}} Simulation Randomization with Real World Experience},
shorttitle = {Closing the Sim-to-Real Loop},
booktitle = {2019 {{International Conference}} on {{Robotics}} and {{Automation}} ({{ICRA}})},
author = {Chebotar, Yevgen and Handa, Ankur and Makoviychuk, Viktor and Macklin, Miles and Issac, Jan and Ratliff, Nathan and Fox, Dieter},
year = {2019},
pages = {8973--8979},
publisher = {IEEE},
urldate = {2025-08-31}
}
@misc{chenPaLIXScalingMultilingual2023,
title = {{{PaLI-X}}: {{On Scaling}} up a {{Multilingual Vision}} and {{Language Model}}},
shorttitle = {{{PaLI-X}}},
author = {Chen, Xi and Djolonga, Josip and Padlewski, Piotr and Mustafa, Basil and Changpinyo, Soravit and Wu, Jialin and Ruiz, Carlos Riquelme and Goodman, Sebastian and Wang, Xiao and Tay, Yi and Shakeri, Siamak and Dehghani, Mostafa and Salz, Daniel and Lucic, Mario and Tschannen, Michael and Nagrani, Arsha and Hu, Hexiang and Joshi, Mandar and Pang, Bo and Montgomery, Ceslee and Pietrzyk, Paulina and Ritter, Marvin and Piergiovanni, A. J. and Minderer, Matthias and Pavetic, Filip and Waters, Austin and Li, Gang and Alabdulmohsin, Ibrahim and Beyer, Lucas and Amelot, Julien and Lee, Kenton and Steiner, Andreas Peter and Li, Yang and Keysers, Daniel and Arnab, Anurag and Xu, Yuanzhong and Rong, Keran and Kolesnikov, Alexander and Seyedhosseini, Mojtaba and Angelova, Anelia and Zhai, Xiaohua and Houlsby, Neil and Soricut, Radu},
year = {2023},
month = may,
number = {arXiv:2305.18565},
eprint = {2305.18565},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.18565},
urldate = {2025-09-07},
abstract = {We present the training recipe and results of scaling up PaLI-X, a multilingual vision and language model, both in terms of size of the components and the breadth of its training task mixture. Our model achieves new levels of performance on a wide-range of varied and complex tasks, including multiple image-based captioning and question-answering tasks, image-based document understanding and few-shot (in-context) learning, as well as object detection, video question answering, and video captioning. PaLI-X advances the state-of-the-art on most vision-and-language benchmarks considered (25+ of them). Finally, we observe emerging capabilities, such as complex counting and multilingual object detection, tasks that are not explicitly in the training mix.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/UES2DMFM/Chen et al. - 2023 - PaLI-X On Scaling up a Multilingual Vision and Language Model.pdf;/Users/fracapuano/Zotero/storage/LEGNNSHS/2305.html}
}
@misc{chiDiffusionPolicyVisuomotor2024,
title = {Diffusion {{Policy}}: {{Visuomotor Policy Learning}} via {{Action Diffusion}}},
shorttitle = {Diffusion {{Policy}}},
author = {Chi, Cheng and Xu, Zhenjia and Feng, Siyuan and Cousineau, Eric and Du, Yilun and Burchfiel, Benjamin and Tedrake, Russ and Song, Shuran},
year = {2024},
month = mar,
number = {arXiv:2303.04137},
eprint = {2303.04137},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2303.04137},
urldate = {2025-08-28},
abstract = {This paper introduces Diffusion Policy, a new way of generating robot behavior by representing a robot's visuomotor policy as a conditional denoising diffusion process. We benchmark Diffusion Policy across 12 different tasks from 4 different robot manipulation benchmarks and find that it consistently outperforms existing state-of-the-art robot learning methods with an average improvement of 46.9\%. Diffusion Policy learns the gradient of the action-distribution score function and iteratively optimizes with respect to this gradient field during inference via a series of stochastic Langevin dynamics steps. We find that the diffusion formulation yields powerful advantages when used for robot policies, including gracefully handling multimodal action distributions, being suitable for high-dimensional action spaces, and exhibiting impressive training stability. To fully unlock the potential of diffusion models for visuomotor policy learning on physical robots, this paper presents a set of key technical contributions including the incorporation of receding horizon control, visual conditioning, and the time-series diffusion transformer. We hope this work will help motivate a new generation of policy learning techniques that are able to leverage the powerful generative modeling capabilities of diffusion models. Code, data, and training details is publicly available diffusion-policy.cs.columbia.edu},
archiveprefix = {arXiv},
keywords = {Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/7XRY3GJX/Chi et al. - 2024 - Diffusion Policy Visuomotor Policy Learning via Action Diffusion.pdf;/Users/fracapuano/Zotero/storage/BBBPKKMZ/2303.html}
}
@book{connellRobotLearning1993,
title = {Robot {{Learning}}},
editor = {Connell, Jonathan H. and Mahadevan, Sridhar},
year = {1993},
publisher = {Springer US},
address = {Boston, MA},
doi = {10.1007/978-1-4615-3184-5},
urldate = {2025-08-28},
copyright = {http://www.springer.com/tdm},
isbn = {978-1-4613-6396-5 978-1-4615-3184-5},
keywords = {algorithms,artificial intelligence,artificial life,autonom,autonomous robot,genetic algorithms,intelligence,learning,Navigation,programming,proving,robot,uncertainty}
}
@article{degraveMagneticControlTokamak2022,
title = {Magnetic Control of Tokamak Plasmas through Deep Reinforcement Learning},
author = {Degrave, Jonas and Felici, Federico and Buchli, Jonas and Neunert, Michael and Tracey, Brendan and Carpanese, Francesco and Ewalds, Timo and Hafner, Roland and Abdolmaleki, Abbas and {de las Casas}, Diego and Donner, Craig and Fritz, Leslie and Galperti, Cristian and Huber, Andrea and Keeling, James and Tsimpoukelli, Maria and Kay, Jackie and Merle, Antoine and Moret, Jean-Marc and Noury, Seb and Pesamosca, Federico and Pfau, David and Sauter, Olivier and Sommariva, Cristian and Coda, Stefano and Duval, Basil and Fasoli, Ambrogio and Kohli, Pushmeet and Kavukcuoglu, Koray and Hassabis, Demis and Riedmiller, Martin},
year = {2022},
month = feb,
journal = {Nature},
volume = {602},
number = {7897},
pages = {414--419},
publisher = {Nature Publishing Group},
issn = {1476-4687},
doi = {10.1038/s41586-021-04301-9},
urldate = {2025-08-31},
abstract = {Nuclear fusion using magnetic confinement, in particular in the tokamak configuration, is a promising path towards sustainable energy. A core challenge is to shape and maintain a high-temperature plasma within the tokamak vessel. This requires high-dimensional, high-frequency, closed-loop control using magnetic actuator coils, further complicated by the diverse requirements across a wide range of plasma configurations. In this work, we introduce a previously undescribed architecture for tokamak magnetic controller design that autonomously learns to command the full set of control coils. This architecture meets control objectives specified at a high level, at the same time satisfying physical and operational constraints. This approach has unprecedented flexibility and generality in problem specification and yields a notable reduction in design effort to produce new plasma configurations. We successfully produce and control a diverse set of plasma configurations on the Tokamak {\`a} Configuration Variable1,2, including elongated, conventional shapes, as well as advanced configurations, such as negative triangularity and `snowflake' configurations. Our approach achieves accurate tracking of the location, current and shape for these configurations. We also demonstrate sustained `droplets' on TCV, in which two separate plasmas are maintained simultaneously within the vessel. This represents a notable advance for tokamak feedback control, showing the potential of reinforcement learning to accelerate research in the fusion domain, and is one of the most challenging real-world systems to which reinforcement learning has been applied.},
copyright = {2022 The Author(s)},
langid = {english},
keywords = {Computer science,Magnetically confined plasmas,Nuclear fusion and fission},
file = {/Users/fracapuano/Zotero/storage/EZ4EAU84/Degrave et al. - 2022 - Magnetic control of tokamak plasmas through deep reinforcement learning.pdf}
}
@misc{devlinBERTPretrainingDeep2019,
title = {{{BERT}}: {{Pre-training}} of {{Deep Bidirectional Transformers}} for {{Language Understanding}}},
shorttitle = {{{BERT}}},
author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
year = {2019},
month = may,
number = {arXiv:1810.04805},
eprint = {1810.04805},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1810.04805},
urldate = {2025-09-08},
abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5\% (7.7\% point absolute improvement), MultiNLI accuracy to 86.7\% (4.6\% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language},
file = {/Users/fracapuano/Zotero/storage/AJ3SRLHF/Devlin et al. - 2019 - BERT Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf;/Users/fracapuano/Zotero/storage/LNIKJNIW/1810.html}
}
@misc{driessKnowledgeInsulatingVisionLanguageAction2025,
title = {Knowledge {{Insulating Vision-Language-Action Models}}: {{Train Fast}}, {{Run Fast}}, {{Generalize Better}}},
shorttitle = {Knowledge {{Insulating Vision-Language-Action Models}}},
author = {Driess, Danny and Springenberg, Jost Tobias and Ichter, Brian and Yu, Lili and {Li-Bell}, Adrian and Pertsch, Karl and Ren, Allen Z. and Walke, Homer and Vuong, Quan and Shi, Lucy Xiaoyang and Levine, Sergey},
year = {2025},
month = may,
number = {arXiv:2505.23705},
eprint = {2505.23705},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2505.23705},
urldate = {2025-09-09},
abstract = {Vision-language-action (VLA) models provide a powerful approach to training control policies for physical systems, such as robots, by combining end-to-end learning with transfer of semantic knowledge from web-scale vision-language model (VLM) training. However, the constraints of real-time control are often at odds with the design of VLMs: the most powerful VLMs have tens or hundreds of billions of parameters, presenting an obstacle to real-time inference, and operate on discrete tokens rather than the continuous-valued outputs that are required for controlling robots. To address this challenge, recent VLA models have used specialized modules for efficient continuous control, such as action experts or continuous output heads, which typically require adding new untrained parameters to the pretrained VLM backbone. While these modules improve real-time and control capabilities, it remains an open question whether they preserve or degrade the semantic knowledge contained in the pretrained VLM, and what effect they have on the VLA training dynamics. In this paper, we study this question in the context of VLAs that include a continuous diffusion or flow matching action expert, showing that naively including such experts significantly harms both training speed and knowledge transfer. We provide an extensive analysis of various design choices, their impact on performance and knowledge transfer, and propose a technique for insulating the VLM backbone during VLA training that mitigates this issue. Videos are available at https://pi.website/research/knowledge\_insulation.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/QHTS9JIC/Driess et al. - 2025 - Knowledge Insulating Vision-Language-Action Models Train Fast, Run Fast, Generalize Better.pdf;/Users/fracapuano/Zotero/storage/3U9FCXRB/2505.html}
}
@misc{driessPaLMEEmbodiedMultimodal2023,
title = {{{PaLM-E}}: {{An Embodied Multimodal Language Model}}},
shorttitle = {{{PaLM-E}}},
author = {Driess, Danny and Xia, Fei and Sajjadi, Mehdi S. M. and Lynch, Corey and Chowdhery, Aakanksha and Ichter, Brian and Wahid, Ayzaan and Tompson, Jonathan and Vuong, Quan and Yu, Tianhe and Huang, Wenlong and Chebotar, Yevgen and Sermanet, Pierre and Duckworth, Daniel and Levine, Sergey and Vanhoucke, Vincent and Hausman, Karol and Toussaint, Marc and Greff, Klaus and Zeng, Andy and Mordatch, Igor and Florence, Pete},
year = {2023},
month = mar,
number = {arXiv:2303.03378},
eprint = {2303.03378},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2303.03378},
urldate = {2025-09-07},
abstract = {Large language models excel at a wide range of complex tasks. However, enabling general inference in the real world, e.g., for robotics problems, raises the challenge of grounding. We propose embodied language models to directly incorporate real-world continuous sensor modalities into language models and thereby establish the link between words and percepts. Input to our embodied language model are multi-modal sentences that interleave visual, continuous state estimation, and textual input encodings. We train these encodings end-to-end, in conjunction with a pre-trained large language model, for multiple embodied tasks including sequential robotic manipulation planning, visual question answering, and captioning. Our evaluations show that PaLM-E, a single large embodied multimodal model, can address a variety of embodied reasoning tasks, from a variety of observation modalities, on multiple embodiments, and further, exhibits positive transfer: the model benefits from diverse joint training across internet-scale language, vision, and visual-language domains. Our largest model, PaLM-E-562B with 562B parameters, in addition to being trained on robotics tasks, is a visual-language generalist with state-of-the-art performance on OK-VQA, and retains generalist language capabilities with increasing scale.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/PQSPI784/Driess et al. - 2023 - PaLM-E An Embodied Multimodal Language Model.pdf;/Users/fracapuano/Zotero/storage/K3PJVSGB/2303.html}
}
@misc{esserScalingRectifiedFlow2024,
title = {Scaling {{Rectified Flow Transformers}} for {{High-Resolution Image Synthesis}}},
author = {Esser, Patrick and Kulal, Sumith and Blattmann, Andreas and Entezari, Rahim and M{\"u}ller, Jonas and Saini, Harry and Levi, Yam and Lorenz, Dominik and Sauer, Axel and Boesel, Frederic and Podell, Dustin and Dockhorn, Tim and English, Zion and Lacey, Kyle and Goodwin, Alex and Marek, Yannik and Rombach, Robin},
year = {2024},
month = mar,
number = {arXiv:2403.03206},
eprint = {2403.03206},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2403.03206},
urldate = {2025-09-07},
abstract = {Diffusion models create data from noise by inverting the forward paths of data towards noise and have emerged as a powerful generative modeling technique for high-dimensional, perceptual data such as images and videos. Rectified flow is a recent generative model formulation that connects data and noise in a straight line. Despite its better theoretical properties and conceptual simplicity, it is not yet decisively established as standard practice. In this work, we improve existing noise sampling techniques for training rectified flow models by biasing them towards perceptually relevant scales. Through a large-scale study, we demonstrate the superior performance of this approach compared to established diffusion formulations for high-resolution text-to-image synthesis. Additionally, we present a novel transformer-based architecture for text-to-image generation that uses separate weights for the two modalities and enables a bidirectional flow of information between image and text tokens, improving text comprehension, typography, and human preference ratings. We demonstrate that this architecture follows predictable scaling trends and correlates lower validation loss to improved text-to-image synthesis as measured by various metrics and human evaluations. Our largest models outperform state-of-the-art models, and we will make our experimental data, code, and model weights publicly available.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/Users/fracapuano/Zotero/storage/23TGK9JM/Esser et al. - 2024 - Scaling Rectified Flow Transformers for High-Resolution Image Synthesis.pdf;/Users/fracapuano/Zotero/storage/W2CRYPZY/2403.html}
}
@misc{fedusReviewSparseExpert2022,
title = {A {{Review}} of {{Sparse Expert Models}} in {{Deep Learning}}},
author = {Fedus, William and Dean, Jeff and Zoph, Barret},
year = {2022},
month = sep,
number = {arXiv:2209.01667},
eprint = {2209.01667},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2209.01667},
urldate = {2025-09-08},
abstract = {Sparse expert models are a thirty-year old concept re-emerging as a popular architecture in deep learning. This class of architecture encompasses Mixture-of-Experts, Switch Transformers, Routing Networks, BASE layers, and others, all with the unifying idea that each example is acted on by a subset of the parameters. By doing so, the degree of sparsity decouples the parameter count from the compute per example allowing for extremely large, but efficient models. The resulting models have demonstrated significant improvements across diverse domains such as natural language processing, computer vision, and speech recognition. We review the concept of sparse expert models, provide a basic description of the common algorithms, contextualize the advances in the deep learning era, and conclude by highlighting areas for future work.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/MZXG2WMJ/Fedus et al. - 2022 - A Review of Sparse Expert Models in Deep Learning.pdf;/Users/fracapuano/Zotero/storage/GLZINJYC/2209.html}
}
@misc{finiMultimodalAutoregressivePretraining2024,
title = {Multimodal {{Autoregressive Pre-training}} of {{Large Vision Encoders}}},
author = {Fini, Enrico and Shukor, Mustafa and Li, Xiujun and Dufter, Philipp and Klein, Michal and Haldimann, David and Aitharaju, Sai and da Costa, Victor Guilherme Turrisi and B{\'e}thune, Louis and Gan, Zhe and Toshev, Alexander T. and Eichner, Marcin and Nabi, Moin and Yang, Yinfei and Susskind, Joshua M. and {El-Nouby}, Alaaeldin},
year = {2024},
month = nov,
number = {arXiv:2411.14402},
eprint = {2411.14402},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2411.14402},
urldate = {2025-09-09},
abstract = {We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5\% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/ULTX55I6/Fini et al. - 2024 - Multimodal Autoregressive Pre-training of Large Vision Encoders.pdf;/Users/fracapuano/Zotero/storage/SUG2W6A9/2411.html}
}
@inproceedings{florenceImplicitBehavioralCloning2022,
title = {Implicit {{Behavioral Cloning}}},
booktitle = {Proceedings of the 5th {{Conference}} on {{Robot Learning}}},
author = {Florence, Pete and Lynch, Corey and Zeng, Andy and Ramirez, Oscar A. and Wahid, Ayzaan and Downs, Laura and Wong, Adrian and Lee, Johnny and Mordatch, Igor and Tompson, Jonathan},
year = {2022},
month = jan,
pages = {158--168},
publisher = {PMLR},
issn = {2640-3498},
urldate = {2025-09-01},
abstract = {We find that across a wide range of robot policy learning scenarios, treating supervised policy learning with an implicit model generally performs better, on average, than commonly used explicit models. We present extensive experiments on this finding, and we provide both intuitive insight and theoretical arguments distinguishing the properties of implicit models compared to their explicit counterparts, particularly with respect to approximating complex, potentially discontinuous and multi-valued (set-valued) functions. On robotic policy learning tasks we show that implicit behavior-cloning policies with energy-based models (EBM) often outperform common explicit (Mean Square Error, or Mixture Density) behavior-cloning policies, including on tasks with high-dimensional action spaces and visual image inputs. We find these policies provide competitive results or outperform state-of-the-art offline reinforcement learning methods on the challenging human-expert tasks from the D4RL benchmark suite, despite using no reward information. In the real world, robots with implicit policies can learn complex and remarkably subtle behaviors on contact-rich tasks from human demonstrations, including tasks with high combinatorial complexity and tasks requiring 1mm precision.},
langid = {english},
file = {/Users/fracapuano/Zotero/storage/Q8I5E862/Florence et al. - 2022 - Implicit Behavioral Cloning.pdf}
}
@inproceedings{FROMAGe,
title = {Grounding Language Models to Images for Multimodal Inputs and Outputs},
booktitle = {Proceedings of the 40th International Conference on Machine Learning},
author = {Koh, Jing Yu and Salakhutdinov, Ruslan and Fried, Daniel},
year = {2023},
publisher = {PMLR}
}
@article{fujitaDevelopmentRobotsNuclear2020,
title = {Development of {{Robots}} for {{Nuclear Power Plants}} and {{Their Application}} to {{New Fields}}},
author = {Fujita, Jun and Soda, Daisuke and Murata, Chotaro and Tsuhari, Hiroyuki},
year = {2020},
journal = {Mitsubishi Heavy Industries Technical Review},
volume = {57},
number = {4},
langid = {english},
file = {/Users/fracapuano/Zotero/storage/K349QTEG/Fujita et al. - 2020 - Development of Robots for Nuclear Power Plants and Their Application to New Fields.pdf}
}
@misc{grattafioriLlama3Herd2024,
title = {The {{Llama}} 3 {{Herd}} of {{Models}}},
author = {Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and {Al-Dahle}, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and Yang, Amy and Fan, Angela and Goyal, Anirudh and Hartshorn, Anthony and Yang, Aobo and Mitra, Archi and Sravankumar, Archie and Korenev, Artem and Hinsvark, Arthur and Rao, Arun and Zhang, Aston and Rodriguez, Aurelien and Gregerson, Austen and Spataru, Ava and Roziere, Baptiste and Biron, Bethany and Tang, Binh and Chern, Bobbie and Caucheteux, Charlotte and Nayak, Chaya and Bi, Chloe and Marra, Chris and McConnell, Chris and Keller, Christian and Touret, Christophe and Wu, Chunyang and Wong, Corinne and Ferrer, Cristian Canton and Nikolaidis, Cyrus and Allonsius, Damien and Song, Daniel and Pintz, Danielle and Livshits, Danny and Wyatt, Danny and Esiobu, David and Choudhary, Dhruv and Mahajan, Dhruv and {Garcia-Olano}, Diego and Perino, Diego and Hupkes, Dieuwke and Lakomkin, Egor and AlBadawy, Ehab and Lobanova, Elina and Dinan, Emily and Smith, Eric Michael and Radenovic, Filip and Guzm{\'a}n, Francisco and Zhang, Frank and Synnaeve, Gabriel and Lee, Gabrielle and Anderson, Georgia Lewis and Thattai, Govind and Nail, Graeme and Mialon, Gregoire and Pang, Guan and Cucurell, Guillem and Nguyen, Hailey and Korevaar, Hannah and Xu, Hu and Touvron, Hugo and Zarov, Iliyan and Ibarra, Imanol Arrieta and Kloumann, Isabel and Misra, Ishan and Evtimov, Ivan and Zhang, Jack and Copet, Jade and Lee, Jaewon and Geffert, Jan and Vranes, Jana and Park, Jason and Mahadeokar, Jay and Shah, Jeet and van der Linde, Jelmer and Billock, Jennifer and Hong, Jenny and Lee, Jenya and Fu, Jeremy and Chi, Jianfeng and Huang, Jianyu and Liu, Jiawen and Wang, Jie and Yu, Jiecao and Bitton, Joanna and Spisak, Joe and Park, Jongsoo and Rocca, Joseph and Johnstun, Joshua and Saxe, Joshua and Jia, Junteng and Alwala, Kalyan Vasuden and Prasad, Karthik and Upasani, Kartikeya and Plawiak, 
Kate and Li, Ke and Heafield, Kenneth and Stone, Kevin and {El-Arini}, Khalid and Iyer, Krithika and Malik, Kshitiz and Chiu, Kuenley and Bhalla, Kunal and Lakhotia, Kushal and {Rantala-Yeary}, Lauren and van der Maaten, Laurens and Chen, Lawrence and Tan, Liang and Jenkins, Liz and Martin, Louis and Madaan, Lovish and Malo, Lubo and Blecher, Lukas and Landzaat, Lukas and de Oliveira, Luke and Muzzi, Madeline and Pasupuleti, Mahesh and Singh, Mannat and Paluri, Manohar and Kardas, Marcin and Tsimpoukelli, Maria and Oldham, Mathew and Rita, Mathieu and Pavlova, Maya and Kambadur, Melanie and Lewis, Mike and Si, Min and Singh, Mitesh Kumar and Hassan, Mona and Goyal, Naman and Torabi, Narjes and Bashlykov, Nikolay and Bogoychev, Nikolay and Chatterji, Niladri and Zhang, Ning and Duchenne, Olivier and {\c C}elebi, Onur and Alrassy, Patrick and Zhang, Pengchuan and Li, Pengwei and Vasic, Petar and Weng, Peter and Bhargava, Prajjwal and Dubal, Pratik and Krishnan, Praveen and Koura, Punit Singh and Xu, Puxin and He, Qing and Dong, Qingxiao and Srinivasan, Ragavan and Ganapathy, Raj and Calderer, Ramon and Cabral, Ricardo Silveira and Stojnic, Robert and Raileanu, Roberta and Maheswari, Rohan and Girdhar, Rohit and Patel, Rohit and Sauvestre, Romain and Polidoro, Ronnie and Sumbaly, Roshan and Taylor, Ross and Silva, Ruan and Hou, Rui and Wang, Rui and Hosseini, Saghar and Chennabasappa, Sahana and Singh, Sanjay and Bell, Sean and Kim, Seohyun Sonia and Edunov, Sergey and Nie, Shaoliang and Narang, Sharan and Raparthy, Sharath and Shen, Sheng and Wan, Shengye and Bhosale, Shruti and Zhang, Shun and Vandenhende, Simon and Batra, Soumya and Whitman, Spencer and Sootla, Sten and Collot, Stephane and Gururangan, Suchin and Borodinsky, Sydney and Herman, Tamar and Fowler, Tara and Sheasha, Tarek and Georgiou, Thomas and Scialom, Thomas and Speckbacher, Tobias and Mihaylov, Todor and Xiao, Tong and Karn, Ujjwal and Goswami, Vedanuj and Gupta, Vibhor and Ramanathan, Vignesh and 
Kerkez, Viktor and Gonguet, Vincent and Do, Virginie and Vogeti, Vish and Albiero, V{\'i}tor and Petrovic, Vladan and Chu, Weiwei and Xiong, Wenhan and Fu, Wenyin and Meers, Whitney and Martinet, Xavier and Wang, Xiaodong and Wang, Xiaofang and Tan, Xiaoqing Ellen and Xia, Xide and Xie, Xinfeng and Jia, Xuchao and Wang, Xuewei and Goldschlag, Yaelle and Gaur, Yashesh and Babaei, Yasmine and Wen, Yi and Song, Yiwen and Zhang, Yuchen and Li, Yue and Mao, Yuning and Coudert, Zacharie Delpierre and Yan, Zheng and Chen, Zhengxing and Papakipos, Zoe and Singh, Aaditya and Srivastava, Aayushi and Jain, Abha and Kelsey, Adam and Shajnfeld, Adam and Gangidi, Adithya and Victoria, Adolfo and Goldstand, Ahuva and Menon, Ajay and Sharma, Ajay and Boesenberg, Alex and Baevski, Alexei and Feinstein, Allie and Kallet, Amanda and Sangani, Amit and Teo, Amos and Yunus, Anam and Lupu, Andrei and Alvarado, Andres and Caples, Andrew and Gu, Andrew and Ho, Andrew and Poulton, Andrew and Ryan, Andrew and Ramchandani, Ankit and Dong, Annie and Franco, Annie and Goyal, Anuj and Saraf, Aparajita and Chowdhury, Arkabandhu and Gabriel, Ashley and Bharambe, Ashwin and Eisenman, Assaf and Yazdan, Azadeh and James, Beau and Maurer, Ben and Leonhardi, Benjamin and Huang, Bernie and Loyd, Beth and Paola, Beto De and Paranjape, Bhargavi and Liu, Bing and Wu, Bo and Ni, Boyu and Hancock, Braden and Wasti, Bram and Spence, Brandon and Stojkovic, Brani and Gamido, Brian and Montalvo, Britt and Parker, Carl and Burton, Carly and Mejia, Catalina and Liu, Ce and Wang, Changhan and Kim, Changkyu and Zhou, Chao and Hu, Chester and Chu, Ching-Hsiang and Cai, Chris and Tindal, Chris and Feichtenhofer, Christoph and Gao, Cynthia and Civin, Damon and Beaty, Dana and Kreymer, Daniel and Li, Daniel and Adkins, David and Xu, David and Testuggine, Davide and David, Delia and Parikh, Devi and Liskovich, Diana and Foss, Didem and Wang, Dingkang and Le, Duc and Holland, Dustin and Dowling, Edward and Jamil, Eissa 
and Montgomery, Elaine and Presani, Eleonora and Hahn, Emily and Wood, Emily and Le, Eric-Tuan and Brinkman, Erik and Arcaute, Esteban and Dunbar, Evan and Smothers, Evan and Sun, Fei and Kreuk, Felix and Tian, Feng and Kokkinos, Filippos and Ozgenel, Firat and Caggioni, Francesco and Kanayet, Frank and Seide, Frank and Florez, Gabriela Medina and Schwarz, Gabriella and Badeer, Gada and Swee, Georgia and Halpern, Gil and Herman, Grant and Sizov, Grigory and Guangyi and Zhang and Lakshminarayanan, Guna and Inan, Hakan and Shojanazeri, Hamid and Zou, Han and Wang, Hannah and Zha, Hanwen and Habeeb, Haroun and Rudolph, Harrison and Suk, Helen and Aspegren, Henry and Goldman, Hunter and Zhan, Hongyuan and Damlaj, Ibrahim and Molybog, Igor and Tufanov, Igor and Leontiadis, Ilias and Veliche, Irina-Elena and Gat, Itai and Weissman, Jake and Geboski, James and Kohli, James and Lam, Janice and Asher, Japhet and Gaya, Jean-Baptiste and Marcus, Jeff and Tang, Jeff and Chan, Jennifer and Zhen, Jenny and Reizenstein, Jeremy and Teboul, Jeremy and Zhong, Jessica and Jin, Jian and Yang, Jingyi and Cummings, Joe and Carvill, Jon and Shepard, Jon and McPhie, Jonathan and Torres, Jonathan and Ginsburg, Josh and Wang, Junjie and Wu, Kai and U, Kam Hou and Saxena, Karan and Khandelwal, Kartikay and Zand, Katayoun and Matosich, Kathy and Veeraraghavan, Kaushik and Michelena, Kelly and Li, Keqian and Jagadeesh, Kiran and Huang, Kun and Chawla, Kunal and Huang, Kyle and Chen, Lailin and Garg, Lakshya and A, Lavender and Silva, Leandro and Bell, Lee and Zhang, Lei and Guo, Liangpeng and Yu, Licheng and Moshkovich, Liron and Wehrstedt, Luca and Khabsa, Madian and Avalani, Manav and Bhatt, Manish and Mankus, Martynas and Hasson, Matan and Lennie, Matthew and Reso, Matthias and Groshev, Maxim and Naumov, Maxim and Lathi, Maya and Keneally, Meghan and Liu, Miao and Seltzer, Michael L. 
and Valko, Michal and Restrepo, Michelle and Patel, Mihir and Vyatskov, Mik and Samvelyan, Mikayel and Clark, Mike and Macey, Mike and Wang, Mike and Hermoso, Miquel Jubert and Metanat, Mo and Rastegari, Mohammad and Bansal, Munish and Santhanam, Nandhini and Parks, Natascha and White, Natasha and Bawa, Navyata and Singhal, Nayan and Egebo, Nick and Usunier, Nicolas and Mehta, Nikhil and Laptev, Nikolay Pavlovich and Dong, Ning and Cheng, Norman and Chernoguz, Oleg and Hart, Olivia and Salpekar, Omkar and Kalinli, Ozlem and Kent, Parkin and Parekh, Parth and Saab, Paul and Balaji, Pavan and Rittner, Pedro and Bontrager, Philip and Roux, Pierre and Dollar, Piotr and Zvyagina, Polina and Ratanchandani, Prashant and Yuvraj, Pritish and Liang, Qian and Alao, Rachad and Rodriguez, Rachel and Ayub, Rafi and Murthy, Raghotham and Nayani, Raghu and Mitra, Rahul and Parthasarathy, Rangaprabhu and Li, Raymond and Hogan, Rebekkah and Battey, Robin and Wang, Rocky and Howes, Russ and Rinott, Ruty and Mehta, Sachin and Siby, Sachin and Bondu, Sai Jayesh and Datta, Samyak and Chugh, Sara and Hunt, Sara and Dhillon, Sargun and Sidorov, Sasha and Pan, Satadru and Mahajan, Saurabh and Verma, Saurabh and Yamamoto, Seiji and Ramaswamy, Sharadh and Lindsay, Shaun and Lindsay, Shaun and Feng, Sheng and Lin, Shenghao and Zha, Shengxin Cindy and Patil, Shishir and Shankar, Shiva and Zhang, Shuqiang and Zhang, Shuqiang and Wang, Sinong and Agarwal, Sneha and Sajuyigbe, Soji and Chintala, Soumith and Max, Stephanie and Chen, Stephen and Kehoe, Steve and Satterfield, Steve and Govindaprasad, Sudarshan and Gupta, Sumit and Deng, Summer and Cho, Sungmin and Virk, Sunny and Subramanian, Suraj and Choudhury, Sy and Goldman, Sydney and Remez, Tal and Glaser, Tamar and Best, Tamara and Koehler, Thilo and Robinson, Thomas and Li, Tianhe and Zhang, Tianjun and Matthews, Tim and Chou, Timothy and Shaked, Tzook and Vontimitta, Varun and Ajayi, Victoria and Montanez, Victoria and Mohan, Vijai and 
Kumar, Vinay Satish and Mangla, Vishal and Ionescu, Vlad and Poenaru, Vlad and Mihailescu, Vlad Tiberiu and Ivanov, Vladimir and Li, Wei and Wang, Wenchen and Jiang, Wenwen and Bouaziz, Wes and Constable, Will and Tang, Xiaocheng and Wu, Xiaojian and Wang, Xiaolan and Wu, Xilun and Gao, Xinbo and Kleinman, Yaniv and Chen, Yanjun and Hu, Ye and Jia, Ye and Qi, Ye and Li, Yenda and Zhang, Yilin and Zhang, Ying and Adi, Yossi and Nam, Youngjin and Yu and Wang and Zhao, Yu and Hao, Yuchen and Qian, Yundi and Li, Yunlu and He, Yuzi and Rait, Zach and DeVito, Zachary and Rosnbrick, Zef and Wen, Zhaoduo and Yang, Zhenyu and Zhao, Zhiwei and Ma, Zhiyu},
year = {2024},
month = nov,
number = {arXiv:2407.21783},
eprint = {2407.21783},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2407.21783},
urldate = {2025-09-09},
abstract = {Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and our Llama Guard 3 model for input and output safety. The paper also presents the results of experiments in which we integrate image, video, and speech capabilities into Llama 3 via a compositional approach. We observe this approach performs competitively with the state-of-the-art on image, video, and speech recognition tasks. The resulting models are not yet being broadly released as they are still under development.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition},
file = {/Users/fracapuano/Zotero/storage/88PJ48EN/Grattafiori et al. - 2024 - The Llama 3 Herd of Models.pdf;/Users/fracapuano/Zotero/storage/2LLAWX8L/2407.html}
}
@inproceedings{griffinWalkingStabilizationUsing2017,
title = {Walking {{Stabilization Using Step Timing}} and {{Location Adjustment}} on the {{Humanoid Robot}}, {{Atlas}}},
booktitle = {2017 {{IEEE}}/{{RSJ International Conference}} on {{Intelligent Robots}} and {{Systems}} ({{IROS}})},
author = {Griffin, Robert J. and Wiedebach, Georg and Bertrand, Sylvain and Leonessa, Alexander and Pratt, Jerry},
year = {2017},
month = sep,
eprint = {1703.00477},
primaryclass = {cs},
pages = {667--673},
doi = {10.1109/IROS.2017.8202223},
urldate = {2025-08-26},
abstract = {While humans are highly capable of recovering from external disturbances and uncertainties that result in large tracking errors, humanoid robots have yet to reliably mimic this level of robustness. Essential to this is the ability to combine traditional "ankle strategy" balancing with step timing and location adjustment techniques. In doing so, the robot is able to step quickly to the necessary location to continue walking. In this work, we present both a new swing speed up algorithm to adjust the step timing, allowing the robot to set the foot down more quickly to recover from errors in the direction of the current capture point dynamics, and a new algorithm to adjust the desired footstep, expanding the base of support to utilize the center of pressure (CoP)-based ankle strategy for balance. We then utilize the desired centroidal moment pivot (CMP) to calculate the momentum rate of change for our inverse-dynamics based whole-body controller. We present simulation and experimental results using this work, and discuss performance limitations and potential improvements.},
archiveprefix = {arXiv},
keywords = {Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/SSNAZ6U4/Griffin et al. - 2017 - Walking Stabilization Using Step Timing and Location Adjustment on the Humanoid Robot, Atlas.pdf;/Users/fracapuano/Zotero/storage/VP885PA9/1703.html}
}
@inproceedings{haarnojaReinforcementLearningDeep2017b,
title = {Reinforcement {{Learning}} with {{Deep Energy-Based Policies}}},
booktitle = {Proceedings of the 34th {{International Conference}} on {{Machine Learning}}},
author = {Haarnoja, Tuomas and Tang, Haoran and Abbeel, Pieter and Levine, Sergey},
year = {2017},
month = jul,
pages = {1352--1361},
publisher = {PMLR},
issn = {2640-3498},
urldate = {2025-08-31},
abstract = {We propose a method for learning expressive energy-based policies for continuous states and actions, which has been feasible only in tabular domains before. We apply our method to learning maximum entropy policies, resulting into a new algorithm, called soft Q-learning, that expresses the optimal policy via a Boltzmann distribution. We use the recently proposed amortized Stein variational gradient descent to learn a stochastic sampling network that approximates samples from this distribution. The benefits of the proposed algorithm include improved exploration and compositionality that allows transferring skills between tasks, which we confirm in simulated experiments with swimming and walking robots. We also draw a connection to actor-critic methods, which can be viewed performing approximate inference on the corresponding energy-based model.},
langid = {english},
file = {/Users/fracapuano/Zotero/storage/C59BJ4GU/Haarnoja et al. - 2017 - Reinforcement Learning with Deep Energy-Based Policies.pdf}
}
@misc{haarnojaSoftActorCriticOffPolicy2018,
title = {Soft {{Actor-Critic}}: {{Off-Policy Maximum Entropy Deep Reinforcement Learning}} with a {{Stochastic Actor}}},
shorttitle = {Soft {{Actor-Critic}}},
author = {Haarnoja, Tuomas and Zhou, Aurick and Abbeel, Pieter and Levine, Sergey},
year = {2018},
month = aug,
number = {arXiv:1801.01290},
eprint = {1801.01290},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1801.01290},
urldate = {2025-08-29},
abstract = {Model-free deep reinforcement learning (RL) algorithms have been demonstrated on a range of challenging decision making and control tasks. However, these methods typically suffer from two major challenges: very high sample complexity and brittle convergence properties, which necessitate meticulous hyperparameter tuning. Both of these challenges severely limit the applicability of such methods to complex, real-world domains. In this paper, we propose soft actor-critic, an off-policy actor-critic deep RL algorithm based on the maximum entropy reinforcement learning framework. In this framework, the actor aims to maximize expected reward while also maximizing entropy. That is, to succeed at the task while acting as randomly as possible. Prior deep RL methods based on this framework have been formulated as Q-learning methods. By combining off-policy updates with a stable stochastic actor-critic formulation, our method achieves state-of-the-art performance on a range of continuous control benchmark tasks, outperforming prior on-policy and off-policy methods. Furthermore, we demonstrate that, in contrast to other off-policy algorithms, our approach is very stable, achieving very similar performance across different random seeds.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/HG6UQIRM/Haarnoja et al. - 2018 - Soft Actor-Critic Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor.pdf;/Users/fracapuano/Zotero/storage/RKG3J7MX/1801.html}
}
@misc{hansenTemporalDifferenceLearning2022,
title = {Temporal {{Difference Learning}} for {{Model Predictive Control}}},
author = {Hansen, Nicklas and Wang, Xiaolong and Su, Hao},
year = {2022},
month = jul,
number = {arXiv:2203.04955},
eprint = {2203.04955},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2203.04955},
urldate = {2025-08-25},
abstract = {Data-driven model predictive control has two key advantages over model-free methods: a potential for improved sample efficiency through model learning, and better performance as computational budget for planning increases. However, it is both costly to plan over long horizons and challenging to obtain an accurate model of the environment. In this work, we combine the strengths of model-free and model-based methods. We use a learned task-oriented latent dynamics model for local trajectory optimization over a short horizon, and use a learned terminal value function to estimate long-term return, both of which are learned jointly by temporal difference learning. Our method, TD-MPC, achieves superior sample efficiency and asymptotic performance over prior work on both state and image-based continuous control tasks from DMControl and Meta-World. Code and video results are available at https://nicklashansen.github.io/td-mpc.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/TZF8LCDG/Hansen et al. - 2022 - Temporal Difference Learning for Model Predictive Control.pdf;/Users/fracapuano/Zotero/storage/WU2WWWQE/2203.html}
}
@misc{heessEmergenceLocomotionBehaviours2017,
title = {Emergence of {{Locomotion Behaviours}} in {{Rich Environments}}},
author = {Heess, Nicolas and TB, Dhruva and Sriram, Srinivasan and Lemmon, Jay and Merel, Josh and Wayne, Greg and Tassa, Yuval and Erez, Tom and Wang, Ziyu and Eslami, S. M. Ali and Riedmiller, Martin and Silver, David},
year = {2017},
month = jul,
number = {arXiv:1707.02286},
eprint = {1707.02286},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1707.02286},
urldate = {2025-09-02},
abstract = {The reinforcement learning paradigm allows, in principle, for complex behaviours to be learned directly from simple reward signals. In practice, however, it is common to carefully hand-design the reward function to encourage a particular solution, or to derive it from demonstration data. In this paper explore how a rich environment can help to promote the learning of complex behavior. Specifically, we train agents in diverse environmental contexts, and find that this encourages the emergence of robust behaviours that perform well across a suite of tasks. We demonstrate this principle for locomotion -- behaviours that are known for their sensitivity to the choice of reward. We train several simulated bodies on a diverse set of challenging terrains and obstacles, using a simple reward function based on forward progress. Using a novel scalable variant of policy gradient reinforcement learning, our agents learn to run, jump, crouch and turn as required by the environment without explicit reward-based guidance. A visual depiction of highlights of the learned behavior can be viewed following https://youtu.be/hx\_bgoTF7bs .},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence},
file = {/Users/fracapuano/Zotero/storage/9DZ8XEVY/Heess et al. - 2017 - Emergence of Locomotion Behaviours in Rich Environments.pdf;/Users/fracapuano/Zotero/storage/JUB2Q3WH/1707.html}
}
@inproceedings{higgins2017beta,
title = {Beta-{{VAE}}: {{Learning}} Basic Visual Concepts with a Constrained Variational Framework},
booktitle = {International Conference on Learning Representations},
author = {Higgins, Irina and Matthey, Loic and Pal, Arka and Burgess, Christopher and Glorot, Xavier and Botvinick, Matthew and Mohamed, Shakir and Lerchner, Alexander},
year = {2017}
}
@misc{hoDenoisingDiffusionProbabilistic2020,
title = {Denoising {{Diffusion Probabilistic Models}}},
author = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter},
year = {2020},
month = dec,
number = {arXiv:2006.11239},
eprint = {2006.11239},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2006.11239},
urldate = {2025-09-03},
abstract = {We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN. Our implementation is available at https://github.com/hojonathanho/diffusion},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/DE655AYQ/Ho et al. - 2020 - Denoising Diffusion Probabilistic Models.pdf;/Users/fracapuano/Zotero/storage/NVIS47ZH/2006.html}
}
@inproceedings{ImageNet_VSS09,
title = {Construction and Analysis of a Large Scale Image Ontology},
booktitle = {Vision Sciences Society Annual Meeting},
author = {Deng, J. and Li, K. and Do, M. and Su, H. and {Fei-Fei}, L.},
year = {2009},
publisher = {Vision Sciences Society}
}
@inproceedings{InstructBLIP,
title = {{{InstructBLIP}}: {{Towards}} General-Purpose Vision-Language Models with Instruction Tuning},
booktitle = {Thirty-Seventh Conference on Neural Information Processing Systems},
author = {Dai, Wenliang and Li, Junnan and Li, Dongxu and Tiong, Anthony and Zhao, Junqi and Wang, Weisheng and Li, Boyang and Fung, Pascale and Hoi, Steven},
year = {2023}
}
@misc{intelligence$p_05$VisionLanguageActionModel2025,
title = {{$\pi_{0.5}$}: A {{Vision-Language-Action Model}} with {{Open-World Generalization}}},
shorttitle = {{$\pi_{0.5}$}},
author = {{Physical Intelligence} and Black, Kevin and Brown, Noah and Darpinian, James and Dhabalia, Karan and Driess, Danny and Esmail, Adnan and Equi, Michael and Finn, Chelsea and Fusai, Niccolo and Galliker, Manuel Y. and Ghosh, Dibya and Groom, Lachy and Hausman, Karol and Ichter, Brian and Jakubczak, Szymon and Jones, Tim and Ke, Liyiming and LeBlanc, Devin and Levine, Sergey and {Li-Bell}, Adrian and Mothukuri, Mohith and Nair, Suraj and Pertsch, Karl and Ren, Allen Z. and Shi, Lucy Xiaoyang and Smith, Laura and Springenberg, Jost Tobias and Stachowicz, Kyle and Tanner, James and Vuong, Quan and Walke, Homer and Walling, Anna and Wang, Haohuan and Yu, Lili and Zhilinsky, Ury},
year = {2025},
month = apr,
number = {arXiv:2504.16054},
eprint = {2504.16054},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2504.16054},
urldate = {2025-09-12},
abstract = {In order for robots to be useful, they must perform practically relevant tasks in the real world, outside of the lab. While vision-language-action (VLA) models have demonstrated impressive results for end-to-end robot control, it remains an open question how far such models can generalize in the wild. We describe \${\textbackslash}pi\_\{0.5\}\$, a new model based on \${\textbackslash}pi\_\{0\}\$ that uses co-training on heterogeneous tasks to enable broad generalization. \${\textbackslash}pi\_\{0.5\}\${\textbackslash} uses data from multiple robots, high-level semantic prediction, web data, and other sources to enable broadly generalizable real-world robotic manipulation. Our system uses a combination of co-training and hybrid multi-modal examples that combine image observations, language commands, object detections, semantic subtask prediction, and low-level actions. Our experiments show that this kind of knowledge transfer is essential for effective generalization, and we demonstrate for the first time that an end-to-end learning-enabled robotic system can perform long-horizon and dexterous manipulation skills, such as cleaning a kitchen or bedroom, in entirely new homes.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/UC3RB96R/Intelligence et al. - 2025 - $π_ 0.5 $ a Vision-Language-Action Model with Open-World Generalization.pdf;/Users/fracapuano/Zotero/storage/DSFCCRF3/2504.html}
}
@misc{jangBCZZeroShotTask2022,
title = {{BC-Z}: Zero-Shot Task Generalization with Robotic Imitation Learning},
shorttitle = {{BC-Z}},
author = {Jang, Eric and Irpan, Alex and Khansari, Mohi and Kappler, Daniel and Ebert, Frederik and Lynch, Corey and Levine, Sergey and Finn, Chelsea},
year = {2022},
month = feb,
number = {arXiv:2202.02005},
eprint = {2202.02005},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2202.02005},
urldate = {2025-09-01},
abstract = {In this paper, we study the problem of enabling a vision-based robotic manipulation system to generalize to novel tasks, a long-standing challenge in robot learning. We approach the challenge from an imitation learning perspective, aiming to study how scaling and broadening the data collected can facilitate such generalization. To that end, we develop an interactive and flexible imitation learning system that can learn from both demonstrations and interventions and can be conditioned on different forms of information that convey the task, including pre-trained embeddings of natural language or videos of humans performing the task. When scaling data collection on a real robot to more than 100 distinct tasks, we find that this system can perform 24 unseen manipulation tasks with an average success rate of 44\%, without any robot demonstrations for those tasks.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/YDG2WMDC/Jang et al. - 2022 - BC-Z Zero-Shot Task Generalization with Robotic Imitation Learning.pdf;/Users/fracapuano/Zotero/storage/ZZ47RG6V/2202.html}
}
@misc{jannerPlanningDiffusionFlexible2022,
title = {Planning with Diffusion for Flexible Behavior Synthesis},
author = {Janner, Michael and Du, Yilun and Tenenbaum, Joshua B. and Levine, Sergey},
year = {2022},
month = dec,
number = {arXiv:2205.09991},
eprint = {2205.09991},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2205.09991},
urldate = {2025-09-03},
abstract = {Model-based reinforcement learning methods often use learning only for the purpose of estimating an approximate dynamics model, offloading the rest of the decision-making work to classical trajectory optimizers. While conceptually simple, this combination has a number of empirical shortcomings, suggesting that learned models may not be well-suited to standard trajectory optimization. In this paper, we consider what it would look like to fold as much of the trajectory optimization pipeline as possible into the modeling problem, such that sampling from the model and planning with it become nearly identical. The core of our technical approach lies in a diffusion probabilistic model that plans by iteratively denoising trajectories. We show how classifier-guided sampling and image inpainting can be reinterpreted as coherent planning strategies, explore the unusual and useful properties of diffusion-based planning methods, and demonstrate the effectiveness of our framework in control settings that emphasize long-horizon decision-making and test-time flexibility.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/6S28T733/Janner et al. - 2022 - Planning with Diffusion for Flexible Behavior Synthesis.pdf;/Users/fracapuano/Zotero/storage/DRH9ZWCG/2205.html}
}
@misc{jiangMistral7B2023,
title = {Mistral {7B}},
author = {Jiang, Albert Q. and Sablayrolles, Alexandre and Mensch, Arthur and Bamford, Chris and Chaplot, Devendra Singh and de las Casas, Diego and Bressand, Florian and Lengyel, Gianna and Lample, Guillaume and Saulnier, Lucile and Lavaud, L{\'e}lio Renard and Lachaux, Marie-Anne and Stock, Pierre and Scao, Teven Le and Lavril, Thibaut and Wang, Thomas and Lacroix, Timoth{\'e}e and Sayed, William El},
year = {2023},
month = oct,
number = {arXiv:2310.06825},
eprint = {2310.06825},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2310.06825},
urldate = {2025-09-09},
abstract = {We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are released under the Apache 2.0 license.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/JJX9Q8J9/Jiang et al. - 2023 - Mistral 7B.pdf;/Users/fracapuano/Zotero/storage/WTMQBRW3/2310.html}
}
@misc{jiDribbleBotDynamicLegged2023,
title = {{DribbleBot}: Dynamic Legged Manipulation in the Wild},
shorttitle = {{DribbleBot}},
author = {Ji, Yandong and Margolis, Gabriel B. and Agrawal, Pulkit},
year = {2023},
month = apr,
number = {arXiv:2304.01159},
eprint = {2304.01159},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2304.01159},
urldate = {2025-08-26},
abstract = {DribbleBot (Dexterous Ball Manipulation with a Legged Robot) is a legged robotic system that can dribble a soccer ball under the same real-world conditions as humans (i.e., in-the-wild). We adopt the paradigm of training policies in simulation using reinforcement learning and transferring them into the real world. We overcome critical challenges of accounting for variable ball motion dynamics on different terrains and perceiving the ball using body-mounted cameras under the constraints of onboard computing. Our results provide evidence that current quadruped platforms are well-suited for studying dynamic whole-body control problems involving simultaneous locomotion and manipulation directly from sensory observations.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/ABSRE4C4/Ji et al. - 2023 - DribbleBot Dynamic Legged Manipulation in the Wild.pdf;/Users/fracapuano/Zotero/storage/ADI4QNCY/2304.html}
}
@misc{kakaobrain2022coyo700m,
title = {{COYO-700M}: Image-Text Pair Dataset},
author = {Byeon, Minwoo and Park, Beomhee and Kim, Haecheon and Lee, Sungjun and Baek, Woonhyuk and Kim, Saehoon},
year = {2022},
url = {https://github.com/kakaobrain/coyo-dataset}
}
@misc{kaplanScalingLawsNeural2020,
title = {Scaling Laws for Neural Language Models},
author = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B. and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},
year = {2020},
month = jan,
number = {arXiv:2001.08361},
eprint = {2001.08361},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2001.08361},
urldate = {2025-09-07},
abstract = {We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are significantly more sample-efficient, such that optimally compute-efficient training involves training very large models on a relatively modest amount of data and stopping significantly before convergence.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/fracapuano/Zotero/storage/MI5AGWBH/Kaplan et al. - 2020 - Scaling Laws for Neural Language Models.pdf;/Users/fracapuano/Zotero/storage/SBZT8DDY/2001.html}
}
@misc{keGraspingChopsticksCombating2020,
title = {Grasping with Chopsticks: Combating Covariate Shift in Model-Free Imitation Learning for Fine Manipulation},
shorttitle = {Grasping with Chopsticks},
author = {Ke, Liyiming and Wang, Jingqiang and Bhattacharjee, Tapomayukh and Boots, Byron and Srinivasa, Siddhartha},
year = {2020},
month = nov,
number = {arXiv:2011.06719},
eprint = {2011.06719},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2011.06719},
urldate = {2025-09-01},
abstract = {Billions of people use chopsticks, a simple yet versatile tool, for fine manipulation of everyday objects. The small, curved, and slippery tips of chopsticks pose a challenge for picking up small objects, making them a suitably complex test case. This paper leverages human demonstrations to develop an autonomous chopsticks-equipped robotic manipulator. Due to the lack of accurate models for fine manipulation, we explore model-free imitation learning, which traditionally suffers from the covariate shift phenomenon that causes poor generalization. We propose two approaches to reduce covariate shift, neither of which requires access to an interactive expert or a model, unlike previous approaches. First, we alleviate single-step prediction errors by applying an invariant operator to increase the data support at critical steps for grasping. Second, we generate synthetic corrective labels by adding bounded noise and combining parametric and non-parametric methods to prevent error accumulation. We demonstrate our methods on a real chopstick-equipped robot that we built, and observe the agent's success rate increase from 37.3\% to 80\%, which is comparable to the human expert performance of 82.6\%.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/ZUPECLSW/Ke et al. - 2020 - Grasping with Chopsticks Combating Covariate Shift in Model-free Imitation Learning for Fine Manipu.pdf;/Users/fracapuano/Zotero/storage/X7PX638S/2011.html}
}
@misc{khazatskyDROIDLargeScaleInTheWild2025,
title = {{DROID}: A Large-Scale In-The-Wild Robot Manipulation Dataset},
shorttitle = {{DROID}},
author = {Khazatsky, Alexander and Pertsch, Karl and Nair, Suraj and Balakrishna, Ashwin and Dasari, Sudeep and Karamcheti, Siddharth and Nasiriany, Soroush and Srirama, Mohan Kumar and Chen, Lawrence Yunliang and Ellis, Kirsty and Fagan, Peter David and Hejna, Joey and Itkina, Masha and Lepert, Marion and Ma, Yecheng Jason and Miller, Patrick Tree and Wu, Jimmy and Belkhale, Suneel and Dass, Shivin and Ha, Huy and Jain, Arhan and Lee, Abraham and Lee, Youngwoon and Memmel, Marius and Park, Sungjae and Radosavovic, Ilija and Wang, Kaiyuan and Zhan, Albert and Black, Kevin and Chi, Cheng and Hatch, Kyle Beltran and Lin, Shan and Lu, Jingpei and Mercat, Jean and Rehman, Abdul and Sanketi, Pannag R. and Sharma, Archit and Simpson, Cody and Vuong, Quan and Walke, Homer Rich and Wulfe, Blake and Xiao, Ted and Yang, Jonathan Heewon and Yavary, Arefeh and Zhao, Tony Z. and Agia, Christopher and Baijal, Rohan and Castro, Mateo Guaman and Chen, Daphne and Chen, Qiuyu and Chung, Trinity and Drake, Jaimyn and Foster, Ethan Paul and Gao, Jensen and Guizilini, Vitor and Herrera, David Antonio and Heo, Minho and Hsu, Kyle and Hu, Jiaheng and Irshad, Muhammad Zubair and Jackson, Donovon and Le, Charlotte and Li, Yunshuang and Lin, Kevin and Lin, Roy and Ma, Zehan and Maddukuri, Abhiram and Mirchandani, Suvir and Morton, Daniel and Nguyen, Tony and O'Neill, Abigail and Scalise, Rosario and Seale, Derick and Son, Victor and Tian, Stephen and Tran, Emi and Wang, Andrew E. and Wu, Yilin and Xie, Annie and Yang, Jingyun and Yin, Patrick and Zhang, Yunchu and Bastani, Osbert and Berseth, Glen and Bohg, Jeannette and Goldberg, Ken and Gupta, Abhinav and Gupta, Abhishek and Jayaraman, Dinesh and Lim, Joseph J. and Malik, Jitendra and {Mart{\'i}n-Mart{\'i}n}, Roberto and Ramamoorthy, Subramanian and Sadigh, Dorsa and Song, Shuran and Wu, Jiajun and Yip, Michael C. and Zhu, Yuke and Kollar, Thomas and Levine, Sergey and Finn, Chelsea},
year = {2025},
month = apr,
number = {arXiv:2403.12945},
eprint = {2403.12945},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2403.12945},
urldate = {2025-09-08},
abstract = {The creation of large, diverse, high-quality robot manipulation datasets is an important stepping stone on the path toward more capable and robust robotic manipulation policies. However, creating such datasets is challenging: collecting robot manipulation data in diverse environments poses logistical and safety challenges and requires substantial investments in hardware and human labour. As a result, even the most general robot manipulation policies today are mostly trained on data collected in a small number of environments with limited scene and task diversity. In this work, we introduce DROID (Distributed Robot Interaction Dataset), a diverse robot manipulation dataset with 76k demonstration trajectories or 350 hours of interaction data, collected across 564 scenes and 84 tasks by 50 data collectors in North America, Asia, and Europe over the course of 12 months. We demonstrate that training with DROID leads to policies with higher performance and improved generalization ability. We open source the full dataset, policy learning code, and a detailed guide for reproducing our robot hardware setup.},
archiveprefix = {arXiv},
keywords = {Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/XZ5Y4HZS/Khazatsky et al. - 2025 - DROID A Large-Scale In-The-Wild Robot Manipulation Dataset.pdf;/Users/fracapuano/Zotero/storage/N2Z72XLK/2403.html}
}
@misc{kimOpenVLAOpenSourceVisionLanguageAction2024,
title = {{OpenVLA}: An Open-Source Vision-Language-Action Model},
shorttitle = {{OpenVLA}},
author = {Kim, Moo Jin and Pertsch, Karl and Karamcheti, Siddharth and Xiao, Ted and Balakrishna, Ashwin and Nair, Suraj and Rafailov, Rafael and Foster, Ethan and Lam, Grace and Sanketi, Pannag and Vuong, Quan and Kollar, Thomas and Burchfiel, Benjamin and Tedrake, Russ and Sadigh, Dorsa and Levine, Sergey and Liang, Percy and Finn, Chelsea},
year = {2024},
month = sep,
number = {arXiv:2406.09246},
eprint = {2406.09246},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2406.09246},
urldate = {2025-09-08},
abstract = {Large policies pretrained on a combination of Internet-scale vision-language data and diverse robot demonstrations have the potential to change how we teach robots new skills: rather than training new behaviors from scratch, we can fine-tune such vision-language-action (VLA) models to obtain robust, generalizable policies for visuomotor control. Yet, widespread adoption of VLAs for robotics has been challenging as 1) existing VLAs are largely closed and inaccessible to the public, and 2) prior work fails to explore methods for efficiently fine-tuning VLAs for new tasks, a key component for adoption. Addressing these challenges, we introduce OpenVLA, a 7B-parameter open-source VLA trained on a diverse collection of 970k real-world robot demonstrations. OpenVLA builds on a Llama 2 language model combined with a visual encoder that fuses pretrained features from DINOv2 and SigLIP. As a product of the added data diversity and new model components, OpenVLA demonstrates strong results for generalist manipulation, outperforming closed models such as RT-2-X (55B) by 16.5\% in absolute task success rate across 29 tasks and multiple robot embodiments, with 7x fewer parameters. We further show that we can effectively fine-tune OpenVLA for new settings, with especially strong generalization results in multi-task environments involving multiple objects and strong language grounding abilities, and outperform expressive from-scratch imitation learning methods such as Diffusion Policy by 20.4\%. We also explore compute efficiency; as a separate contribution, we show that OpenVLA can be fine-tuned on consumer GPUs via modern low-rank adaptation methods and served efficiently via quantization without a hit to downstream success rate. Finally, we release model checkpoints, fine-tuning notebooks, and our PyTorch codebase with built-in support for training VLAs at scale on Open X-Embodiment datasets.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/XR2SX8WG/Kim et al. - 2024 - OpenVLA An Open-Source Vision-Language-Action Model.pdf;/Users/fracapuano/Zotero/storage/63Q96WRV/2406.html}
}
@misc{kingma2013auto,
title = {Auto-Encoding Variational {Bayes}},
author = {Kingma, Diederik P. and Welling, Max},
year = {2013},
month = dec,
number = {arXiv:1312.6114},
eprint = {1312.6114},
primaryclass = {stat.ML},
publisher = {arXiv},
doi = {10.48550/arXiv.1312.6114},
abstract = {How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions are two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.},
archiveprefix = {arXiv}
}
@misc{knightStandardOpenSO100,
title = {Standard {{Open SO-100}} \& {{SO-101 Arms}}},
author = {Knight, Rob and Kooijmans, Pepijn and Wolf, Thomas and Alibert, Simon and Aractingi, Michel and Aubakirova, Dana and Zouitine, Adil and Russi, Martino and Palma, Steven and Pascal, Caroline and Cadene, Remi},
year = {2025},
url = {https://github.com/TheRobotStudio/SO-ARM100}
}
@article{koberReinforcementLearningRobotics,
title = {Reinforcement Learning in Robotics: A Survey},
author = {Kober, Jens and Bagnell, J. Andrew and Peters, Jan},
year = {2013},
journal = {The International Journal of Robotics Research},
volume = {32},
number = {11},
pages = {1238--1274},
doi = {10.1177/0278364913495721},
langid = {english},
file = {/Users/fracapuano/Zotero/storage/72PRHGKL/Kober et al. - Reinforcement Learning in Robotics A Survey.pdf}
}
@inproceedings{kong2024audioflam,
title = {{Audio Flamingo}: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities},
booktitle = {International Conference on Machine Learning},
author = {Kong, Zhifeng and Goel, Arushi and Badlani, Rohan and Ping, Wei and Valle, Rafael and Catanzaro, Bryan},
year = {2024},
pages = {25125--25148},
publisher = {PMLR}
}
@misc{kumarRMARapidMotor2021,
title = {{RMA}: Rapid Motor Adaptation for Legged Robots},
shorttitle = {{RMA}},
author = {Kumar, Ashish and Fu, Zipeng and Pathak, Deepak and Malik, Jitendra},
year = {2021},
month = jul,
number = {arXiv:2107.04034},
eprint = {2107.04034},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2107.04034},
urldate = {2025-08-27},
abstract = {Successful real-world deployment of legged robots would require them to adapt in real-time to unseen scenarios like changing terrains, changing payloads, wear and tear. This paper presents Rapid Motor Adaptation (RMA) algorithm to solve this problem of real-time online adaptation in quadruped robots. RMA consists of two components: a base policy and an adaptation module. The combination of these components enables the robot to adapt to novel situations in fractions of a second. RMA is trained completely in simulation without using any domain knowledge like reference trajectories or predefined foot trajectory generators and is deployed on the A1 robot without any fine-tuning. We train RMA on a varied terrain generator using bioenergetics-inspired rewards and deploy it on a variety of difficult terrains including rocky, slippery, deformable surfaces in environments with grass, long vegetation, concrete, pebbles, stairs, sand, etc. RMA shows state-of-the-art performance across diverse real-world as well as simulation experiments. Video results at https://ashish-kmr.github.io/rma-legged-robots/},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
file = {/Users/fracapuano/Zotero/storage/TMYICHS6/Kumar et al. - 2021 - RMA Rapid Motor Adaptation for Legged Robots.pdf;/Users/fracapuano/Zotero/storage/TFY2EU8I/2107.html}
}
@misc{laiActionChunkingConditional2025,
title = {Action Chunking as Conditional Policy Compression},
author = {Lai, Lucy and Huang, Ann and Gershman, Samuel},
year = {2025},
month = jun,
publisher = {OSF},
doi = {10.31234/osf.io/z8yrv_v2},
urldate = {2025-09-02},
abstract = {Many skills in our everyday lives are learned by sequencing actions towards a desired goal. The action sequence can become a ``chunk'' when individual actions are grouped together and executed as one unit, making them more efficient to store and execute. While chunking has been studied extensively across various domains, a puzzle remains as to why and under what conditions action chunking occurs. To tackle these questions, we develop a model of conditional policy compression---the reduction in cognitive cost by conditioning on an additional source of information---to explain the origin of chunking. We argue that chunking is a result of optimizing the trade-off between reward and conditional policy complexity. Chunking compresses policies when there is temporal structure in the environment that can be leveraged for action selection, reducing the amount of memory necessary to encode the policy. We experimentally confirm our model's predictions, showing that chunking reduces conditional policy complexity and reaction times. Chunking also increases with working memory load, consistent with the hypothesis that the degree of policy compression scales with the scarcity of cognitive resources. Finally, chunking also reduces overall working memory load, freeing cognitive resources for the benefit of other, not-chunked information.},
archiveprefix = {OSF},
langid = {american},
keywords = {action selection,chunking,habits,reinforcement learning,resource-rationality,working memory}
}
@article{laiActionChunkingConditional2025a,
title = {Action Chunking as Conditional Policy Compression},
author = {Lai, Lucy and Huang, Ann Z. X. and Gershman, Samuel J.},
year = {2025},
month = nov,
journal = {Cognition},
volume = {264},
pages = {106201},
issn = {1873-7838},
doi = {10.1016/j.cognition.2025.106201},
abstract = {Many skills in our everyday lives are learned by sequencing actions towards a desired goal. The action sequence can become a "chunk" when individual actions are grouped together and executed as one unit, making them more efficient to store and execute. While chunking has been studied extensively across various domains, a puzzle remains as to why and under what conditions action chunking occurs. To tackle these questions, we develop a model of conditional policy compression-the reduction in cognitive cost by conditioning on an additional source of information-to explain the origin of chunking. We argue that chunking is a result of optimizing the trade-off between reward and conditional policy complexity. Chunking compresses policies when there is temporal structure in the environment that can be leveraged for action selection, reducing the amount of memory necessary to encode the policy. We experimentally confirm our model's predictions, showing that chunking reduces conditional policy complexity and reaction times. Chunking also increases with working memory load, consistent with the hypothesis that the degree of policy compression scales with the scarcity of cognitive resources. Finally, chunking also reduces overall working memory load, freeing cognitive resources for the benefit of other, not-chunked information.},
langid = {english},
pmid = {40602234},
keywords = {Action selection,Adult,Chunking,Cognition,Decision making,Female,Humans,Information bottleneck,Male,Memory Short-Term,Models Psychological,Psychomotor Performance,Reaction Time,Reinforcement learning,Resource rationality,Reward,Young Adult}
}
@misc{LAION-COCO,
title = {{LAION-COCO}: 600{M} Synthetic Captions from {LAION2B-EN}},
author = {Schuhmann, Christoph and K{\"o}pf, Andreas and Vencu, Richard and Coombes, Theo and Beaumont, Romain},
year = {2022},
url = {https://laion.ai/blog/laion-coco}
}
@misc{laurenconWhatMattersWhen2024,
title = {What Matters When Building Vision-Language Models?},
author = {Lauren{\c c}on, Hugo and Tronchon, L{\'e}o and Cord, Matthieu and Sanh, Victor},
year = {2024},
month = may,
number = {arXiv:2405.02246},
eprint = {2405.02246},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2405.02246},
urldate = {2025-09-09},
abstract = {The growing interest in vision-language models (VLMs) has been driven by improvements in large language models and vision transformers. Despite the abundance of literature on this subject, we observe that critical decisions regarding the design of VLMs are often not justified. We argue that these unsupported decisions impede progress in the field by making it difficult to identify which choices improve model performance. To address this issue, we conduct extensive experiments around pre-trained models, architecture choice, data, and training methods. Our consolidation of findings includes the development of Idefics2, an efficient foundational VLM of 8 billion parameters. Idefics2 achieves state-of-the-art performance within its size category across various multimodal benchmarks, and is often on par with models four times its size. We release the model (base, instructed, and chat) along with the datasets created for its training.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition},
file = {/Users/fracapuano/Zotero/storage/8H6NRPU7/Laurençon et al. - 2024 - What matters when building vision-language models.pdf;/Users/fracapuano/Zotero/storage/H3NETYXA/2405.html}
}
@misc{leeBehaviorGenerationLatent2024,