% GitHub page chrome and rendered line-number gutter removed (web-scrape artifact; not part of the BibTeX database).
# Related Works
# RL
@article{huh2023multi,
title = {Multi-agent reinforcement learning: A comprehensive survey},
author = {Huh, Dom and Mohapatra, Prasant},
journal = {arXiv preprint arXiv:2312.10256},
year = {2023}
}
@article{ghasemi2024comprehensive,
title = {A comprehensive survey of reinforcement learning: From algorithms to practical challenges},
author = {Ghasemi, Majid and Moosavi, Amir Hossein and Ebrahimi, Dariush},
journal = {arXiv preprint arXiv:2411.18892},
year = {2024}
}
@article{ai2025inquiremobile,
title={InquireMobile: Teaching VLM-based Mobile Agent to Request Human Assistance via Reinforcement Fine-Tuning},
author={Ai, Qihang and Bu, Pi and Cao, Yue and Wang, Yingyao and Gu, Jihao and Xing, Jingxuan and Zhu, Zekun and Jiang, Wei and Zheng, Zhicheng and Song, Jun and others},
journal={arXiv preprint arXiv:2508.19679},
year={2025}
}
@article{xia2025visionary,
title = {{Visionary-R1}: Mitigating shortcuts in visual reasoning with reinforcement learning},
author = {Xia, Jiaer and Zang, Yuhang and Gao, Peng and Li, Yixuan and Zhou, Kaiyang},
journal = {arXiv preprint arXiv:2505.14677},
year = {2025}
}
@article{wang2025vision,
title={Vision-Zero: Scalable VLM Self-Improvement via Strategic Gamified Self-Play},
author={Wang, Qinsi and Liu, Bo and Zhou, Tianyi and Shi, Jing and Lin, Yueqian and Chen, Yiran and Li, Hai Helen and Wan, Kun and Zhao, Wentian},
journal={arXiv preprint arXiv:2509.25541},
year={2025}
}
@article{long2025adsqa,
title={AdsQA: Towards Advertisement Video Understanding},
author={Long, Xinwei and Tian, Kai and Xu, Peng and Jia, Guoli and Li, Jingxuan and Yang, Sa and Shao, Yihua and Zhang, Kaiyan and Jiang, Che and Xu, Hao and others},
journal={arXiv preprint arXiv:2509.08621},
year={2025}
}
@article{kan2025taco,
title = {TACO: Think-Answer Consistency for Optimized Long-Chain Reasoning and Efficient Data Learning via Reinforcement Learning in LVLMs},
author = {Kan, Zhehan and Liu, Yanlin and Yin, Kun and Jiang, Xinghua and Li, Xin and Cao, Haoyu and Liu, Yinsong and Jiang, Deqiang and Sun, Xing and Liao, Qingmin and others},
journal = {arXiv preprint arXiv:2505.20777},
year = {2025}
}
@article{fan2025grit,
title = {GRIT: Teaching MLLMs to Think with Images},
author = {Fan, Yue and He, Xuehai and Yang, Diji and Zheng, Kaizhi and Kuo, Ching-Chen and Zheng, Yuting and Narayanaraju, Sravana Jyothi and Guan, Xinze and Wang, Xin Eric},
journal = {arXiv preprint arXiv:2505.15879},
year = {2025}
}
@article{huang2025vision,
title={{Vision-R1}: Incentivizing reasoning capability in multimodal large language models},
author={Huang, Wenxuan and Jia, Bohan and Zhai, Zijie and Cao, Shaosheng and Ye, Zheyu and Zhao, Fei and Xu, Zhe and Hu, Yao and Lin, Shaohui},
journal={arXiv preprint arXiv:2503.06749},
year={2025}
}
@article{shen2025vlm,
title = {{VLM-R1}: A stable and generalizable {R1}-style large vision-language model},
author = {Shen, Haozhan and Liu, Peng and Li, Jingcheng and Fang, Chunxin and Ma, Yibo and Liao, Jiajia and Shen, Qiaoli and Zhang, Zilun and Zhao, Kangjia and Zhang, Qianqian and others},
journal = {arXiv preprint arXiv:2504.07615},
year = {2025}
}
@article{cao2025ground,
title = {Ground-R1: Incentivizing Grounded Visual Reasoning via Reinforcement Learning},
author = {Cao, Meng and Zhao, Haoze and Zhang, Can and Chang, Xiaojun and Reid, Ian and Liang, Xiaodan},
journal = {arXiv preprint arXiv:2505.20272},
year = {2025}
}
@article{chu2025qwen,
title = {Qwen Look Again: Guiding Vision-Language Reasoning Models to Re-attention Visual Information},
author = {Chu, Xu and Chen, Xinrong and Wang, Guanyu and Tan, Zhijie and Huang, Kui and Lv, Wenyu and Mo, Tong and Li, Weiping},
journal = {arXiv preprint arXiv:2505.23558},
year = {2025}
}
@article{xu2025visual,
title = {Visual Planning: Let's Think Only with Images},
author = {Xu, Yi and Li, Chengzu and Zhou, Han and Wan, Xingchen and Zhang, Caiqi and Korhonen, Anna and Vuli{\'c}, Ivan},
journal = {arXiv preprint arXiv:2505.11409},
year = {2025}
}
@article{zhang2024survey,
title = {A survey on self-play methods in reinforcement learning},
author = {Zhang, Ruize and Xu, Zelai and Ma, Chengdong and Yu, Chao and Tu, Wei-Wei and Tang, Wenhao and Huang, Shiyu and Ye, Deheng and Ding, Wenbo and Yang, Yaodong and others},
journal = {arXiv preprint arXiv:2408.01072},
year = {2024}
}
@article{wu2025reinforcement,
title = {Reinforcement Learning in Vision: A Survey},
author = {Wu, Weijia and Gao, Chen and Chen, Joya and Lin, Kevin Qinghong and Meng, Qingwei and Zhang, Yiming and Qiu, Yuke and Zhou, Hong and Shou, Mike Zheng},
journal = {arXiv preprint arXiv:2508.08189},
year = {2025}
}
@article{tan2025chartmaster,
title = {ChartMaster: Advancing Chart-to-Code Generation with Real-World Charts and Chart Similarity Reinforcement Learning},
author = {Tan, Wentao and Cao, Qiong and Xue, Chao and Zhan, Yibing and Ding, Changxing and He, Xiaodong},
journal = {arXiv preprint arXiv:2508.17608},
year = {2025}
}
# LLM/LRM
@article{zhao2023survey,
title = {A survey of large language models},
author = {Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
journal = {arXiv preprint arXiv:2303.18223},
volume = {1},
number = {2},
year = {2023}
}
@article{zhang2025100,
title = {100 days after deepseek-r1: A survey on replication studies and more directions for reasoning language models},
author = {Zhang, Chong and Deng, Yue and Lin, Xiang and Wang, Bin and Ng, Dianwen and Ye, Hai and Li, Xingxuan and Xiao, Yao and Mo, Zhanfeng and Zhang, Qi and others},
journal = {arXiv preprint arXiv:2505.00551},
year = {2025}
}
@article{li2025system,
title = {From system 1 to system 2: A survey of reasoning large language models},
author = {Li, Zhong-Zhi and Zhang, Duzhen and Zhang, Ming-Liang and Zhang, Jiaxin and Liu, Zengyan and Yao, Yuxuan and Xu, Haotian and Zheng, Junhao and Wang, Pei-Jie and Chen, Xiuyi and others},
journal = {arXiv preprint arXiv:2502.17419},
year = {2025}
}
@article{xu2025towards,
title = {Towards large reasoning models: A survey of reinforced reasoning with large language models},
author = {Xu, Fengli and Hao, Qianyue and Zong, Zefang and Wang, Jingwei and Zhang, Yunke and Wang, Jingyi and Lan, Xiaochong and Gong, Jiahui and Ouyang, Tianjian and Meng, Fanjin and others},
journal = {arXiv preprint arXiv:2501.09686},
year = {2025}
}
@article{srivastava2025technical,
title = {A Technical Survey of Reinforcement Learning Techniques for Large Language Models},
author = {Srivastava, Saksham Sahai and Aggarwal, Vaneet},
journal = {arXiv preprint arXiv:2507.04136},
year = {2025}
}
@article{wu2025sailing,
title = {Sailing by the Stars: A Survey on Reward Models and Learning Strategies for Learning from Rewards},
author = {Wu, Xiaobao},
journal = {arXiv preprint arXiv:2505.02686},
year = {2025}
}
@article{sun2025survey,
title = {A survey of reasoning with foundation models: Concepts, methodologies, and outlook},
author = {Sun, Jiankai and Zheng, Chuanyang and Xie, Enze and Liu, Zhengying and Chu, Ruihang and Qiu, Jianing and Xu, Jiaqi and Ding, Mingyu and Li, Hongyang and Geng, Mengzhe and others},
journal = {ACM Computing Surveys},
volume = {57},
number = {11},
pages = {1--43},
year = {2025},
publisher = {ACM},
address = {New York, NY, USA}
}
@article{shen2025skywork,
title={Skywork-r1v3 technical report},
author={Shen, Wei and Pei, Jiangbo and Peng, Yi and Song, Xuchen and Liu, Yang and Peng, Jian and Sun, Haofeng and Hao, Yunzhuo and Wang, Peiyu and Zhang, Jianhao and others},
journal={arXiv preprint arXiv:2507.06167},
year={2025}
}
@article{xiaomi2025mimo,
title={MiMo: Unlocking the Reasoning Potential of Language Model--From Pretraining to Posttraining},
author={{Xiaomi LLM-Core Team} and Xia, Bingquan and Shen, Bowen and Zhu, Dawei and Zhang, Di and Wang, Gang and Zhang, Hailin and Liu, Huaqiu and Xiao, Jiebao and Dong, Jinhao and others},
journal={arXiv preprint arXiv:2505.07608},
year={2025}
}
@article{team2025intellect,
title={INTELLECT-2: A Reasoning Model Trained Through Globally Decentralized Reinforcement Learning},
author={{Prime Intellect Team} and Jaghouar, Sami and Mattern, Justus and Ong, Jack Min and Straube, Jannik and Basra, Manveer and Pazdera, Aaron and Thaman, Kushal and Di Ferrante, Matthew and Gabriel, Felix and others},
journal={arXiv preprint arXiv:2505.07291},
year={2025}
}
@article{team2025hunyuan,
title={Hunyuan-turbos: Advancing large language models through mamba-transformer synergy and adaptive chain-of-thought},
author={{Tencent Hunyuan Team} and Liu, Ao and Zhou, Botong and Xu, Can and Zhou, Chayse and Zhang, ChenChen and Xu, Chengcheng and Wang, Chenhao and Wu, Decheng and Wu, Dengpeng and others},
journal={arXiv preprint arXiv:2505.15431},
year={2025}
}
@misc{deepseekai2024deepseekv32,
title={DeepSeek-V3.2-Exp: Boosting Long-Context Efficiency with DeepSeek Sparse Attention},
author={DeepSeek-AI},
year={2025},
url={https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/DeepSeek_V3_2.pdf}
}
@misc{GLM4.6,
title={GLM-4.6: Advanced Agentic, Reasoning and Coding Capabilities},
author={Zhipu-AI},
year={2025},
url={https://z.ai/blog/glm-4.6}
}
@misc{Ring1T,
title={Ring-1T-preview, Deep Thinking, No Waiting},
author={inclusionAI},
year={2025},
url={https://huggingface.co/inclusionAI/Ring-1T-preview}
}
@misc{Qwen3VL,
title={Qwen3-VL: Sharper Vision, Deeper Thought, Broader Action},
author={Alibaba-Qwen},
year={2025},
url={https://qwen.ai/blog?id=99f0335c4ad9ff6153e517418d48535ab6d8afef&from=research.latest-advancements-list}
}
@misc{Qwen3next,
title={Qwen3-Next: Towards Ultimate Training & Inference Efficiency},
author={Alibaba-Qwen},
year={2025},
url={https://qwen.ai/blog?id=4074cca80393150c248e508aa62983f9cb7d27cd&from=research.latest-advancements-list}
}
@misc{Ringmini2.0,
title={Ring-mini-2.0},
author={inclusionAI},
year={2025},
url={https://huggingface.co/inclusionAI/Ring-mini-2.0}
}
# RL Methodology
## Reward Design - Verifiable Rewards
@article{sun2025freeprm,
title = {FreePRM: Training Process Reward Models Without Ground Truth Process Labels},
author = {Sun, Lin and Liu, Chuang and Ma, Xiaofeng and Yang, Tao and Lu, Weijia and Wu, Ning},
journal = {arXiv preprint arXiv:2506.03570},
year = {2025}
}
@article{setlur2024rewarding,
title = {Rewarding progress: Scaling automated process verifiers for llm reasoning},
author = {Setlur, Amrith and Nagpal, Chirag and Fisch, Adam and Geng, Xinyang and Eisenstein, Jacob and Agarwal, Rishabh and Agarwal, Alekh and Berant, Jonathan and Kumar, Aviral},
journal = {arXiv preprint arXiv:2410.08146},
year = {2024}
}
@article{lin2025r1,
title = {OS-R1: Agentic Operating System Kernel Tuning with Reinforcement Learning},
author = {Lin, Hongyu and Li, Yuchen and Luo, Haoran and Yao, Kaichun and Zhang, Libo and Xing, Mingjie and Wu, Yanjun},
journal = {arXiv preprint arXiv:2508.12551},
year = {2025}
}
@article{ye2025mobile,
title = {Mobile-Agent-v3: Foundamental Agents for GUI Automation},
author = {Ye, Jiabo and Zhang, Xi and Xu, Haiyang and Liu, Haowei and Wang, Junyang and Zhu, Zhaoqing and Zheng, Ziwei and Gao, Feiyu and Cao, Junjie and Lu, Zhengxi and others},
journal = {arXiv preprint arXiv:2508.15144},
year = {2025}
}
@article{lu2025swirl,
title = {SWIRL: A Staged Workflow for Interleaved Reinforcement Learning in Mobile GUI Control},
author = {Lu, Quanfeng and Ma, Zhantao and Zhong, Shuai and Wang, Jin and Yu, Dahai and Ng, Michael K and Luo, Ping},
journal = {arXiv preprint arXiv:2508.20018},
year = {2025}
}
@article{zhou2025reinforcing,
title = {Reinforcing General Reasoning without Verifiers},
author = {Zhou, Xiangxin and Liu, Zichen and Sims, Anya and Wang, Haonan and Pang, Tianyu and Li, Chongxuan and Wang, Liang and Lin, Min and Du, Chao},
journal = {arXiv preprint arXiv:2505.21493},
year = {2025}
}
@article{yu2025rlpr,
title = {RLPR: Extrapolating RLVR to General Domains without Verifiers},
author = {Yu, Tianyu and Ji, Bo and Wang, Shouli and Yao, Shu and Wang, Zefan and Cui, Ganqu and Yuan, Lifan and Ding, Ning and Yao, Yuan and Liu, Zhiyuan and others},
journal = {arXiv preprint arXiv:2506.18254},
year = {2025}
}
@article{li2025verifybench,
title = {VerifyBench: A Systematic Benchmark for Evaluating Reasoning Verifiers Across Domains},
author = {Li, Xuzhao and Li, Xuchen and Hu, Shiyu and Guo, Yongzhen and Zhang, Wentao},
journal = {arXiv preprint arXiv:2507.09884},
year = {2025}
}
@article{zhao2025one,
title = {One Token to Fool LLM-as-a-Judge},
author = {Zhao, Yulai and Liu, Haolin and Yu, Dian and Kung, S. Y. and Mi, Haitao and Yu, Dong},
journal = {arXiv preprint arXiv:2507.08794},
year = {2025}
}
@article{huang2025pitfalls,
title = {Pitfalls of Rule- and Model-based Verifiers--A Case Study on Mathematical Reasoning},
author = {Huang, Yuzhen and Zeng, Weihao and Zeng, Xingshan and Zhu, Qi and He, Junxian},
journal = {arXiv preprint arXiv:2505.22203},
year = {2025}
}
@misc{kimiteam2025kimik2openagentic,
title = {Kimi K2: Open Agentic Intelligence},
author = {{Kimi Team}},
year = {2025},
eprint = {2507.20534},
archiveprefix = {arXiv},
primaryclass = {cs.LG},
url = {https://arxiv.org/abs/2507.20534}
}
## Reward Design - Unsupervised rewards
@article{wang2025thinking,
title={Thinking Augmented Pre-training},
author={Wang, Liang and Yang, Nan and Huang, Shaohan and Dong, Li and Wei, Furu},
journal={arXiv preprint arXiv:2509.20186},
year={2025}
}
@article{liu2025ettrl,
title = {ETTRL: Balancing Exploration and Exploitation in LLM Test-Time Reinforcement Learning Via Entropy Mechanism},
author = {Liu, Jia and He, ChangYi and Lin, YingQiao and Yang, MingMin and Shen, FeiYang and Liu, ShaoGuo and Gao, TingTing},
journal = {arXiv preprint arXiv:2508.11356},
year = {2025}
}
@article{chen2025selfques,
title = {Self-Questioning Language Models},
author = {Chen, Lili and Prabhudesai, Mihir and Fragkiadaki, Katerina and Liu, Hao and Pathak, Deepak},
journal = {arXiv preprint arXiv:2508.03682},
year = {2025}
}
@article{van2025post,
title = {Post-Training Large Language Models via Reinforcement Learning from Self-Feedback},
author = {van Niekerk, Carel and Vukovic, Renato and Ruppik, Benjamin Matthias and Lin, Hsien-chin and Ga{\v{s}}i{\'c}, Milica},
journal = {arXiv preprint arXiv:2507.21931},
year = {2025}
}
@article{kiruluta2025self,
title = {A Self-Supervised Reinforcement Learning Approach for Fine-Tuning Large Language Models Using Cross-Attention Signals},
author = {Kiruluta, Andrew and Lemos, Andreas and Burity, Priscilla},
journal = {arXiv preprint arXiv:2502.10482},
year = {2025}
}
@article{zhang2025co,
title = {Co-Reward: Self-supervised Reinforcement Learning for Large Language Model Reasoning via Contrastive Agreement},
author = {Zhang, Zizhuo and Zhu, Jianing and Ge, Xinmu and Zhao, Zihua and Zhou, Zhanke and Li, Xuan and Feng, Xiao and Yao, Jiangchao and Han, Bo},
journal = {arXiv preprint arXiv:2508.00410},
year = {2025}
}
@article{burns2023weak,
title = {Weak-to-strong generalization: Eliciting strong capabilities with weak supervision},
author = {Burns, Collin and Izmailov, Pavel and Kirchner, Jan Hendrik and Baker, Bowen and Gao, Leo and Aschenbrenner, Leopold and Chen, Yining and Ecoffet, Adrien and Joglekar, Manas and Leike, Jan and others},
journal = {arXiv preprint arXiv:2312.09390},
year = {2023}
}
@misc{lai2025computerrlscalingendtoendonline,
title = {ComputerRL: Scaling End-to-End Online Reinforcement Learning for Computer Use Agents},
author = {Hanyu Lai and Xiao Liu and Yanxiao Zhao and Han Xu and Hanchen Zhang and Bohao Jing and Yanyu Ren and Shuntian Yao and Yuxiao Dong and Jie Tang},
year = {2025},
eprint = {2508.14040},
archiveprefix = {arXiv},
primaryclass = {cs.AI},
url = {https://arxiv.org/abs/2508.14040}
}
@article{li2024numinamath,
title = {Numinamath: The largest public dataset in ai4maths with 860k pairs of competition math problems and solutions},
author = {Li, Jia and Beeching, Edward and Tunstall, Lewis and Lipkin, Ben and Soletskyi, Roman and Huang, Shengyi and Rasul, Kashif and Yu, Longhui and Jiang, Albert Q and Shen, Ziju and others},
journal = {Hugging Face repository},
volume = {13},
pages = {9},
year = {2024}
}
@article{glazer2024frontiermath,
title = {Frontiermath: A benchmark for evaluating advanced mathematical reasoning in ai},
author = {Glazer, Elliot and Erdil, Ege and Besiroglu, Tamay and Chicharro, Diego and Chen, Evan and Gunning, Alex and Olsson, Caroline Falkman and Denain, Jean-Stanislas and Ho, Anson and Santos, Emily de Oliveira and others},
journal = {arXiv preprint arXiv:2411.04872},
year = {2024}
}
@misc{liu2025there,
title = {There may not be aha moment in r1-zero-like training—a pilot study},
author = {Liu, Zichen and Chen, Changyu and Li, Wenjun and Pang, Tianyu and Du, Chao and Lin, Min},
year = {2025}
}
@article{song2024mind,
title = {Mind the gap: Examining the self-improvement capabilities of large language models},
author = {Song, Yuda and Zhang, Hanlin and Eisenach, Carson and Kakade, Sham and Foster, Dean and Ghai, Udaya},
journal = {arXiv preprint arXiv:2412.02674},
year = {2024}
}
@article{gao2025uishift,
title = {UIShift: Enhancing VLM-based GUI Agents through Self-supervised Reinforcement Learning},
author = {Gao, Longxi and Zhang, Li and Xu, Mengwei},
journal = {arXiv preprint arXiv:2505.12493},
year = {2025}
}
@article{du2025test,
title = {Test-Time Reinforcement Learning for GUI Grounding via Region Consistency},
author = {Du, Yong and Yan, Yuchen and Tang, Fei and Lu, Zhengxi and Zong, Chang and Lu, Weiming and Jiang, Shengpei and Shen, Yongliang},
journal = {arXiv preprint arXiv:2508.05615},
year = {2025}
}
@article{shi2025mobilegui,
title = {MobileGUI-RL: Advancing Mobile GUI Agent through Reinforcement Learning in Online Environment},
author = {Shi, Yucheng and Yu, Wenhao and Li, Zaitang and Wang, Yonglin and Zhang, Hongming and Liu, Ninghao and Mi, Haitao and Yu, Dong},
journal = {arXiv preprint arXiv:2507.05720},
year = {2025}
}
@article{wu2025gui,
title = {GUI-Reflection: Empowering Multimodal GUI Models with Self-Reflection Behavior},
author = {Wu, Penghao and Ma, Shengnan and Wang, Bo and Yu, Jiaheng and Lu, Lewei and Liu, Ziwei},
journal = {arXiv preprint arXiv:2506.08012},
year = {2025}
}
@article{wanyan2025look,
title = {Look Before You Leap: A GUI-Critic-R1 Model for Pre-Operative Error Diagnosis in GUI Automation},
author = {Wanyan, Yuyang and Zhang, Xi and Xu, Haiyang and Liu, Haowei and Wang, Junyang and Ye, Jiabo and Kou, Yutong and Yan, Ming and Huang, Fei and Yang, Xiaoshan and others},
journal = {arXiv preprint arXiv:2506.04614},
year = {2025}
}
@article{poesia2024learning,
title = {Learning formal mathematics from intrinsic motivation},
author = {Poesia, Gabriel and Broman, David and Haber, Nick and Goodman, Noah},
journal = {Advances in Neural Information Processing Systems},
volume = {37},
pages = {43032--43057},
year = {2024}
}
@article{yeo2025demystifying,
title = {Demystifying long chain-of-thought reasoning in llms},
author = {Yeo, Edward and Tong, Yuxuan and Niu, Morry and Neubig, Graham and Yue, Xiang},
journal = {arXiv preprint arXiv:2502.03373},
year = {2025}
}
@misc{openr1,
title = {Open R1: A fully open reproduction of DeepSeek-R1},
url = {https://github.com/huggingface/open-r1},
author = {{Hugging Face}},
month = jan,
year = {2025}
}
@article{huang2025r,
title = {R-Zero: Self-Evolving Reasoning LLM from Zero Data},
author = {Huang, Chengsong and Yu, Wenhao and Wang, Xiaoyang and Zhang, Hongming and Li, Zongxia and Li, Ruosen and Huang, Jiaxin and Mi, Haitao and Yu, Dong},
journal = {arXiv preprint arXiv:2508.05004},
year = {2025}
}
@article{pang2023language,
title = {Language model self-improvement by reinforcement learning contemplation},
author = {Pang, Jing-Cheng and Wang, Pengyuan and Li, Kaiyuan and Chen, Xiong-Hui and Xu, Jiacheng and Zhang, Zongzhang and Yu, Yang},
journal = {arXiv preprint arXiv:2305.14483},
year = {2023}
}
@article{yang2025ssr,
title = {SSR-Zero: Simple Self-Rewarding Reinforcement Learning for Machine Translation},
author = {Yang, Wenjie and Zheng, Mao and Song, Mingyang and Li, Zheng and Wang, Sitong},
journal = {arXiv preprint arXiv:2505.16637},
year = {2025}
}
@article{chen2024self,
title = {Self-play fine-tuning converts weak language models to strong language models},
author = {Chen, Zixiang and Deng, Yihe and Yuan, Huizhuo and Ji, Kaixuan and Gu, Quanquan},
journal = {arXiv preprint arXiv:2401.01335},
year = {2024}
}
@article{zhou2024calibrated,
title = {Calibrated self-rewarding vision language models},
author = {Zhou, Yiyang and Fan, Zhiyuan and Cheng, Dongjie and Yang, Sihan and Chen, Zhaorun and Cui, Chenhang and Wang, Xiyao and Li, Yun and Zhang, Linjun and Yao, Huaxiu},
journal = {Advances in Neural Information Processing Systems},
volume = {37},
pages = {51503--51531},
year = {2024}
}
@article{zou2025trans,
title = {Trans-Zero: Self-Play Incentivizes Large Language Models for Multilingual Translation Without Parallel Data},
author = {Zou, Wei and Yang, Sen and Bao, Yu and Huang, Shujian and Chen, Jiajun and Cheng, Shanbo},
journal = {arXiv preprint arXiv:2504.14669},
year = {2025}
}
@article{xin2025surrogate,
title = {Surrogate Signals from Format and Length: Reinforcement Learning for Solving Mathematical Problems without Ground Truth Answers},
author = {Xin, Rihui and Liu, Han and Wang, Zecheng and Zhang, Yupeng and Sui, Dianbo and Hu, Xiaolin and Wang, Bingning},
journal = {arXiv preprint arXiv:2505.19439},
year = {2025}
}
@article{gandhi2025cognitive,
title = {Cognitive behaviors that enable self-improving reasoners, or, four habits of highly effective stars},
author = {Gandhi, Kanishk and Chakravarthy, Ayush and Singh, Anikait and Lile, Nathan and Goodman, Noah D},
journal = {arXiv preprint arXiv:2503.01307},
year = {2025}
}
@article{zweiger2025self,
title = {Self-Adapting Language Models},
author = {Zweiger, Adam and Pari, Jyothish and Guo, Han and Aky{\"u}rek, Ekin and Kim, Yoon and Agrawal, Pulkit},
journal = {arXiv preprint arXiv:2506.10943},
year = {2025}
}
@article{zuo2025ttrl,
title = {{TTRL}: Test-time reinforcement learning},
author = {Zuo, Yuxin and Zhang, Kaiyan and Sheng, Li and Qu, Shang and Cui, Ganqu and Zhu, Xuekai and Li, Haozhan and Zhang, Yuchen and Long, Xinwei and Hua, Ermo and others},
journal = {arXiv preprint arXiv:2504.16084},
year = {2025}
}
@article{wang2025reinforcement,
title = {Reinforcement learning for reasoning in large language models with one training example},
author = {Wang, Yiping and Yang, Qing and Zeng, Zhiyuan and Ren, Liliang and Liu, Liyuan and Peng, Baolin and Cheng, Hao and He, Xuehai and Wang, Kuan and Gao, Jianfeng and others},
journal = {arXiv preprint arXiv:2504.20571},
year = {2025}
}
@article{zhang2025right,
title = {Right question is already half the answer: Fully unsupervised llm reasoning incentivization},
author = {Zhang, Qingyang and Wu, Haitao and Zhang, Changqing and Zhao, Peilin and Bian, Yatao},
journal = {arXiv preprint arXiv:2504.05812},
year = {2025}
}
@article{zhao2025learning,
title = {Learning to reason without external rewards},
author = {Zhao, Xuandong and Kang, Zhewei and Feng, Aosong and Levine, Sergey and Song, Dawn},
journal = {arXiv preprint arXiv:2505.19590},
year = {2025}
}
@article{cui2025entropy,
title = {The entropy mechanism of reinforcement learning for reasoning language models},
author = {Cui, Ganqu and Zhang, Yuchen and Chen, Jiacheng and Yuan, Lifan and Wang, Zhi and Zuo, Yuxin and Li, Haozhan and Fan, Yuchen and Chen, Huayu and Chen, Weize and others},
journal = {arXiv preprint arXiv:2505.22617},
year = {2025}
}
@article{fang2025serl,
title = {SeRL: Self-Play Reinforcement Learning for Large Language Models with Limited Data},
author = {Fang, Wenkai and Liu, Shunyu and Zhou, Yang and Zhang, Kongcheng and Zheng, Tongya and Chen, Kaixuan and Song, Mingli and Tao, Dacheng},
journal = {arXiv preprint arXiv:2505.20347},
year = {2025}
}
@article{zhang2025consistent,
title = {Consistent Paths Lead to Truth: Self-Rewarding Reinforcement Learning for LLM Reasoning},
author = {Zhang, Kongcheng and Yao, Qi and Liu, Shunyu and Wang, Yingjie and Lai, Baisheng and Ye, Jieping and Song, Mingli and Tao, Dacheng},
journal = {arXiv preprint arXiv:2506.08745},
year = {2025}
}
@article{prabhudesai2025maximizing,
title = {Maximizing Confidence Alone Improves Reasoning},
author = {Prabhudesai, Mihir and Chen, Lili and Ippoliti, Alex and Fragkiadaki, Katerina and Liu, Hao and Pathak, Deepak},
journal = {arXiv preprint arXiv:2505.22660},
year = {2025}
}
@inproceedings{lee2013pseudo,
title = {Pseudo-label: The simple and efficient semi-supervised learning method for deep neural networks},
author = {Lee, Dong-Hyun and others},
booktitle = {Workshop on challenges in representation learning, ICML},
volume = {3},
pages = {896},
year = {2013},
address = {Atlanta, Georgia, USA}
}
@article{shumailov2023curse,
title = {The curse of recursion: Training on generated data makes models forget},
author = {Shumailov, Ilia and Shumaylov, Zakhar and Zhao, Yiren and Gal, Yarin and Papernot, Nicolas and Anderson, Ross},
journal = {arXiv preprint arXiv:2305.17493},
year = {2023}
}
@article{yuan2024self,
title = {Self-rewarding language models},
author = {Yuan, Weizhe and Pang, Richard Yuanzhe and Cho, Kyunghyun and Sukhbaatar, Sainbayar and Xu, Jing and Weston, Jason},
journal = {arXiv preprint arXiv:2401.10020},
volume = {3},
year = {2024}
}
@article{wu2024meta,
title = {Meta-rewarding language models: Self-improving alignment with llm-as-a-meta-judge},
author = {Wu, Tianhao and Yuan, Weizhe and Golovneva, Olga and Xu, Jing and Tian, Yuandong and Jiao, Jiantao and Weston, Jason and Sukhbaatar, Sainbayar},
journal = {arXiv preprint arXiv:2407.19594},
year = {2024}
}
@article{zhang2025process,
title = {Process-based self-rewarding language models},
author = {Zhang, Shimao and Liu, Xiao and Zhang, Xin and Liu, Junxiao and Luo, Zheheng and Huang, Shujian and Gong, Yeyun},
journal = {arXiv preprint arXiv:2503.03746},
year = {2025}
}
@article{shao2025spurious,
title = {Spurious Rewards: Rethinking Training Signals in RLVR},
author = {Shao, Rulin and Li, Shuyue Stella and Xin, Rui and Geng, Scott and Wang, Yiping and Oh, Sewoong and Du, Simon Shaolei and Lambert, Nathan and Min, Sewon and Krishna, Ranjay and others},
journal = {arXiv preprint arXiv:2506.10947},
year = {2025}
}
@article{wei2025unsupervised,
title = {Unsupervised Post-Training for Multi-Modal LLM Reasoning via GRPO},
author = {Wei, Lai and Li, Yuting and Wang, Chen and Wang, Yue and Kong, Linghe and Huang, Weiran and Sun, Lichao},
journal = {arXiv preprint arXiv:2505.22453},
year = {2025}
}
@article{li2025confidence,
title = {Confidence Is All You Need: Few-Shot RL Fine-Tuning of Language Models},
author = {Li, Pengyi and Skripkin, Matvey and Zubrey, Alexander and Kuznetsov, Andrey and Oseledets, Ivan},
journal = {arXiv preprint arXiv:2506.06395},
year = {2025}
}
@article{shafayat2025can,
title = {Can Large Reasoning Models Self-Train?},
author = {Shafayat, Sheikh and Tajwar, Fahim and Salakhutdinov, Ruslan and Schneider, Jeff and Zanette, Andrea},
journal = {arXiv preprint arXiv:2505.21444},
year = {2025}
}
@article{dong2025reinforcement,
title = {Reinforcement Pre-Training},
author = {Dong, Qingxiu and Dong, Li and Tang, Yao and Ye, Tianzhu and Sun, Yutao and Sui, Zhifang and Wei, Furu},
journal = {arXiv preprint arXiv:2506.08007},
year = {2025}
}
@article{zhang2025no,
title = {No Free Lunch: Rethinking Internal Feedback for LLM Reasoning},
author = {Zhang, Yanzhi and Zhang, Zhaoxi and Guan, Haoxiang and Cheng, Yilin and Duan, Yitong and Wang, Chen and Wang, Yue and Zheng, Shuxin and He, Jiyan},
journal = {arXiv preprint arXiv:2506.17219},
year = {2025}
}
@article{xu2025genius,
title = {Genius: A generalizable and purely unsupervised self-training framework for advanced reasoning},
author = {Xu, Fangzhi and Yan, Hang and Ma, Chang and Zhao, Haiteng and Sun, Qiushi and Cheng, Kanzhi and He, Junxian and Liu, Jun and Wu, Zhiyong},
journal = {arXiv preprint arXiv:2504.08672},
year = {2025}
}
@article{agarwal2025unreasonable,
title = {The Unreasonable Effectiveness of Entropy Minimization in {LLM} Reasoning},
author = {Agarwal, Shivam and Zhang, Zimin and Yuan, Lifan and Han, Jiawei and Peng, Hao},
journal = {arXiv preprint arXiv:2505.15134},
year = {2025}
}
@article{eysenbach2018diversity,
title = {Diversity is all you need: Learning skills without a reward function},
author = {Eysenbach, Benjamin and Gupta, Abhishek and Ibarz, Julian and Levine, Sergey},
journal = {arXiv preprint arXiv:1802.06070},
year = {2018}
}
@article{kim2023variational,
title = {Variational curriculum reinforcement learning for unsupervised discovery of skills},
author = {Kim, Seongun and Lee, Kyowoon and Choi, Jaesik},
journal = {arXiv preprint arXiv:2310.19424},
year = {2023}
}
@article{krishnan2020improving,
title = {Improving model calibration with accuracy versus uncertainty optimization},
author = {Krishnan, Ranganath and Tickoo, Omesh},
journal = {Advances in Neural Information Processing Systems},
volume = {33},
pages = {18237--18248},
year = {2020}
}
@article{grandvalet2004semi,
title = {Semi-supervised learning by entropy minimization},
author = {Grandvalet, Yves and Bengio, Yoshua},
journal = {Advances in Neural Information Processing Systems},
volume = {17},
year = {2004}
}
@article{wang2020tent,
title = {Tent: Fully test-time adaptation by entropy minimization},
author = {Wang, Dequan and Shelhamer, Evan and Liu, Shaoteng and Olshausen, Bruno and Darrell, Trevor},
journal = {arXiv preprint arXiv:2006.10726},
year = {2020}
}
@article{zhang2024come,
title = {{COME}: Test-time adaption by Conservatively Minimizing Entropy},
author = {Zhang, Qingyang and Bian, Yatao and Kong, Xinke and Zhao, Peilin and Zhang, Changqing},
journal = {arXiv preprint arXiv:2410.10894},
year = {2024}
}
@article{huang2024self,
title = {Self-Improvement in Language Models: The Sharpening Mechanism},
author = {Huang, Audrey and Block, Adam and Foster, Dylan J and Rohatgi, Dhruv and Zhang, Cyril and Simchowitz, Max and Ash, Jordan T and Krishnamurthy, Akshay},
journal = {arXiv preprint arXiv:2412.01951},
year = {2024}
}
@inproceedings{chapelle2005semi,
title = {Semi-supervised classification by low density separation},
author = {Chapelle, Olivier and Zien, Alexander},
booktitle = {International workshop on artificial intelligence and statistics},
pages = {57--64},
year = {2005},
organization = {PMLR}
}
@article{bai2022constitutional,
title = {Constitutional {AI}: Harmlessness from {AI} Feedback},
author = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
journal = {arXiv preprint arXiv:2212.08073},
year = {2022}
}
@article{chen2025seed,
title = {{SEED-GRPO}: Semantic Entropy Enhanced {GRPO} for Uncertainty-Aware Policy Optimization},
author = {Chen, Minghan and Chen, Guikun and Wang, Wenguan and Yang, Yi},
journal = {arXiv preprint arXiv:2505.12346},
year = {2025}
}
@article{cheng2025reasoning,
title = {Reasoning with Exploration: An Entropy Perspective},
author = {Cheng, Daixuan and Huang, Shaohan and Zhu, Xuekai and Dai, Bo and Zhao, Wayne Xin and Zhang, Zhenliang and Wei, Furu},
journal = {arXiv preprint arXiv:2506.14758},
year = {2025}
}
@article{zhao2025absolute,
title = {Absolute zero: Reinforced self-play reasoning with zero data},
author = {Zhao, Andrew and Wu, Yiran and Yue, Yang and Wu, Tong and Xu, Quentin and Lin, Matthieu and Wang, Shenzhi and Wu, Qingyun and Zheng, Zilong and Huang, Gao},
journal = {arXiv preprint arXiv:2505.03335},
year = {2025}
}
@article{press2024entropy,
title = {The entropy enigma: Success and failure of entropy minimization},
author = {Press, Ori and Shwartz-Ziv, Ravid and LeCun, Yann and Bethge, Matthias},
journal = {arXiv preprint arXiv:2405.05012},
year = {2024}
}
@article{lv2025climb,
title = {The Climb Carves Wisdom Deeper Than the Summit: On the Noisy Rewards in Learning to Reason},
author = {Lv, Ang and Xie, Ruobing and Sun, Xingwu and Kang, Zhanhui and Yan, Rui},
journal = {arXiv preprint arXiv:2505.22653},
year = {2025}
}
## Reward Design - Credit Assignment
@article{uesato2022solving,
title = {Solving math word problems with process- and outcome-based feedback},
author = {Uesato, Jonathan and Kushman, Nate and Kumar, Ramana and Song, Francis and Siegel, Noah and Wang, Lisa and Creswell, Antonia and Irving, Geoffrey and Higgins, Irina},
journal = {arXiv preprint arXiv:2211.14275},
year = {2022}
}
@inproceedings{lightman2023let,
title = {Let's Verify Step by Step},
author = {Hunter Lightman and Vineet Kosaraju and Yuri Burda and Harrison Edwards and Bowen Baker and Teddy Lee and Jan Leike and John Schulman and Ilya Sutskever and Karl Cobbe},
booktitle = {The Twelfth International Conference on Learning Representations},
year = {2024},
url = {https://openreview.net/forum?id=v8L0pN6EOi}
}
@inproceedings{wang2023math,
title = {Math-Shepherd: Verify and Reinforce {LLM}s Step-by-step without Human Annotations},
author = {Wang, Peiyi and
Li, Lei and
Shao, Zhihong and
Xu, Runxin and
Dai, Damai and
Li, Yifei and
Chen, Deli and
Wu, Yu and
Sui, Zhifang},
editor = {Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek},
booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
month = aug,
year = {2024},
address = {Bangkok, Thailand},
publisher = {Association for Computational Linguistics},
url = {https://aclanthology.org/2024.acl-long.510/},
doi = {10.18653/v1/2024.acl-long.510},
pages = {9426--9439}
}
@article{rafailov2023direct,
title = {Direct preference optimization: Your language model is secretly a reward model},
author = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Manning, Christopher D and Ermon, Stefano and Finn, Chelsea},
journal = {Advances in Neural Information Processing Systems},
volume = {36},
pages = {53728--53741},
year = {2023}
}
@inproceedings{rafailov2024r,
title = {From r to Q star: Your Language Model is Secretly a Q-Function},
author = {Rafael Rafailov and Joey Hejna and Ryan Park and Chelsea Finn},
booktitle = {First Conference on Language Modeling},
year = {2024},
url = {https://openreview.net/forum?id=kEVcNxtqXk}
}
@inproceedings{yuan2024free,
title = {Free Process Rewards without Process Labels},
author = {Lifan Yuan and Wendi Li and Huayu Chen and Ganqu Cui and Ning Ding and Kaiyan Zhang and Bowen Zhou and Zhiyuan Liu and Hao Peng},
booktitle = {Forty-second International Conference on Machine Learning},
year = {2025},
url = {https://openreview.net/forum?id=8ThnPFhGm8}
}
@article{zhang2025lessons,
title = {The Lessons of Developing Process Reward Models in Mathematical Reasoning},
author = {Zhenru Zhang and Chujie Zheng and Yangzhen Wu and Beichen Zhang and Runji Lin and Bowen Yu and Dayiheng Liu and Jingren Zhou and Junyang Lin},
journal = {arXiv preprint arXiv:2501.07301},
year = {2025}
}
@inproceedings{hadfield2017inverse,
author = {Hadfield-Menell, Dylan and Milli, Smitha and Abbeel, Pieter and Russell, Stuart J and Dragan, Anca},
booktitle = {Advances in Neural Information Processing Systems},
editor = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
publisher = {Curran Associates, Inc.},
title = {Inverse Reward Design},
url = {https://proceedings.neurips.cc/paper_files/paper/2017/file/32fdab6559cdfa4f167f8c31b9199643-Paper.pdf},
volume = {30},
year = {2017}
}
@article{schrittwieser2020mastering,
title = {Mastering atari, go, chess and shogi by planning with a learned model},
author = {Schrittwieser, Julian and Antonoglou, Ioannis and Hubert, Thomas and Simonyan, Karen and Sifre, Laurent and Schmitt, Simon and Guez, Arthur and Lockhart, Edward and Hassabis, Demis and Graepel, Thore and others},
journal = {Nature},
volume = {588},
number = {7839},
pages = {604--609},
year = {2020},
publisher = {Nature Publishing Group UK London}
}
@article{silver2018general,
title = {A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play},
author = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and others},
journal = {Science},
volume = {362},
number = {6419},
pages = {1140--1144},
year = {2018},
publisher = {American Association for the Advancement of Science}
}
@inproceedings{arulkumaran2019alphastar,
title = {Alphastar: An evolutionary computation perspective},
author = {Arulkumaran, Kai and Cully, Antoine and Togelius, Julian},
booktitle = {Proceedings of the genetic and evolutionary computation conference companion},
pages = {314--315},
year = {2019}
}
@article{silver2017mastering,
title = {Mastering chess and shogi by self-play with a general reinforcement learning algorithm},
author = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and others},
journal = {arXiv preprint arXiv:1712.01815},
year = {2017}
}
@article{silver2016mastering,
title = {Mastering the game of Go with deep neural networks and tree search},
author = {Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},
journal = {Nature},
volume = {529},
number = {7587},
pages = {484--489},
year = {2016},
publisher = {Nature Publishing Group}
}
@inproceedings{liu2022meta,
author = {Liu, Runze and Bai, Fengshuo and Du, Yali and Yang, Yaodong},
booktitle = {Advances in Neural Information Processing Systems},
editor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
pages = {22270--22284},
publisher = {Curran Associates, Inc.},
title = {Meta-Reward-Net: Implicitly Differentiable Reward Learning for Preference-based Reinforcement Learning},
url = {https://proceedings.neurips.cc/paper_files/paper/2022/file/8be9c134bb193d8bd3827d4df8488228-Paper-Conference.pdf},
volume = {35},
year = {2022}
}
@article{sun2025large,
title = {A large language model-driven reward design framework via dynamic feedback for reinforcement learning},
journal = {Knowledge-Based Systems},
volume = {326},
pages = {114065},
year = {2025},
issn = {0950-7051},
doi = {https://doi.org/10.1016/j.knosys.2025.114065},
url = {https://www.sciencedirect.com/science/article/pii/S0950705125011104},
author = {Shengjie Sun and Runze Liu and Jiafei Lyu and Jing-Wen Yang and Liangpeng Zhang and Xiu Li},
keywords = {Reinforcement learning, Large language model, Reward design}
}
@article{liu2025can1b,
title = {Can {1B} {LLM} Surpass {405B} {LLM}? Rethinking Compute-Optimal Test-Time Scaling},
author = {Liu, Runze and Gao, Junqi and Zhao, Jian and Zhang, Kaiyan and Li, Xiu and Qi, Biqing and Ouyang, Wanli and Zhou, Bowen},
journal = {arXiv preprint arXiv:2502.06703},
year = {2025}
}
@article{zou2025reasonflux,
title = {{ReasonFlux-PRM}: Trajectory-Aware {PRM}s for Long Chain-of-Thought Reasoning in {LLM}s},
author = {Zou, Jiaru and Yang, Ling and Gu, Jingwen and Qiu, Jiahao and Shen, Ke and He, Jingrui and Wang, Mengdi},
journal = {arXiv preprint arXiv:2506.18896},
year = {2025}
}
@article{dong2025tool,
title = {{Tool-Star}: Empowering {LLM}-Brained Multi-Tool Reasoner via Reinforcement Learning},
author = {Dong, Guanting and Chen, Yifei and Li, Xiaoxi and Jin, Jiajie and Qian, Hongjin and Zhu, Yutao and Mao, Hangyu and Zhou, Guorui and Dou, Zhicheng and Wen, Ji-Rong},
journal = {arXiv preprint arXiv:2505.16410},
year = {2025}
}
@article{wang2025stepsearch,
title = {{StepSearch}: Igniting {LLM}s Search Ability via Step-Wise Proximal Policy Optimization},
author = {Wang, Ziliang and Zheng, Xuhui and An, Kang and Ouyang, Cijun and Cai, Jialu and Wang, Yuhang and Wu, Yichao},
journal = {arXiv preprint arXiv:2505.15107},
year = {2025}
}
@article{zheng2024processbench,
title = {{ProcessBench}: Identifying Process Errors in Mathematical Reasoning},
author = {Zheng, Chujie and Zhang, Zhenru and Zhang, Beichen and Lin, Runji and Lu, Keming and Yu, Bowen and Liu, Dayiheng and Zhou, Jingren and Lin, Junyang},
journal = {arXiv preprint arXiv:2412.06559},
year = {2024}
}
@inproceedings{zhang2025openprm,
title = {Open{PRM}: Building Open-domain Process-based Reward Models with Preference Trees},