style(nyz): polish rl_utils style details (ci skip)

PaParaZz1 · PaParaZz1 · commit 6c2ca2fac2ec · 2025-03-01T18:43:52.000+08:00
diff --git a/ding/rl_utils/README.md b/ding/rl_utils/README.md
@@ -98,6 +98,6 @@ Peak GPU Memory: 2560.27 MB
 To run the tests:
 
 ```bash
-pytest -v -s test_log_prob_fn.py
+pytest -v -s tests/test_log_prob_utils.py
 ```
 
diff --git a/ding/rl_utils/grpo.py b/ding/rl_utils/grpo.py
@@ -15,7 +15,7 @@ def grpo_policy_error(
 ) -> Tuple[namedtuple, namedtuple]:
     """
         Overview:
-             Group Relative Policy Optimization(	arxiv:2402.03300) .
+            Group Relative Policy Optimization (GRPO) algorithm, see https://arxiv.org/abs/2402.03300.
         Arguments:
             - data (:obj:`namedtuple`): the grpo input data with fields shown in ``grpo_policy_data``.
             - clip_ratio (:obj:`float`): the ppo clip ratio for the constraint of policy update, defaults to 0.2.
diff --git a/ding/rl_utils/rloo.py b/ding/rl_utils/rloo.py
@@ -14,7 +14,7 @@ def rloo_policy_error(
 ) -> Tuple[namedtuple, namedtuple]:
     """
     Overview:
-        REINFORCE Leave-One-Out(arXiv:2402.14740)
+        REINFORCE Leave-One-Out (RLOO) algorithm, see https://arxiv.org/abs/2402.14740.
     Arguments:
         - data (:obj:`namedtuple`): the rloo input data with fields shown in ``rloo_policy_data``.
         - clip_ratio (:obj:`float`): the ppo clip ratio for the constraint of policy update, defaults to 0.2.