Skip to content

Commit 3c13ec1

Browse files
committed
polish(pu): polish exp-name, add prompt_length log
1 parent 3a17c82 commit 3c13ec1

File tree

3 files changed

+60
-4
lines changed

3 files changed

+60
-4
lines changed

zoo/jericho/priorzero/models/actor.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,12 +263,24 @@ def train_batch(self, batch_data: Dict[str, torch.Tensor], kl_ctl: float, step_i
263263

264264
self.strategy.optimizer_step(self.actor_optim, self.actor, self.actor_scheduler, name="actor")
265265

266-
# Calculate response length statistics
266+
# Calculate response length statistics (action tokens)
267267
response_lengths = micro_batch['action_mask'].sum(dim=1).float()
268268
avg_response_length = response_lengths.mean().item()
269269
max_response_length = response_lengths.max().item()
270270
min_response_length = response_lengths.min().item()
271271

272+
# Calculate prompt length statistics (total - action tokens)
273+
total_lengths = micro_batch['attention_mask'].sum(dim=1).float()
274+
prompt_lengths = total_lengths - response_lengths
275+
avg_prompt_length = prompt_lengths.mean().item()
276+
max_prompt_length = prompt_lengths.max().item()
277+
min_prompt_length = prompt_lengths.min().item()
278+
279+
# Calculate total sequence length statistics
280+
avg_total_length = total_lengths.mean().item()
281+
max_total_length = total_lengths.max().item()
282+
min_total_length = total_lengths.min().item()
283+
272284
# Calculate log_probs statistics
273285
valid_log_probs = action_log_probs[micro_batch['action_mask'] > 0]
274286
avg_log_prob = valid_log_probs.mean().item() if valid_log_probs.numel() > 0 else 0.0
@@ -288,10 +300,18 @@ def train_batch(self, batch_data: Dict[str, torch.Tensor], kl_ctl: float, step_i
288300
# "approx_kl": approx_kl.detach().float().mean().item(),
289301
"cur_old_kl": approx_kl.detach().float().mean().item(),
290302
"iter": self.train_iter,
291-
# Response length statistics
303+
# Response length statistics (action tokens)
292304
"response_length_avg": avg_response_length,
293305
"response_length_max": max_response_length,
294306
"response_length_min": min_response_length,
307+
# Prompt length statistics (context tokens)
308+
"prompt_length_avg": avg_prompt_length,
309+
"prompt_length_max": max_prompt_length,
310+
"prompt_length_min": min_prompt_length,
311+
# Total sequence length statistics
312+
"total_length_avg": avg_total_length,
313+
"total_length_max": max_total_length,
314+
"total_length_min": min_total_length,
295315
# Log prob and ratio statistics
296316
"log_prob_avg": avg_log_prob,
297317
"ratio_avg": avg_ratio,

zoo/jericho/priorzero/priorzero_config.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,40 @@ def get_priorzero_config(
412412
print(f" - Tensor Parallel Size: {llm_config.vllm_tensor_parallel_size}")
413413
print(f" - GPU Memory Utilization: {llm_config.gpu_memory_utilization}")
414414

415+
# Auto-generate exp_name with key configuration info if not provided
416+
if exp_name is None:
417+
# Extract key configuration parameters
418+
adv_type = llm_config.advantage_type
419+
adv_type_short = {
420+
'advantage': 'adv',
421+
'target_reward': 'tgt-rew',
422+
'advantage_batch_norm': 'adv-bn',
423+
'advantage_running_norm': 'adv-rn',
424+
}.get(adv_type, adv_type)
425+
426+
# Prior temperature schedule info
427+
prior_temp_cfg = llm_config.prior_temp_schedule
428+
if prior_temp_cfg.enable:
429+
prior_temp_str = f"pt-{prior_temp_cfg.schedule_type[:3]}-{prior_temp_cfg.init_temperature:.1f}to{prior_temp_cfg.final_temperature:.1f}"
430+
else:
431+
prior_temp_str = "pt-off"
432+
433+
# CoT info
434+
cot_str = "cot" if use_cot else "nocot"
435+
436+
# Format reward info
437+
fmt_rew_str = "fmt" if llm_config.reward_func.format_reward else "nofmt"
438+
439+
# Build exp_name
440+
exp_name = (
441+
f"data_priorzero/pz_{env_id}_{model_key}_"
442+
f"{cot_str}_{adv_type_short}_{prior_temp_str}_{fmt_rew_str}_seed{seed}"
443+
)
444+
445+
# Update config with generated exp_name
446+
main_config.exp_name = exp_name
447+
print(f"\n[Config] Auto-generated exp_name: {exp_name}\n")
448+
415449
return main_config, create_config, llm_config
416450

417451

zoo/jericho/priorzero/priorzero_entry_sync_ddp.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ def main():
350350
print(f"Quick Test: {args.quick_test}")
351351
print(f"{'='*80}\n")
352352

353-
# use_cot = True
353+
# use_cot = True
354354
if args.quick_test:
355355
logger.info("Using quick test configuration")
356356
main_cfg, create_cfg, llm_cfg = get_priorzero_debug_config(
@@ -359,9 +359,11 @@ def main():
359359
model_key=model_key,
360360
)
361361
else:
362+
# Generate exp_name with key configuration info
363+
# This will be called after get_priorzero_config, so we'll modify it there
362364
main_cfg, create_cfg, llm_cfg = get_priorzero_config(
363365
args.env_id, args.seed, use_cot=args.use_cot,
364-
exp_name=f'data_priorzero/priorzero_ddp_ppo_{args.env_id}_use_cot_{args.use_cot}_{model_key}_with_fmtReward_seed0',
366+
exp_name=None, # Will be auto-generated with config info
365367
model_key=model_key,
366368
multi_gpu=True
367369
)

0 commit comments

Comments (0)