Refactor ActorsHQ scripts for modular execution and fix LPIPS error handling

haodongw101 · haodongw101 · commit 6dcf5606b95e · 2026-01-21T16:13:38.000-08:00
- Add ValueError handling in _safe_lpips for the torchmetrics input-validation error ("Expected both input arguments to be normalized tensors"); a warning is printed and 0.0 is returned as a placeholder
- Refactor run_actorshq.py to accept CLI arguments, making it callable from batch_run_actorshq.py as a subprocess
- Extract build_exp_name() helper to eliminate duplicate experiment naming logic
diff --git a/examples/simple_trainer.py b/examples/simple_trainer.py
@@ -414,14 +414,26 @@ def rasterize_splats(
         return render_colors, render_alphas, info
 
     def _safe_lpips(self, preds: Tensor, targets: Tensor) -> Tensor:
-        """Compute LPIPS with a cuDNN fallback for known internal errors."""
+        """Compute LPIPS with fallbacks for known errors."""
         try:
             return self.lpips(preds, targets)
         except RuntimeError as exc:
             if "CUDNN_STATUS_INTERNAL_ERROR" not in str(exc):
                 raise
             with torch.backends.cudnn.flags(enabled=False):
                 return self.lpips(preds, targets)
+        except ValueError as exc:
+            # Handle the torchmetrics LPIPS input-validation error, which is
+            # raised when an input is not accepted as a normalized tensor
+            # (observed, e.g., at step 0 before training).
+            if "Expected both input arguments to be normalized tensors" in str(exc):
+                print(
+                    f"Warning: LPIPS validation failed (preds range: [{preds.min():.4f}, {preds.max():.4f}], "
+                    f"targets range: [{targets.min():.4f}, {targets.max():.4f}]). "
+                    f"Returning 0.0 as placeholder."
+                )
+                return torch.tensor(0.0, device=preds.device)
+            raise
 
     def train(self):
         cfg = self.cfg
diff --git a/scripts/batch_run_actorshq.py b/scripts/batch_run_actorshq.py
@@ -13,16 +13,17 @@
 
 # ================= Configuration =================
 # Modify these lists to specify which actors, sequences, and frames to run
-ACTORS = ["Actor01", "Actor02", "Actor03", "Actor04", "Actor05", "Actor06", "Actor07", "Actor08"]
+# ACTORS = ["Actor01", "Actor02", "Actor03", "Actor04", "Actor05", "Actor06", "Actor07", "Actor08"]
+ACTORS = ["Actor06", "Actor02"]
 SEQUENCES = ["Sequence1"]  # e.g., ["Sequence1", "Sequence2"]
-FRAME_IDS = [0]
+FRAME_IDS = [1]
 
 # Method: "train" or "eval"
 METHOD = "train"
 
 # GPUs to use (list of GPU IDs, e.g., ["0", "1", "2", "3"] or ["0"])
 # Jobs will be distributed across these GPUs in parallel
-CUDA_DEVICES = ["0", "1"]
+CUDA_DEVICES = ["0", "1", "2", "3"]
 
 # Base data directory
 BASE_DATA_DIR = "/synology/actorshq/colmap"
@@ -32,6 +33,9 @@
 
 # Root run path (working directory for running experiments)
 ROOT_RUN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+
+# Absolute path to run_actorshq.py (the subprocess runs with cwd=ROOT_RUN_PATH)
+RUN_SCRIPT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "run_actorshq.py"))
 # ================================================
 
 
@@ -44,110 +48,40 @@ class RunConfig:
     cuda_device: str = "1"
 
 
-def build_data_dir(actor: str, sequence: str, resolution: int = 4) -> str:
-    """Build the data directory path for a given actor and sequence."""
-    return f"{BASE_DATA_DIR}/{actor}/{sequence}/{resolution}x/frames"
+def build_data_dir(actor: str, sequence: str, frame_id: int, resolution: int = 4) -> str:
+    """Build the data directory path for a given actor, sequence, and frame."""
+    return f"{BASE_DATA_DIR}/{actor}/{sequence}/{resolution}x/frames/frame{frame_id}"
 
 
 def run_single_experiment(config: RunConfig):
     """Run a single experiment with the given configuration."""
-    data_dir = build_data_dir(config.actor, config.sequence, RESOLUTION)
+    data_dir = build_data_dir(config.actor, config.sequence, config.frame_id, RESOLUTION)
+    exp_name_prefix = f"{config.actor}_{config.sequence}"
 
     print(f"\n{'='*60}")
     print(f"Running: Actor={config.actor}, Sequence={config.sequence}, Frame={config.frame_id}")
     print(f"Data dir: {data_dir}")
     print(f"Method: {config.method}")
+    print(f"GPU: {config.cuda_device}")
     print(f"{'='*60}\n")
 
     # Set environment variables
     env = os.environ.copy()
     env["CUDA_VISIBLE_DEVICES"] = config.cuda_device
 
-    # Build the command - we'll modify the config via command line or temp config
-    # For simplicity, we'll create a modified version of the script inline
-    script_content = f'''
-import sys
-import os
-sys.path.insert(0, "{ROOT_RUN_PATH}")
-from examples.simple_trainer import main2
-from gsplat.strategy import DefaultStrategy
-from examples.config import Config, load_config_from_toml, merge_config
-from scripts.utils import set_result_dir
-
-os.environ["CUDA_VISIBLE_DEVICES"] = "{config.cuda_device}"
-
-# Build default config
-default_cfg = Config(strategy=DefaultStrategy(verbose=True))
-default_cfg.adjust_steps(default_cfg.steps_scaler)
-
-# Load template config
-template_path = "./configs/actorshq.toml"
-cfg = load_config_from_toml(template_path)
-cfg = merge_config(default_cfg, cfg)
-
-# Override data directory
-cfg.data_dir = "{data_dir}/frame{config.frame_id}"
-
-# Build experiment name
-exp_name = f"{config.actor}_{config.sequence}_l1_{{1.0 - cfg.ssim_lambda}}_ssim_{{cfg.ssim_lambda}}"
-if cfg.masked_l1_loss:
-    exp_name += f"_ml1_{{cfg.masked_l1_lambda}}"
-if cfg.masked_ssim_loss:
-    exp_name += f"_mssim_{{cfg.masked_ssim_lambda}}"
-if cfg.alpha_loss:
-    exp_name += f"_alpha_{{cfg.alpha_lambda}}"
-if cfg.scale_var_loss:
-    exp_name += f"_svar_{{cfg.scale_var_lambda}}"
-if cfg.random_bkgd:
-    exp_name += "_rbkgd"
-
-cfg.disable_viewer = True
-frame_id = {config.frame_id}
-
-if "{config.method}" == "train":
-    cfg.exp_name = exp_name
-    cfg.scene_id = frame_id
-    set_result_dir(cfg, exp_name)
-    cfg.run_mode = "train"
-    cfg.save_ply = True
-    cfg.max_steps = 30000
-    cfg.save_steps = list(sorted(set(range(0, cfg.max_steps + 1, 10000)) | {{1}}))
-    cfg.ply_steps = cfg.save_steps
-    cfg.eval_steps = cfg.save_steps
-    cfg.init_type = "sfm"
-    cfg.strategy = DefaultStrategy(verbose=True)
-
-    print(f"Training frame {{frame_id}}")
-    print(f"exp_name={{cfg.exp_name}}, scene_id={{cfg.scene_id}}, run_mode={{cfg.run_mode}}")
-    main2(0, 0, 1, cfg)
-
-elif "{config.method}" == "eval":
-    cfg.exp_name = exp_name
-    cfg.run_mode = "eval"
-    cfg.init_type = "sfm"
-    cfg.save_ply = False
-    cfg.scene_id = frame_id
-    set_result_dir(cfg, exp_name=exp_name)
-    iter = cfg.max_steps
-    ckpt = os.path.join(f"{{cfg.result_dir}}/ckpts/ckpt_{{iter - 1}}_rank0.pt")
-    cfg.ckpt = ckpt
-
-    print(f"Evaluating frame {{frame_id}}")
-    main2(0, 0, 1, cfg)
-'''
-
-    # Write temp script and run it
-    temp_script = f"/tmp/run_actorshq_{config.actor}_{config.sequence}_{config.frame_id}.py"
-    with open(temp_script, "w") as f:
-        f.write(script_content)
+    # Build the command to call run_actorshq.py
+    cmd = [
+        "python", RUN_SCRIPT_PATH,
+        "--data_dir", data_dir,
+        "--frame_id", str(config.frame_id),
+        "--method", config.method,
+        "--exp_name_prefix", exp_name_prefix,
+        "--disable_viewer",
+    ]
 
     # Run the script
-    cmd = ["python", temp_script]
     result = subprocess.run(cmd, env=env, cwd=ROOT_RUN_PATH)
 
-    # Clean up temp script
-    os.remove(temp_script)
-
     return result.returncode
 
 
diff --git a/scripts/run_actorshq.py b/scripts/run_actorshq.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import ClassVar
+from typing import ClassVar, Optional
+import argparse
 import os
 import sys
 
@@ -11,6 +12,23 @@
 from examples.config import Config, load_config_from_toml, merge_config
 from scripts.utils import set_result_dir
 
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Run ActorsHQ training/evaluation")
+    parser.add_argument("--data_dir", type=str, default=None,
+                        help="Path to the data directory (overrides config)")
+    parser.add_argument("--frame_id", type=int, default=None,
+                        help="Frame ID to train/evaluate")
+    parser.add_argument("--method", type=str, choices=["train", "eval"], default=None,
+                        help="Method: train or eval")
+    parser.add_argument("--exp_name_prefix", type=str, default="actorshq",
+                        help="Prefix for experiment name (e.g., 'Actor02_Sequence1')")
+    parser.add_argument("--config", type=str, default="./configs/actorshq.toml",
+                        help="Path to config file")
+    parser.add_argument("--disable_viewer", action="store_true",
+                        help="Disable the viewer")
+    return parser.parse_args()
+
 def run_experiment(config: Config, dist=False):
     print(
             f"------- Running: "
@@ -73,59 +91,73 @@ class Method:
     """
 
 # ================= Global Configurations =================
-method = Method.train
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# These are used when running without command-line arguments
+DEFAULT_METHOD = Method.train
+DEFAULT_CUDA_DEVICE = "0"
+DEFAULT_START_FRAME = 0
+DEFAULT_END_FRAME = 0
 # =========================================================
 
-if __name__ == '__main__':    
-    # build default config
+
+def build_exp_name(cfg: Config, prefix: str = "actorshq") -> str:
+    """Build experiment name based on config settings."""
+    exp_name = f"{prefix}_l1_{1.0 - cfg.ssim_lambda}_ssim_{cfg.ssim_lambda}"
+    if cfg.masked_l1_loss:
+        exp_name += f"_ml1_{cfg.masked_l1_lambda}"
+    if cfg.masked_ssim_loss:
+        exp_name += f"_mssim_{cfg.masked_ssim_lambda}"
+    if cfg.alpha_loss:
+        exp_name += f"_alpha_{cfg.alpha_lambda}"
+    if cfg.scale_var_loss:
+        exp_name += f"_svar_{cfg.scale_var_lambda}"
+    if cfg.random_bkgd:
+        exp_name += "_rbkgd"
+    return exp_name
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    # Set CUDA device if not already set by parent process
+    if "CUDA_VISIBLE_DEVICES" not in os.environ:
+        os.environ["CUDA_VISIBLE_DEVICES"] = DEFAULT_CUDA_DEVICE
+
+    # Build default config
     default_cfg = Config(strategy=DefaultStrategy(verbose=True))
     default_cfg.adjust_steps(default_cfg.steps_scaler)
-    
-    # read the template of yaml from file
-    template_path = "./configs/actorshq.toml"
+
+    # Read the template config from file
+    template_path = args.config
     cfg = load_config_from_toml(template_path)
     cfg = merge_config(default_cfg, cfg)
-    
-    if method == Method.eval:
-        exp_name = f"actorshq_l1_{1.0 - cfg.ssim_lambda}_ssim_{cfg.ssim_lambda}"
-        if cfg.masked_l1_loss:
-            exp_name += f"_ml1_{cfg.masked_l1_lambda}"
-        if cfg.masked_ssim_loss:
-            exp_name += f"_mssim_{cfg.masked_ssim_lambda}"
-        if cfg.alpha_loss:
-            exp_name += f"_alpha_{cfg.alpha_lambda}"
-        if cfg.scale_var_loss:
-            exp_name += f"_svar_{cfg.scale_var_lambda}"
-        if cfg.random_bkgd:
-            exp_name += "_rbkgd"
-        # exp_name = exp_name + "_test"
-        
-        cfg.disable_viewer = False
-        iter = cfg.max_steps
-        start_frame_id = 0
-        end_frame_id = 0
-        
+
+    # Override data_dir if provided
+    if args.data_dir is not None:
+        cfg.data_dir = args.data_dir
+
+    # Determine method
+    method = args.method if args.method else DEFAULT_METHOD
+
+    # Determine frame range
+    if args.frame_id is not None:
+        start_frame_id = args.frame_id
+        end_frame_id = args.frame_id
+    else:
+        start_frame_id = DEFAULT_START_FRAME
+        end_frame_id = DEFAULT_END_FRAME
+
+    # Build experiment name
+    exp_name = build_exp_name(cfg, args.exp_name_prefix)
+
+    # Set viewer
+    cfg.disable_viewer = args.disable_viewer
+
+    if method == Method.eval or method == "eval":
+        iter_num = cfg.max_steps
         for frame_id in range(start_frame_id, end_frame_id + 1):
             print(f"\nEvaluating frame {frame_id}")
-            evaluate_frame(frame_id, iter, cfg, exp_name)
-    elif method == Method.train:
-        exp_name = f"actorshq_l1_{1.0 - cfg.ssim_lambda}_ssim_{cfg.ssim_lambda}"
-        if cfg.masked_l1_loss:
-            exp_name += f"_ml1_{cfg.masked_l1_lambda}"
-        if cfg.masked_ssim_loss:
-            exp_name += f"_mssim_{cfg.masked_ssim_lambda}"
-        if cfg.alpha_loss:
-            exp_name += f"_alpha_{cfg.alpha_lambda}"
-        if cfg.scale_var_loss:
-            exp_name += f"_svar_{cfg.scale_var_lambda}"
-        if cfg.random_bkgd:
-            exp_name += "_rbkgd"
-        # exp_name = exp_name + "_test"
-        
-        cfg.disable_viewer = False
-        start_frame_id = 0
-        end_frame_id = 0
+            evaluate_frame(frame_id, iter_num, cfg, exp_name)
+    elif method == Method.train or method == "train":
         for frame_id in range(start_frame_id, end_frame_id + 1):
             print(f"\nTraining frame {frame_id}")
             train_frame(frame_id, cfg, exp_name)