Skip to content

Commit 6dcf560

Browse files
committed
Refactor ActorsHQ scripts for modular execution and fix LPIPS error handling
- Add ValueError handling in _safe_lpips for torchmetrics validation errors that occur when image value distributions differ
- Refactor run_actorshq.py to accept CLI arguments, making it callable from batch_run_actorshq.py as a subprocess
- Extract build_exp_name() helper to eliminate duplicate experiment naming logic
1 parent 1a24e75 commit 6dcf560

File tree

3 files changed

+113
-135
lines changed

3 files changed

+113
-135
lines changed

examples/simple_trainer.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,14 +414,26 @@ def rasterize_splats(
414414
return render_colors, render_alphas, info
415415

416416
def _safe_lpips(self, preds: Tensor, targets: Tensor) -> Tensor:
    """Compute LPIPS, recovering from two known failure modes.

    * cuDNN internal error (``RuntimeError`` whose message contains
      ``CUDNN_STATUS_INTERNAL_ERROR``): the metric is recomputed once with
      cuDNN disabled.
    * Input validation error (``ValueError`` whose message contains
      "Expected both input arguments to be normalized tensors"): both inputs
      are clamped to the range the metric validates against and the metric
      is recomputed, so a slight overshoot still yields a real LPIPS value.
      Only if the clamped call fails with the same error is a warning
      printed and a zero scalar returned as a placeholder.

    Args:
        preds: Predicted images, forwarded to ``self.lpips``.
        targets: Reference images, forwarded to ``self.lpips``.

    Returns:
        The value produced by ``self.lpips``, or a zero scalar on
        ``preds.device`` when validation cannot be satisfied.

    Raises:
        RuntimeError: Any runtime error other than the cuDNN internal error.
        ValueError: Any value error other than the validation error above.
    """
    validation_marker = "Expected both input arguments to be normalized tensors"

    try:
        return self.lpips(preds, targets)
    except RuntimeError as exc:
        if "CUDNN_STATUS_INTERNAL_ERROR" not in str(exc):
            raise
        with torch.backends.cudnn.flags(enabled=False):
            return self.lpips(preds, targets)
    except ValueError as exc:
        if validation_marker not in str(exc):
            raise

    # Only the validation failure reaches this point. Returning a constant
    # here would report a "perfect" score, so first retry with in-range inputs.
    # NOTE(review): assumes ``self.lpips`` exposes a boolean ``normalize``
    # attribute like torchmetrics' LearnedPerceptualImagePatchSimilarity
    # ([0, 1] when True, [-1, 1] otherwise); when the attribute is missing
    # the wider [-1, 1] range is used. Confirm against the metric in use.
    lower = 0.0 if getattr(self.lpips, "normalize", False) else -1.0
    try:
        return self.lpips(preds.clamp(lower, 1.0), targets.clamp(lower, 1.0))
    except ValueError as exc:
        # Clamping cannot repair e.g. a shape problem or non-finite values;
        # keep the previous best-effort behaviour for that case.
        if validation_marker not in str(exc):
            raise

    print(
        f"Warning: LPIPS validation failed (preds range: [{preds.min():.4f}, {preds.max():.4f}], "
        f"targets range: [{targets.min():.4f}, {targets.max():.4f}]). "
        f"Returning 0.0 as placeholder."
    )
    return torch.tensor(0.0, device=preds.device)
425437

426438
def train(self):
427439
cfg = self.cfg

scripts/batch_run_actorshq.py

Lines changed: 22 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,17 @@
1313

1414
# ================= Configuration =================
1515
# Modify these lists to specify which actors, sequences, and frames to run
16-
ACTORS = ["Actor01", "Actor02", "Actor03", "Actor04", "Actor05", "Actor06", "Actor07", "Actor08"]
16+
# ACTORS = ["Actor01", "Actor02", "Actor03", "Actor04", "Actor05", "Actor06", "Actor07", "Actor08"]
17+
ACTORS = ["Actor06", "Actor02"]
1718
SEQUENCES = ["Sequence1"] # e.g., ["Sequence1", "Sequence2"]
18-
FRAME_IDS = [0]
19+
FRAME_IDS = [1]
1920

2021
# Method: "train" or "eval"
2122
METHOD = "train"
2223

2324
# GPUs to use (list of GPU IDs, e.g., ["0", "1", "2", "3"] or ["0"])
2425
# Jobs will be distributed across these GPUs in parallel
25-
CUDA_DEVICES = ["0", "1"]
26+
CUDA_DEVICES = ["0", "1", "2", "3"]
2627

2728
# Base data directory
2829
BASE_DATA_DIR = "/synology/actorshq/colmap"
@@ -32,6 +33,9 @@
3233

3334
# Root run path (working directory for running experiments)
3435
ROOT_RUN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
36+
37+
# Path to the run_actorshq.py script
38+
RUN_SCRIPT_PATH = os.path.join(os.path.dirname(__file__), "run_actorshq.py")
3539
# ================================================
3640

3741

@@ -44,110 +48,40 @@ class RunConfig:
4448
cuda_device: str = "1"
4549

4650

47-
def build_data_dir(actor: str, sequence: str, resolution: int = 4) -> str:
48-
"""Build the data directory path for a given actor and sequence."""
49-
return f"{BASE_DATA_DIR}/{actor}/{sequence}/{resolution}x/frames"
51+
def build_data_dir(actor: str, sequence: str, frame_id: int, resolution: int = 4) -> str:
    """Return the per-frame data directory for an actor/sequence pair.

    The result has the form
    ``{BASE_DATA_DIR}/{actor}/{sequence}/{resolution}x/frames/frame{frame_id}``.
    """
    frames_root = f"{BASE_DATA_DIR}/{actor}/{sequence}/{resolution}x/frames"
    return f"{frames_root}/frame{frame_id}"
5054

5155

5256
def run_single_experiment(config: RunConfig):
    """Run one experiment by launching ``run_actorshq.py`` as a subprocess.

    The child process is started in ``ROOT_RUN_PATH`` with
    ``CUDA_VISIBLE_DEVICES`` set to ``config.cuda_device`` and receives the
    data directory, frame id, method and experiment-name prefix as CLI
    arguments; the viewer is always disabled.

    Args:
        config: Supplies ``actor``, ``sequence``, ``frame_id``, ``method``
            and ``cuda_device`` for this run.

    Returns:
        The exit code of the child process.
    """
    # Local import so this function does not depend on a module-level `sys`.
    import sys

    data_dir = build_data_dir(config.actor, config.sequence, config.frame_id, RESOLUTION)
    exp_name_prefix = f"{config.actor}_{config.sequence}"

    print(f"\n{'='*60}")
    print(f"Running: Actor={config.actor}, Sequence={config.sequence}, Frame={config.frame_id}")
    print(f"Data dir: {data_dir}")
    print(f"Method: {config.method}")
    print(f"GPU: {config.cuda_device}")
    print(f"{'='*60}\n")

    # Pin the child to the requested GPU without touching our own environment.
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = config.cuda_device

    cmd = [
        # Use the interpreter running this script (same virtualenv/conda env)
        # rather than whatever "python" resolves to on PATH.
        sys.executable or "python",
        # The child runs with cwd=ROOT_RUN_PATH, so a relative script path
        # must be resolved against the current directory before launching.
        os.path.abspath(RUN_SCRIPT_PATH),
        "--data_dir", data_dir,
        "--frame_id", str(config.frame_id),
        "--method", config.method,
        "--exp_name_prefix", exp_name_prefix,
        "--disable_viewer",
    ]

    result = subprocess.run(cmd, env=env, cwd=ROOT_RUN_PATH)
    return result.returncode
15286

15387

scripts/run_actorshq.py

Lines changed: 78 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from dataclasses import dataclass
2-
from typing import ClassVar
2+
from typing import ClassVar, Optional
3+
import argparse
34
import os
45
import sys
56

@@ -11,6 +12,23 @@
1112
from examples.config import Config, load_config_from_toml, merge_config
1213
from scripts.utils import set_result_dir
1314

15+
16+
def parse_args(argv: Optional[list] = None):
    """Parse the command-line options of the ActorsHQ runner.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            ``argparse`` read ``sys.argv[1:]``, which is the behaviour of a
            plain ``parse_args()`` call; passing a list allows programmatic
            use and testing.

    Returns:
        An ``argparse.Namespace`` with ``data_dir``, ``frame_id`` and
        ``method`` (each ``None`` when not given), ``exp_name_prefix``
        (default ``"actorshq"``), ``config`` (default
        ``"./configs/actorshq.toml"``) and ``disable_viewer`` (``False``
        unless the flag is present).
    """
    parser = argparse.ArgumentParser(description="Run ActorsHQ training/evaluation")
    parser.add_argument("--data_dir", type=str, default=None,
                        help="Path to the data directory (overrides config)")
    parser.add_argument("--frame_id", type=int, default=None,
                        help="Frame ID to train/evaluate")
    parser.add_argument("--method", type=str, choices=["train", "eval"], default=None,
                        help="Method: train or eval")
    parser.add_argument("--exp_name_prefix", type=str, default="actorshq",
                        help="Prefix for experiment name (e.g., 'Actor02_Sequence1')")
    parser.add_argument("--config", type=str, default="./configs/actorshq.toml",
                        help="Path to config file")
    parser.add_argument("--disable_viewer", action="store_true",
                        help="Disable the viewer")
    return parser.parse_args(argv)
31+
1432
def run_experiment(config: Config, dist=False):
1533
print(
1634
f"------- Running: "
@@ -73,59 +91,73 @@ class Method:
7391
"""
7492

7593
# ================= Global Configurations =================
76-
method = Method.train
77-
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
94+
# These are used when running without command-line arguments
95+
DEFAULT_METHOD = Method.train
96+
DEFAULT_CUDA_DEVICE = "0"
97+
DEFAULT_START_FRAME = 0
98+
DEFAULT_END_FRAME = 0
7899
# =========================================================
79100

80-
if __name__ == '__main__':
81-
# build default config
101+
102+
def build_exp_name(cfg: Config, prefix: str = "actorshq") -> str:
    """Derive the experiment name from the loss settings in ``cfg``.

    The name starts with ``{prefix}_l1_{1.0 - ssim_lambda}_ssim_{ssim_lambda}``
    and gains one suffix per enabled option, in this order: masked L1,
    masked SSIM, alpha loss, scale-variance loss, random background.
    """
    pieces = [f"{prefix}_l1_{1.0 - cfg.ssim_lambda}_ssim_{cfg.ssim_lambda}"]

    # (enable-flag attribute, tag in the name, weight attribute); the weight
    # is only read when its flag is set.
    weighted_options = (
        ("masked_l1_loss", "ml1", "masked_l1_lambda"),
        ("masked_ssim_loss", "mssim", "masked_ssim_lambda"),
        ("alpha_loss", "alpha", "alpha_lambda"),
        ("scale_var_loss", "svar", "scale_var_lambda"),
    )
    for flag_attr, tag, weight_attr in weighted_options:
        if getattr(cfg, flag_attr):
            pieces.append(f"_{tag}_{getattr(cfg, weight_attr)}")

    if cfg.random_bkgd:
        pieces.append("_rbkgd")

    return "".join(pieces)
116+
117+
118+
if __name__ == '__main__':
    # Entry point: CLI options override the template config and the
    # DEFAULT_* constants, then every frame in the range is trained or
    # evaluated according to the selected method.
    args = parse_args()

    # Keep a device chosen by the parent process; otherwise use the default.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", DEFAULT_CUDA_DEVICE)

    # Defaults first, then the TOML template merged on top of them.
    default_cfg = Config(strategy=DefaultStrategy(verbose=True))
    default_cfg.adjust_steps(default_cfg.steps_scaler)
    cfg = merge_config(default_cfg, load_config_from_toml(args.config))

    if args.data_dir is not None:
        cfg.data_dir = args.data_dir

    method = args.method or DEFAULT_METHOD

    # A single --frame_id narrows the range to exactly that frame.
    if args.frame_id is None:
        start_frame_id, end_frame_id = DEFAULT_START_FRAME, DEFAULT_END_FRAME
    else:
        start_frame_id = end_frame_id = args.frame_id

    exp_name = build_exp_name(cfg, args.exp_name_prefix)
    cfg.disable_viewer = args.disable_viewer

    # `method` is either a CLI string or DEFAULT_METHOD, so accept both forms.
    if method in (Method.eval, "eval"):
        iter_num = cfg.max_steps
        for frame_id in range(start_frame_id, end_frame_id + 1):
            print(f"\nEvaluating frame {frame_id}")
            evaluate_frame(frame_id, iter_num, cfg, exp_name)
    elif method in (Method.train, "train"):
        for frame_id in range(start_frame_id, end_frame_id + 1):
            print(f"\nTraining frame {frame_id}")
            train_frame(frame_id, cfg, exp_name)

0 commit comments

Comments
 (0)