-
Notifications
You must be signed in to change notification settings - Fork 27
Expand file tree
/
Copy pathtrain.env
More file actions
47 lines (42 loc) · 1.97 KB
/
train.env
File metadata and controls
47 lines (42 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Environment file for running HoloMotion scripts.
# Intended to be sourced by the training scripts: it exports the variables
# below and prints a short summary of the resulting environment to stdout.

# Resolve the conda base directory. The assignment is kept separate from
# `export` so a failing `conda` call is detected instead of being masked by
# the (always successful) exit status of `export`. The file is sourced, so we
# only warn and carry on rather than exiting the caller's shell.
if ! CONDA_BASE="$(conda info --base)"; then
  echo "WARNING: 'conda info --base' failed; CONDA_BASE and Train_CONDA_PREFIX may be invalid." >&2
fi
export CONDA_BASE
export Train_CONDA_PREFIX="${CONDA_BASE}/envs/holomotion_train"

# CUDA toolkit location. The conda-provided toolkit is kept as a commented-out
# alternative.
# export CUDA_HOME=$Train_CONDA_PREFIX
export CUDA_HOME=/usr/local/cuda
# export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$Train_CONDA_PREFIX/lib/:$Train_CONDA_PREFIX/lib/stubs"
# export LIBRARY_PATH="$Train_CONDA_PREFIX/lib/stubs:$Train_CONDA_PREFIX/lib:$LIBRARY_PATH"

# Hydra: show complete stack traces on errors.
export HYDRA_FULL_ERROR=1
# EULA acceptance flags (presumably read by Omniverse Kit / Isaac tooling so
# it does not prompt interactively — TODO confirm).
export OMNI_KIT_ACCEPT_EULA="YES"
export ACCEPT_EULA="YES"
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous.
# That is normally a debugging aid and slows training — confirm it is intended.
export CUDA_LAUNCH_BLOCKING=1
export USE_NVRTC=1
export HDF5_USE_FILE_LOCKING=FALSE
export HOLOMOTION_ISAAC_STAGGER_SEC=5

# Per-file HDF5 raw chunk cache size in bytes.
# Reduced to 4MB to allow keeping many shards open without OOM.
export HOLOMOTION_HDF5_RDCC_NBYTES=$((4 * 1024 * 1024))

# Maximum number of HDF5 motion shards kept open per DataLoader worker.
# Increased to 1000 to prevent file open/close churn (thrashing), which
# causes memory fragmentation/leaks in HDF5.
export HOLOMOTION_HDF5_MAX_OPEN_SHARDS=1000

# Summary of the effective environment. `${VAR:-}` keeps the expansion safe
# when the sourcing script runs under `set -u` and the variable is unset.
echo "--------------------------------"
echo "Train_CONDA_PREFIX: ${Train_CONDA_PREFIX}"
echo "CUDA_HOME: ${CUDA_HOME}"
echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-}"
echo "LIBRARY_PATH: ${LIBRARY_PATH:-}"
echo "HYDRA_FULL_ERROR: ${HYDRA_FULL_ERROR}"
echo "OMNI_KIT_ACCEPT_EULA: ${OMNI_KIT_ACCEPT_EULA}"
echo "HDF5_USE_FILE_LOCKING: ${HDF5_USE_FILE_LOCKING}"
echo "--------------------------------"
# Graceful shutdown function for training scripts.
# Note: Scripts must set TRAIN_PID variable and call: trap cleanup SIGINT SIGTERM
#
# Globals:
#   TRAIN_PID (read) - PID of the training process; may be unset or empty,
#                      in which case no signals are sent.
# Outputs:
#   Progress messages on stdout. Error output of the kill/pkill steps is
#   discarded.
# Returns:
#   Exit status of the final echo.
cleanup() {
  # Read the PID once; the default keeps this safe under `set -u`.
  local pid="${TRAIN_PID:-}"

  echo ""
  echo "🛑 Cleanup triggered - shutting down training process ${pid}..."

  # Silence stderr only for the duration of this group. The redirection is
  # undone automatically when the group ends, so the caller's original stderr
  # is restored (an `exec 2>&1` would instead leave stderr pointing at stdout).
  {
    if [[ -n "${pid}" ]]; then
      kill -TERM "${pid}" && echo " ✓ Sent TERM signal to process ${pid}"
    fi

    # Grace period for the training process to exit before its children are
    # signalled.
    sleep 2

    if [[ -n "${pid}" ]]; then
      pkill -P "${pid}" && echo " ✓ Killed child processes"
    fi
  } 2>/dev/null

  echo " ✓ Cleanup complete"
}