Skip to content

[Bug] Training is stuck #3252

@ll8657

Description

@ll8657

Prerequisite

Environment

OrderedDict([('sys.platform', 'linux'), ('Python', '3.8.0 (default, Nov 6 2019, 21:49:08) [GCC 7.3.0]'), ('CUDA available', True), ('MUSA available', False), ('numpy_random_seed', 2147483648), ('GPU 0', 'NVIDIA GeForce RTX 4090'), ('CUDA_HOME', '/usr/local/cuda'), ('NVCC', 'Cuda compilation tools, release 12.4, V12.4.131'), ('GCC', 'gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0'), ('PyTorch', '1.8.0+cu111'), ('PyTorch compiling details', 'PyTorch built with:\n - GCC 7.3\n - C++ Version: 201402\n - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v1.7.0 (Git Hash 7aed236906b1f7a05c0917e5257a1af05e9ff683)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 11.1\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86\n - CuDNN 8.0.5\n - Magma 2.5.2\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.1, CUDNN_VERSION=8.0.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.8.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \n'), ('TorchVision', '0.9.0+cu111'), ('OpenCV', '4.10.0'), ('MMEngine', '0.10.4'), ('MMPose', '1.3.2+')])
mmcv 2.2.0
mmcv-full 1.7.0
mmdeploy 1.3.1 /home/ubuntu/data2/ll/volleyball/mmpose-main/mmdeploy
mmdet 3.3.0 /home/ubuntu/data2/ll/volleyball/mmpose-main/mmdetection-main
mmengine 0.10.4
mmpose 1.3.2 /home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main
timm 0.6.5

Reproduces the problem - code sample

base = ['../../../base/default_runtime.py']

runtime

max_epochs = 300
stage2_num_epochs = 30
base_lr = 4e-3

train_cfg = dict(max_epochs=max_epochs, val_interval=5)
randomness = dict(seed=21)

optimizer

optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))

learning rate

param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=15345*5),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]

automatically scaling LR based on the actual training batch size

auto_scale_lr = dict(base_batch_size=1024)

codec settings

codec = dict(
type='SimCCLabel',
input_size=(288, 384),
sigma=(6., 6.93),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)

model settings

model

widen_factor = 0.5
deepen_factor = 0.33
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
scope='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.67,
widen_factor=0.75,
out_indices=(2,3,4 ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='/home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main/preweight/cspnext-m_udp-body7_210e-384x288-b9bc2b57_20230504.pth' # noqa
)),
neck=dict(
type='HybridEncoder',
in_channels=[192, 384, 768],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
use_encoder_idx=[2],
output_indices=[2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
),
head=dict(
type='RTMCCHead',
in_channels=256,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True, ))

base dataset settings

backend_args = dict(backend='local')

backend_args = dict(

backend='petrel',

path_mapping=dict({

f'{data_root}': 's3://openmmlab/datasets/',

f'{data_root}': 's3://openmmlab/datasets/'

}))

pipelines

train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]

train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.75, 1.25],
rotate_factor=60),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]

train datasets

dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = r'/home/ubuntu/xxf/dataset/body7/'

data loaders

train_dataloader = dict(
batch_size=64,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/body7_train.json',
metainfo=dict(from_file='/home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main/configs/base/datasets/coco.py'),
data_prefix=dict(img='images/train'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=32,
num_workers=6,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
# bbox_file='',
bbox_file=f'{data_root}person_detection_results/COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='images/val'),
metainfo=dict(from_file='/home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main/configs/base/datasets/coco.py'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader

hooks

default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=3))

custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]

evaluators

val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/person_keypoints_val2017.json')
test_evaluator = val_evaluator

Reproduces the problem - command or script

nohup python3 train.py > rtmpose_m_384_body7_1114_3.log 2>&1 &

Reproduces the problem - error message

at Nov 15 15:31:04 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01 Driver Version: 535.183.01 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 4090 Off | 00000000:AF:00.0 Off | Off |
| 34% 33C P8 18W / 450W | 15451MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1651 G /usr/lib/xorg/Xorg 9MiB |
| 0 N/A N/A 1815 G /usr/bin/gnome-shell 8MiB |
| 0 N/A N/A 4038238 C python3 15416MiB |

Additional information

During the model training process, it often gets stuck with no progress, the GPU usage is 0, and the program doesn’t report any errors or crash.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions