-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Description
Prerequisite
- I have searched Issues and Discussions but cannot get the expected help.
- The bug has not been fixed in the latest version(https://github.com/open-mmlab/mmpose).
Environment
OrderedDict([('sys.platform', 'linux'), ('Python', '3.8.0 (default, Nov 6 2019, 21:49:08) [GCC 7.3.0]'), ('CUDA available', True), ('MUSA available', False), ('numpy_random_seed', 2147483648), ('GPU 0', 'NVIDIA GeForce RTX 4090'), ('CUDA_HOME', '/usr/local/cuda'), ('NVCC', 'Cuda compilation tools, release 12.4, V12.4.131'), ('GCC', 'gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0'), ('PyTorch', '1.8.0+cu111'), ('PyTorch compiling details', 'PyTorch built with:\n - GCC 7.3\n - C++ Version: 201402\n - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v1.7.0 (Git Hash 7aed236906b1f7a05c0917e5257a1af05e9ff683)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 11.1\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86\n - CuDNN 8.0.5\n - Magma 2.5.2\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.1, CUDNN_VERSION=8.0.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.8.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \n'), ('TorchVision', '0.9.0+cu111'), ('OpenCV', '4.10.0'), ('MMEngine', '0.10.4'), ('MMPose', '1.3.2+')])
mmcv 2.2.0
mmcv-full 1.7.0
mmdeploy 1.3.1 /home/ubuntu/data2/ll/volleyball/mmpose-main/mmdeploy
mmdet 3.3.0 /home/ubuntu/data2/ll/volleyball/mmpose-main/mmdetection-main
mmengine 0.10.4
mmpose 1.3.2 /home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main
timm 0.6.5
Reproduces the problem - code sample
base = ['../../../base/default_runtime.py']
runtime
max_epochs = 300
stage2_num_epochs = 30
base_lr = 4e-3
train_cfg = dict(max_epochs=max_epochs, val_interval=5)
randomness = dict(seed=21)
optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=15345*5),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
codec settings
codec = dict(
type='SimCCLabel',
input_size=(288, 384),
sigma=(6., 6.93),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
model settings
model
widen_factor = 0.5
deepen_factor = 0.33
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
scope='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.67,
widen_factor=0.75,
out_indices=(2,3,4 ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='/home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main/preweight/cspnext-m_udp-body7_210e-384x288-b9bc2b57_20230504.pth' # noqa
)),
neck=dict(
type='HybridEncoder',
in_channels=[192, 384, 768],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
use_encoder_idx=[2],
output_indices=[2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
),
head=dict(
type='RTMCCHead',
in_channels=256,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True, ))
base dataset settings
backend_args = dict(backend='local')
backend_args = dict(
backend='petrel',
path_mapping=dict({
f'{data_root}': 's3://openmmlab/datasets/',
f'{data_root}': 's3://openmmlab/datasets/'
}))
pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.75, 1.25],
rotate_factor=60),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
train datasets
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = r'/home/ubuntu/xxf/dataset/body7/'
data loaders
train_dataloader = dict(
batch_size=64,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/body7_train.json',
metainfo=dict(from_file='/home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main/configs/base/datasets/coco.py'),
data_prefix=dict(img='images/train'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=32,
num_workers=6,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
# bbox_file='',
bbox_file=f'{data_root}person_detection_results/COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='images/val'),
metainfo=dict(from_file='/home/ubuntu/data2/ll/volleyball/mmpose-main/mmpose-main/configs/base/datasets/coco.py'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
hooks
default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=3))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/person_keypoints_val2017.json')
test_evaluator = val_evaluator
Reproduces the problem - command or script
nohup python3 train.py > rtmpose_m_384_body7_1114_3.log 2>&1 &
Reproduces the problem - error message
at Nov 15 15:31:04 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01 Driver Version: 535.183.01 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 4090 Off | 00000000:AF:00.0 Off | Off |
| 34% 33C P8 18W / 450W | 15451MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1651 G /usr/lib/xorg/Xorg 9MiB |
| 0 N/A N/A 1815 G /usr/bin/gnome-shell 8MiB |
| 0 N/A N/A 4038238 C python3 15416MiB |
Additional information
During the model training process, it often gets stuck with no progress, the GPU usage is 0, and the program doesn’t report any errors or crash.