Skip to content

deadlock with mimalloc #13149

@LXYan2333

Description

@LXYan2333

Thank you for taking the time to submit an issue!

Background information

I ran my Open MPI program and found that it deadlocks when mimalloc is used as the allocator; mimalloc may call madvise() from inside its malloc(). The MPI threading level is serialized (MPI_THREAD_SERIALIZED).

The backtrace in gdb looks like:

Using host libthread_db library "/lib64/libthread_db.so.1".
0x00007ffff7bc081d in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) bt
#0  0x00007ffff7bc081d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007ffff7bb9ac9 in pthread_mutex_lock () from /lib64/libpthread.so.0
#2  0x00007ffff6899e56 in ofi_import_monitor_notify () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#3  0x00007ffff6a064ec in opal_mem_hooks_release_hook () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libopen-pal.so.80
#4  0x00007ffff6ab09b1 in _intercept_madvise () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libopen-pal.so.80
#5  0x00005555555ba5e5 in unix_madvise (addr=0x3a9ab680000, size=2424832, advice=4)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/prim/unix/prim.c:165
#6  _mi_prim_decommit (start=0x3a9ab680000, size=2424832, needs_recommit=0x7fffffffa28f)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/prim/unix/prim.c:368
#7  mi_os_decommit_ex (addr=addr@entry=0x3a9ab680000, size=size@entry=2424832, needs_recommit=needs_recommit@entry=0x7fffffffa28f, tld_stats=0x555556c1b380 <tld_main+960>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/os.c:435
#8  0x00005555555ba73d in _mi_os_purge_ex (p=0x3a9ab680000, size=size@entry=2424832, allow_reset=allow_reset@entry=true, stats=stats@entry=0x555556c1b380 <tld_main+960>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/os.c:485
#9  0x00005555555ba9f2 in _mi_os_purge (p=<optimized out>, size=2424832, stats=0x555556c1b380 <tld_main+960>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/os.c:499
#10 mi_segment_purge (segment=segment@entry=0x3a9aa000000, p=p@entry=0x3a9ab680000 "", size=size@entry=2424832, stats=stats@entry=0x555556c1b380 <tld_main+960>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:516
#11 0x00005555555babd9 in mi_segment_purge (segment=0x3a9aa000000, p=0x3a9ab680000 "", size=2424832, stats=<optimized out>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:584
#12 mi_segment_try_purge (force=<optimized out>, segment=0x3a9aa000000, stats=0x555556c1b380 <tld_main+960>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:586
#13 mi_segment_try_purge (segment=0x3a9aa000000, force=<optimized out>, stats=0x555556c1b380 <tld_main+960>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:570
#14 0x00005555555bdd43 in mi_segment_schedule_purge (segment=0x3a9aa000000, p=<optimized out>, size=<optimized out>, stats=<optimized out>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:557
#15 mi_segment_span_free (segment=segment@entry=0x3a9aa000000, slice_index=360, slice_count=<optimized out>, allow_purge=allow_purge@entry=true, tld=<optimized out>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:624
#16 0x00005555555bdefb in mi_segment_span_free_coalesce (slice=0x3a9aa007188, slice@entry=0x3a9aa007a98, tld=tld@entry=0x555556c1afe0 <tld_main+32>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:688
#17 0x00005555555be028 in mi_segment_page_clear (page=page@entry=0x3a9aa007a98, tld=tld@entry=0x555556c1afe0 <tld_main+32>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:1004
#18 0x00005555555be4cf in _mi_segment_page_free (page=0x3a9aa007a98, force=false, tld=0x555556c1afe0 <tld_main+32>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/segment.c:1019
#19 _mi_page_free (page=0x3a9aa007a98, pq=pq@entry=0x555556c1ac50 <_mi_heap_main+2192>, force=force@entry=false)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/page.c:427
#20 0x00005555555becdc in _mi_heap_collect_retired (heap=heap@entry=0x555556c1a3c0 <_mi_heap_main>, force=force@entry=false)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/page.c:483
#21 0x00005555555c0818 in mi_page_queue_find_free_ex (heap=0x555556c1a3c0 <_mi_heap_main>, pq=0x555556c1a860 <_mi_heap_main+1184>, first_try=true)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/page.c:751
#22 0x00005555555c0912 in mi_find_free_page (heap=0x555556c1a3c0 <_mi_heap_main>, size=<optimized out>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/page.c:789
#23 mi_find_page (heap=0x555556c1a3c0 <_mi_heap_main>, size=<optimized out>, huge_alignment=<optimized out>)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/page.c:887
#24 0x00005555555c1920 in _mi_malloc_generic (heap=0x555556c1a3c0 <_mi_heap_main>, size=40, zero=false, huge_alignment=0)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/mimalloc_for_simple_parallel/src/page.c:913
#25 0x00007ffff687d6be in ofi_rbmap_insert () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#26 0x00007ffff689b694 in ofi_mr_cache_search () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#27 0x00007ffff68c6b52 in vrb_mr_reg_iface () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
--Type <RET> for more, q to quit, c to continue without paging--c
#28 0x00007ffff68c6cdf in vrb_mr_regattr () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#29 0x00007ffff6916665 in rxm_msg_mr_reg_internal () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#30 0x00007ffff69167ff in rxm_msg_mr_regv () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#31 0x00007ffff6921663 in rxm_handle_rx_buf () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#32 0x00007ffff692183d in rxm_handle_recv_comp () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#33 0x00007ffff69235a8 in rxm_ep_do_progress () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#34 0x00007ffff6923712 in rxm_ep_progress () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#35 0x00007ffff688f7bd in ofi_cq_progress () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#36 0x00007ffff688e710 in ofi_cq_readfrom () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libfabric.so.1
#37 0x00007ffff74714d7 in ompi_mtl_ofi_progress_no_inline () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/libmpi.so.40
#38 0x00007ffff6a06c63 in opal_progress () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/./libopen-pal.so.80
#39 0x00007ffff733e341 in ompi_request_default_wait () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/libmpi.so.40
#40 0x00007ffff73a7fca in ompi_coll_base_bcast_intra_generic () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/libmpi.so.40
#41 0x00007ffff73dfcdd in ompi_coll_tuned_bcast_intra_dec_fixed () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/libmpi.so.40
#42 0x00007ffff73541b5 in PMPI_Bcast () from /export/home/yanlongxiang/miniconda3/envs/xscf/lib/libmpi.so.40
#43 0x0000555556ad0e43 in simple_parallel::bigmpi::Bcast (buffer=<optimized out>, count=258048, datatype=0x555556d29ac0 <ompi_mpi_byte>, root=root@entry=0, comm=0x3a9aa21bd00)
    at /export/home/yanlongxiang/.cache/CPM/simple_parallel/f7fbc346b406dfe6e368bde835390aacc6084c66/src/simple_parallel/bigmpi.cc:53

What version of Open MPI are you using? (e.g., v4.1.6, v5.0.1, git branch name and hash, etc.)

$  mpirun --version
mpirun (Open MPI) 5.0.7

Report bugs to https://www.open-mpi.org/community/help/

Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)

OpenMPI is installed from conda's conda-forge channel.

$  conda list
# packages in environment at /export/home/yanlongxiang/miniconda3/envs/xscf:
#
# Name                    Version                   Build  Channel
_openmp_mutex             4.5                  3_kmp_llvm    conda-forge
attr                      2.5.1                h166bdaf_1    conda-forge
binutils_impl_linux-64    2.43                 h4bf12b8_4    conda-forge
bzip2                     1.0.8                h4bc722e_7    conda-forge
c-ares                    1.34.4               hb9d3cd8_0    conda-forge
ca-certificates           2025.2.25            h06a4308_0  
ccache                    4.11                 hd714d17_0    conda-forge
cmake                     3.31.6               h74e3db0_0    conda-forge
gcc                       14.2.0               h96c4ede_2    conda-forge
gcc_impl_linux-64         14.2.0               hdb7739f_2    conda-forge
gxx                       14.2.0               h96c4ede_2    conda-forge
gxx_impl_linux-64         14.2.0               h2ead766_2    conda-forge
icu                       75.1                 he02047a_0    conda-forge
kernel-headers_linux-64   3.10.0              he073ed8_18    conda-forge
keyutils                  1.6.1                h166bdaf_0    conda-forge
krb5                      1.21.3               h659f571_0    conda-forge
ld_impl_linux-64          2.43                 h712a8e2_4    conda-forge
libblas                   3.8.0               17_openblas    conda-forge
libcap                    2.75                 h39aace5_0    conda-forge
libcblas                  3.8.0               17_openblas    conda-forge
libcurl                   8.12.1               h332b0f4_0    conda-forge
libedit                   3.1.20250104    pl5321h7949ede_0    conda-forge
libev                     4.33                 hd590300_2    conda-forge
libevent                  2.1.12               hf998b51_1    conda-forge
libexpat                  2.6.4                h5888daf_0    conda-forge
libfabric                 2.0.0                ha770c72_1    conda-forge
libfabric1                2.0.0                h14e6f36_1    conda-forge
libffi                    3.4.4                h6a678d5_1  
libgcc                    14.2.0               h767d61c_2    conda-forge
libgcc-devel_linux-64     14.2.0             h9c4974d_102    conda-forge
libgcc-ng                 14.2.0               h69a702a_2    conda-forge
libgcrypt-lib             1.11.0               hb9d3cd8_2    conda-forge
libgfortran               14.2.0               h69a702a_2    conda-forge
libgfortran-ng            14.2.0               h69a702a_2    conda-forge
libgfortran5              14.2.0               hf1ad2bd_2    conda-forge
libgomp                   14.2.0               h767d61c_2    conda-forge
libgpg-error              1.51                 hbd13f7d_1    conda-forge
libhiredis                1.0.2                h2cc385e_0    conda-forge
libhwloc                  2.11.2          default_h0d58e46_1001    conda-forge
libiconv                  1.18                 h4ce23a2_1    conda-forge
liblapack                 3.8.0               17_openblas    conda-forge
liblzma                   5.6.4                hb9d3cd8_0    conda-forge
libnghttp2                1.64.0               h161d5f1_0    conda-forge
libnl                     3.11.0               hb9d3cd8_0    conda-forge
libnsl                    2.0.1                hd590300_0    conda-forge
libopenblas               0.3.10          pthreads_h4812303_5    conda-forge
libpmix                   5.0.6                h658e747_0    conda-forge
libsanitizer              14.2.0               hed042b8_2    conda-forge
libsqlite                 3.49.1               hee588c1_2    conda-forge
libssh2                   1.11.1               hf672d98_0    conda-forge
libstdcxx                 14.2.0               h8f9b012_2    conda-forge
libstdcxx-devel_linux-64  14.2.0             h9c4974d_102    conda-forge
libstdcxx-ng              14.2.0               h4852527_2    conda-forge
libsystemd0               257.4                h4e0b6ca_1    conda-forge
libudev1                  257.4                hbe16f8c_1    conda-forge
libuuid                   2.38.1               h0b41bf4_0    conda-forge
libuv                     1.50.0               hb9d3cd8_0    conda-forge
libxcrypt                 4.4.36               hd590300_1    conda-forge
libxml2                   2.13.6               h8d12d68_0    conda-forge
libzlib                   1.3.1                hb9d3cd8_2    conda-forge
llvm-openmp               19.1.7               h024ca30_0    conda-forge
lz4-c                     1.10.0               h5888daf_1    conda-forge
mkl                       2025.0.1            h901ac74_21    conda-forge
mkl-devel                 2025.0.1            ha770c72_21    conda-forge
mkl-include               2025.0.1            hf2ce2f3_21    conda-forge
mkl-static                2025.0.1            ha770c72_21    conda-forge
mpi                       1.0                     openmpi    conda-forge
ncurses                   6.5                  h2d0b736_3    conda-forge
numpy                     1.22.4          py310h4ef5377_0    conda-forge
openmpi                   5.0.7              hb85ec53_100    conda-forge
openssl                   3.4.1                h7b32b05_0    conda-forge
pip                       25.0            py310h06a4308_0  
python                    3.10.16         he725a3c_1_cpython    conda-forge
python_abi                3.10                    5_cp310    conda-forge
rdma-core                 56.0                 h5888daf_0    conda-forge
readline                  8.2                  h5eee18b_0  
rhash                     1.4.5                hb9d3cd8_0    conda-forge
setuptools                75.8.0          py310h06a4308_0  
sysroot_linux-64          2.17                h0157908_18    conda-forge
tbb                       2022.0.0             hceb3a55_0    conda-forge
tk                        8.6.13          noxft_h4845f30_101    conda-forge
tzdata                    2025a                h78e105d_0    conda-forge
ucc                       1.3.0                had72a48_5    conda-forge
ucx                       1.18.0               hfd9a62f_2    conda-forge
wheel                     0.45.1          py310h06a4308_0  
zstd                      1.5.7                hb8e6e7a_1    conda-forge

If you are building/installing from a git clone, please copy-n-paste the output from git submodule status.

Please describe the system on which you are running

  • Operating system/version:
$ cat /etc/os-release
NAME="Rocky Linux"
VERSION="8.6 (Green Obsidian)"
ID="rocky"
ID_LIKE="rhel centos fedora"
VERSION_ID="8.6"
PLATFORM_ID="platform:el8"
PRETTY_NAME="Rocky Linux 8.6 (Green Obsidian)"
ANSI_COLOR="0;32"
CPE_NAME="cpe:/o:rocky:rocky:8:GA"
HOME_URL="https://rockylinux.org/"
BUG_REPORT_URL="https://bugs.rockylinux.org/"
ROCKY_SUPPORT_PRODUCT="Rocky Linux"
ROCKY_SUPPORT_PRODUCT_VERSION="8"
REDHAT_SUPPORT_PRODUCT="Rocky Linux"
REDHAT_SUPPORT_PRODUCT_VERSION="8"
  • Computer hardware:
$ lscpu
Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              32
On-line CPU(s) list: 0-31
Thread(s) per core:  1
Core(s) per socket:  16
Socket(s):           2
NUMA node(s):        2
Vendor ID:           GenuineIntel
CPU family:          6
Model:               207
Model name:          INTEL(R) XEON(R) GOLD 6526Y
Stepping:            2
CPU MHz:             2800.000
BogoMIPS:            5600.00
Virtualization:      VT-x
L1d cache:           48K
L1i cache:           32K
L2 cache:            2048K
L3 cache:            38400K
NUMA node0 CPU(s):   0-15
NUMA node1 CPU(s):   16-31
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hfi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities
  • Network type:

I run the program with mpirun -x UCX_NET_DEVICES=ib0 -np 2 --host host1,host2 --bind-to none


Details of the problem

Please describe, in detail, the problem that you are having, including the behavior you expect to see, the actual behavior that you are seeing, steps to reproduce the problem, etc. It is most helpful if you can attach a small program that a developer can use to reproduce your problem.

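Problem: the program deadlocks inside Open MPI when mimalloc is used, and the backtrace is pasted above. In that backtrace, an allocation made by libfabric's ofi_rbmap_insert() (called from ofi_mr_cache_search()) enters mimalloc, which calls madvise(); Open MPI's _intercept_madvise() hook then calls opal_mem_hooks_release_hook(), which calls back into libfabric's ofi_import_monitor_notify(), and that call blocks forever in pthread_mutex_lock(). This looks like a re-entrant acquisition of a lock already held by the same thread during the MR cache search, but I have not confirmed which lock it is.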

Note: If you include verbatim output (or a code block), please use a GitHub Markdown code block like below:

shell$ mpirun -n 2 ./hello_world

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions