Bug: mmq.cuh:4254: fatal error

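### What happened?

`llama-server` aborts with `mmq.cuh:4254: fatal error` (printed right after `mmq_x_best=0`) while decoding the first 50-token prompt batch. It was launched with the following script: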

```
export MALLOC_CONF="background_thread:true,percpu_arena:phycpu,metadata_thp:auto,dirty_decay_ms:10000,muzzy_decay_ms:60000"
export LD_PRELOAD=/usr/local/lib/libjemalloc.so

#    --seed 3407 \
#    -fmoe \

ulimit -n 9999
ulimit -l unlimited

CUDA_VISIBLE_DEVICES="0,1" \
/opt/ik_llama.cpp/ik_llama.cpp/build/bin/llama-server \
    --model /mnt/data/opt/THIREUS-R1-3.5652bpw/DeepSeek-R1-0528-THIREUS-BF16-SPECIAL_TENSOR-00001-of-01148.gguf \
    --alias THIREUS/DeepSeek-R1-0528-3.5652bpw \
    --ctx-size $((128 * 1024)) \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 512 \
    -b $((4 * 1024)) -ub $((2 * 1024)) \
    -fmoe \
    --split-mode layer \
    --tensor-split 7,8 \
    --main-gpu 1 \
    --override-tensor exps=CPU \
    --n-gpu-layers 99 \
    --threads $(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $4}' | xargs -I{} echo "{}-0" | bc) \
    --host 0.0.0.0 \
    --port 8080 \
    --lookup-cache-dynamic /mnt/data/ik_llama.kv.dump \
    --log-enable \
    --logdir /var/log/ \
    --jinja \
    --special \
    --verbose-prompt --verbosity 2
```

### Name and Version

/opt/ik_llama.cpp/ik_llama.cpp/build/bin/llama-server --version
version: 3860 (0cc32ff0)
built with cc (Debian 14.2.0-19) 14.2.0 for x86_64-linux-gnu

### What operating system are you seeing the problem on?

_No response_

### Relevant log output

```shell
VERB [            update_slots] prompt tokenized | tid="139832978124800" timestamp=1756227110 id_slot=0 id_task=0 n_ctx=131072 n_keep=0 n_prompt_tokens=50 prompt_tokens="<｜begin▁of▁sentence｜><｜begin▁of▁sentence｜><｜User｜>Imagine a runaway trolley is hurtling down a track towards five dead people. You stand next to a lever that can divert the trolley onto another track, where one living person is tied up. Do you pull the lever?<｜Assistant｜>"
INFO [            update_slots] kv cache rm [p0, end) | tid="139832978124800" timestamp=1756227110 id_slot=0 id_task=0 p0=0
VERB [            update_slots] prompt processing progress | tid="139832978124800" timestamp=1756227110 id_slot=0 n_past=50 n_ctx=131072 n_tokens=50 progress=1.0
VERB [            update_slots] prompt done | tid="139832978124800" timestamp=1756227110 id_slot=0 n_past=50 n_ctx=131072 n_tokens=50
VERB [            update_slots] decoding batch | tid="139832978124800" timestamp=1756227110 n_tokens=50
mmq_x_best=0
/opt/ik_llama.cpp/ik_llama.cpp/ggml/src/ggml-cuda/template-instances/../mmq.cuh:4254: fatal error
[New LWP 26792]
[New LWP 26791]
[New LWP 26790]
[New LWP 26789]
[New LWP 26788]
[New LWP 26787]
[New LWP 26786]
[New LWP 26785]
[New LWP 26784]
[New LWP 26783]
[New LWP 26782]
[New LWP 26781]
[New LWP 26780]
[New LWP 26779]
[New LWP 26778]
[New LWP 26777]
[New LWP 26776]
[New LWP 26775]
[New LWP 26774]
[New LWP 26773]
[New LWP 26772]
[New LWP 26771]
[New LWP 26770]
[New LWP 26769]
[New LWP 26768]
[New LWP 26767]
[New LWP 26766]
[New LWP 26765]
[New LWP 26764]
[New LWP 26763]
[New LWP 26762]
[New LWP 26761]
[New LWP 26760]
[New LWP 26759]
[New LWP 26758]
[New LWP 26757]
[New LWP 26756]
[New LWP 26755]
[New LWP 26754]
[New LWP 26753]
[New LWP 26752]
[New LWP 26751]
...

[New LWP 26610]
[New LWP 26609]
[New LWP 26608]
[New LWP 26607]
[New LWP 26606]
[New LWP 26605]
[New LWP 26604]
[New LWP 26603]
[New LWP 26602]
[New LWP 26601]
[New LWP 26600]
[New LWP 26599]
[New LWP 26598]
[New LWP 26597]
[New LWP 26596]
[New LWP 26585]
[New LWP 26584]
[New LWP 26583]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
__syscall_cancel_arch () at ../sysdeps/unix/sysv/linux/x86_64/syscall_cancel.S:56
warning: 56     ../sysdeps/unix/sysv/linux/x86_64/syscall_cancel.S: No such file or directory
#0  __syscall_cancel_arch () at ../sysdeps/unix/sysv/linux/x86_64/syscall_cancel.S:56
56      in ../sysdeps/unix/sysv/linux/x86_64/syscall_cancel.S
#1  0x00007f2d43699668 in __internal_syscall_cancel (a1=<optimized out>, a2=<optimized out>, a3=<optimized out>, a4=<optimized out>, a5=a5@entry=0, a6=a6@entry=0, nr=61) at ./nptl/cancellation.c:49
warning: 49     ./nptl/cancellation.c: No such file or directory
#2  0x00007f2d436996ad in __syscall_cancel (a1=<optimized out>, a2=<optimized out>, a3=<optimized out>, a4=<optimized out>, a5=a5@entry=0, a6=a6@entry=0, nr=61) at ./nptl/cancellation.c:75
75      in ./nptl/cancellation.c
#3  0x00007f2d43704787 in __GI___wait4 (pid=<optimized out>, stat_loc=<optimized out>, options=<optimized out>, usage=<optimized out>) at ../sysdeps/unix/sysv/linux/wait4.c:30
warning: 30     ../sysdeps/unix/sysv/linux/wait4.c: No such file or directory
#4  0x00007f2d43cf31a8 in ggml_abort () from /opt/ik_llama.cpp/ik_llama.cpp/build/ggml/src/libggml.so
#5  0x00007f2d44513036 in void mul_mat_q_case<(ggml_type)340>(ggml_backend_cuda_context&, mmq_args const&, CUstream_st*) () from /opt/ik_llama.cpp/ik_llama.cpp/build/ggml/src/libggml.so
#6  0x00007f2d43e5ac44 in ggml_cuda_op_mul_mat_q(ggml_backend_cuda_context&, ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, float const*, char const*, float*, long, long, long, long, CUstream_st*) () from /opt/ik_llama.cpp/ik_llama.cpp/build/ggml/src/libggml.so
#7  0x00007f2d43f16ef3 in ggml_cuda_op_mul_mat(ggml_backend_cuda_context&, ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_backend_cuda_context&, ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, float const*, char const*, float*, long, long, long, long, CUstream_st*), void (*)(float const*, void*, long, long, long, long, ggml_type, CUstream_st*)) [clone .constprop.1] () from /opt/ik_llama.cpp/ik_llama.cpp/build/ggml/src/libggml.so
#8  0x00007f2d43f1e6f0 in ggml_backend_cuda_graph_compute(ggml_backend*, ggml_cgraph*) () from /opt/ik_llama.cpp/ik_llama.cpp/build/ggml/src/libggml.so
#9  0x00007f2d43d492e4 in ggml_backend_sched_graph_compute_async () from /opt/ik_llama.cpp/ik_llama.cpp/build/ggml/src/libggml.so
#10 0x00007f2d670b06b6 in llama_decode () from /opt/ik_llama.cpp/ik_llama.cpp/build/src/libllama.so
#11 0x000055bca6865612 in server_context::update_slots() ()
#12 0x000055bca682d147 in server_queue::start_loop() ()
#13 0x000055bca67aa714 in main ()
[Inferior 1 (process 26582) detached]
/mnt/data/opt/THIREUS-R1-3.5652bpw/run-ik_llama.cpp.sh: line 55: 26582 Aborted                 CUDA_VISIBLE_DEVICES="0,1" /opt/ik_llama.cpp/ik_llama.cpp/build/bin/llama-server --model /mnt/data/opt/THIREUS-R1-3.5652bpw/DeepSeek-R1-0528-THIREUS-BF16-SPECIAL_TENSOR-00001-of-01148.gguf --alias THIREUS/DeepSeek-R1-0528-3.5652bpw --ctx-size $((128 * 1024)) --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 -ctk q8_0 -mla 3 -fa -amb 512 -b $((4 * 1024)) -ub $((2 * 1024)) -fmoe --split-mode layer --tensor-split 7,8 --main-gpu 1 --override-tensor exps=CPU --n-gpu-layers 99 --threads $(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $4}' | xargs -I{} echo "{}-0" | bc) --host 0.0.0.0 --port 8080 --lookup-cache-dynamic /mnt/data/ik_llama.kv.dump --log-enable --logdir /var/log/ --jinja --special --verbose-prompt --verbosity 2
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Bug: mmq.cuh:4254: fatal error #733

What happened?

Name and Version

What operating system are you seeing the problem on?

Relevant log output

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Bug: mmq.cuh:4254: fatal error #733

Description

What happened?

Name and Version

What operating system are you seeing the problem on?

Relevant log output

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions