Skip to content

Commit 6144f46

Browse files
committed
First release of llamacpp fork for gfx906 and head dimension 128 models
1 parent b0d5299 commit 6144f46

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

75 files changed

+7805
-4073
lines changed

README.md

Lines changed: 173 additions & 523 deletions
Large diffs are not rendered by default.

SCRIPT_compile_MI50.sh

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
#!/bin/bash
#
# SCRIPT MI50 Compilation Script for llama.cpp
# Optimized build for AMD MI50 (gfx906) with ROCm/HIP support
#
# This script compiles llama.cpp with maximum optimizations for the MI50 GPU
# including server support, flash attention, and all performance features
#

set -e  # Exit on any error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

echo -e "${BLUE}======================================${NC}"
echo -e "${BLUE} SCRIPT MI50 llama.cpp Builder ${NC}"
echo -e "${BLUE}======================================${NC}"

# Sanity check: must be run from the llama.cpp source root.
if [[ ! -f "CMakeLists.txt" ]]; then
    echo -e "${RED}Error: Not in llama.cpp root directory${NC}"
    echo "Please run this script from the llama.cpp root directory"
    exit 1
fi

# Verify ROCm installation
echo -e "${YELLOW}Checking ROCm installation...${NC}"
if ! command -v rocm_agent_enumerator &> /dev/null; then
    echo -e "${RED}Error: ROCm not found. Please install ROCm first.${NC}"
    exit 1
fi

# Check for gfx906 support; let the user continue anyway (e.g. building on
# a box without the MI50 installed).
GPUS=$(rocm_agent_enumerator)
if [[ ! "$GPUS" =~ "gfx906" ]]; then
    echo -e "${RED}Warning: gfx906 (MI50) not detected in system${NC}"
    echo "Available GPUs: $GPUS"
    read -p "Continue anyway? (y/N): " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        exit 1
    fi
fi

echo -e "${GREEN}✓ ROCm installation verified${NC}"
echo -e "${GREEN}✓ Available GPUs: $GPUS${NC}"

# Set ROCm environment variables for optimal gfx906 compilation
echo -e "${YELLOW}Setting ROCm environment variables for gfx906...${NC}"
export ROCM_PATH=${ROCM_PATH:-/opt/rocm}
export HCC_AMDGPU_TARGET=gfx906     # legacy variable, kept for older toolchains
export HSA_OVERRIDE_GFX_VERSION=9.0.6
export AMDGPU_TARGETS=gfx906
export GPU_TARGETS=gfx906

# Clean previous build
echo -e "${YELLOW}Cleaning previous build...${NC}"
rm -rf build
mkdir -p build

# Configure with maximum optimizations
echo -e "${YELLOW}Configuring CMake with MI50 optimizations...${NC}"
cd build

# NOTE(fix): with `set -e` active, a plain `cmake ...` followed by
# `if [[ $? -ne 0 ]]` is dead code — the script would already have exited on
# failure before the check ran. The failure branch is wired through `if !`
# instead, so the diagnostic actually prints.
if ! cmake .. \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_C_COMPILER=gcc \
    -DCMAKE_CXX_COMPILER=g++ \
    -DCMAKE_HIP_COMPILER_FORCED=1 \
    -DCMAKE_HIP_ARCHITECTURES=gfx906 \
    -DCMAKE_C_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -ffast-math -fno-finite-math-only -ffp-contract=fast" \
    -DCMAKE_CXX_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -DGGML_HIP_GFX906_OPTIMIZED -ffast-math -fno-finite-math-only -ffp-contract=fast" \
    -DCMAKE_HIP_FLAGS=" --offload-arch=gfx906 -DGGML_HIP_GFX906_OPTIMIZED -Wno-ignored-attributes -Wno-cuda-compat -Wno-unused-result -mllvm -amdgpu-simplify-libcall -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -ffast-math -ffp-contract=fast" \
    -DGGML_HIP=ON \
    -DGGML_HIP_MMQ_MFMA=ON \
    -DGGML_HIP_GRAPHS=ON \
    -DGGML_HIP_NO_VMM=ON \
    -DGGML_HIP_EXPORT_METRICS=ON \
    -DGGML_HIP_GFX906_OPTIMIZED=ON \
    -DGGML_NATIVE=ON \
    -DGGML_CUDA_FA=ON \
    -DGGML_CUDA_FA_ALL_QUANTS=ON \
    -DGGML_CUDA_FORCE_MMQ=OFF \
    -DGGML_CUDA_FORCE_CUBLAS=OFF \
    -DGGML_CUDA_NO_PEER_COPY=ON \
    -DLLAMA_BUILD_SERVER=ON \
    -DLLAMA_BUILD_EXAMPLES=ON \
    -DLLAMA_BUILD_TOOLS=ON \
    -DLLAMA_BUILD_TESTS=OFF \
    -DLLAMA_CURL=ON \
    -DLLAMA_STATIC=OFF; then
    echo -e "${RED}✗ CMake configuration failed${NC}"
    exit 1
fi

echo -e "${GREEN}✓ CMake configuration successful${NC}"

# Compile with all CPU cores and dump detailed logs
NPROC=$(nproc)
LOG_FILE="compilation_log.txt"
echo -e "${YELLOW}Compiling with $NPROC cores...${NC}"
echo -e "${YELLOW}This may take several minutes...${NC}"
echo -e "${YELLOW}Detailed compilation log will be saved to: $LOG_FILE${NC}"

# Clear previous log
: > "$LOG_FILE"

# Run make with detailed output and save to log file.
# The pipeline's overall status is tee's (success), so `set -e` does not fire
# on a make failure; make's own status is in PIPESTATUS[0]. Capture it
# immediately — PIPESTATUS is clobbered by the next command executed.
make -j"$NPROC" 2>&1 | tee "$LOG_FILE"
MAKE_STATUS=${PIPESTATUS[0]}

if [[ $MAKE_STATUS -ne 0 ]]; then
    echo -e "${RED}✗ Compilation failed${NC}"
    echo -e "${RED}Check $LOG_FILE for detailed error information${NC}"
    exit 1
fi

echo -e "${GREEN}✓ Compilation successful!${NC}"

# Verify the build
echo -e "${YELLOW}Verifying build...${NC}"

# Check if main executables (and the HIP backend shared library) were built
EXECUTABLES=(
    "bin/llama-cli"
    "bin/llama-server"
    "bin/llama-bench"
    "bin/libggml-hip.so"
)

ALL_GOOD=true
for exec in "${EXECUTABLES[@]}"; do
    if [[ -f "$exec" ]]; then
        echo -e "${GREEN}$exec built successfully${NC}"

        # Check HIP linking for executables (not libraries)
        if [[ "$exec" =~ ^bin/llama- && ! "$exec" =~ \.so$ ]]; then
            if ldd "$exec" | grep -q "libggml-hip.so"; then
                echo -e "${GREEN}  ✓ HIP backend linked${NC}"
            else
                echo -e "${RED}  ✗ HIP backend not linked${NC}"
                ALL_GOOD=false
            fi
        fi
    else
        echo -e "${RED}$exec not found${NC}"
        ALL_GOOD=false
    fi
done

if [[ "$ALL_GOOD" = false ]]; then
    echo -e "${RED}✗ Build verification failed${NC}"
    exit 1
fi

# Display ROCm libraries linked (head exits 0, so this is safe under set -e)
echo -e "${YELLOW}ROCm libraries linked:${NC}"
ldd bin/llama-cli | grep -E "(hip|roc)" | head -5

# Quick functionality test
echo -e "${YELLOW}Testing HIP backend availability...${NC}"
if ./bin/llama-cli --help 2>/dev/null | grep -q "backend"; then
    echo -e "${GREEN}✓ llama-cli responding correctly${NC}"
else
    echo -e "${RED}✗ llama-cli test failed${NC}"
fi

# Success message
echo
echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN} ✓ BUILD COMPLETED SUCCESSFULLY ${NC}"
echo -e "${GREEN}======================================${NC}"
echo
echo -e "${BLUE}Built executables:${NC}"
echo " • CLI: ./build/bin/llama-cli"
echo " • Server: ./build/bin/llama-server"
echo " • Bench: ./build/bin/llama-bench"
echo
echo -e "${BLUE}Optimizations enabled:${NC}"
echo " • Target GPU: AMD MI50 (gfx906)"
echo " • HIP/ROCm backend with MFMA support"
echo " • Flash Attention kernels"
echo " • All quantization formats"
echo " • Performance metrics export"
echo " • Native CPU optimizations"
echo " • Optimization 5: GFX906 compiler flags (-ffast-math, early-inline, function-calls=false)"
echo
echo -e "${BLUE}Ready to run:${NC}"
echo " ./SCRIPT_launch_server_MI50.sh <model.gguf>"
echo
echo -e "${YELLOW}Note: Make sure to set proper ROCm environment variables before running!${NC}"
echo
echo -e "${BLUE}For debugging with maximum HIP logging:${NC}"
echo " export AMD_LOG_LEVEL=8"
echo " export AMD_LOG_MASK=0xFFFFFFFF"
echo " export AMD_SERIALIZE_KERNEL=3"
echo " ./SCRIPT_launch_server_MI50.sh <model.gguf> 2>&1 | tee hip_debug.log"

SCRIPT_launch_server_MI50.sh

2.25 KB
Binary file not shown.
2.26 KB
Binary file not shown.

SCRIPT_llama_bench.sh

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
#!/bin/bash
#
# Run llama-bench with AMD MI50 ROCm support and GFX906 optimizations
# Built for gfx906 architecture - matches SCRIPT_launch_server configuration
#

# Set ROCm environment variables for MI50 ONLY (optimal configuration)
export HSA_OVERRIDE_GFX_VERSION=9.0.6
export HIP_VISIBLE_DEVICES=0    # ONLY MI50 (Device 0)
export CUDA_VISIBLE_DEVICES=0   # Additional CUDA compatibility
export ROCR_VISIBLE_DEVICES=0   # ROCr runtime device selection
export GGML_BACKEND_HIP=1
export HCC_AMDGPU_TARGET=gfx906

# Path to your model file - update this to your actual model path
MODEL_PATH="/home/iacopo/Downloads/Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf"

# Default benchmark parameters (matching server configuration)
BENCH_PARAMS=(
    -m "$MODEL_PATH"
    -ngl 99          # Offload all layers to GPU
    -b 1024          # Batch size (matches server)
    -t $(nproc)      # Use all CPU threads
    -fa 0            # NOTE(fix): 0 DISABLES flash attention; set to 1 to enable
    #-ctk q8_0       # q8_0 quantized K cache (currently disabled)
    #-ctv q8_0       # q8_0 quantized V cache (currently disabled)
    --main-gpu 0     # Force MI50 as main GPU
    --progress       # Show progress indicators
)

# Benchmark configurations
QUICK_TEST="-p 512 -n 128"                                 # Quick test
STANDARD_TEST="-p 512,1024,2048,4096 -n 128,256"           # Standard comprehensive test
PROMPT_FOCUS="-p 512,1024,2048,4096,8192 -n 64"            # Focus on prompt processing
GENERATION_FOCUS="-p 512 -n 128,256,512,1024"              # Focus on text generation
EXTENSIVE_TEST="-p 512,1024,2048,4096,8192 -n 128,256,512" # Extensive testing

# Print usage/help text and the currently configured model path.
usage() {
    echo "Usage: $0 [test_type] [additional_llama-bench_args...]"
    echo ""
    echo "Test types:"
    echo "  quick      - Quick test (512 prompt, 128 generation)"
    echo "  standard   - Standard test (multiple prompt sizes, 2 gen sizes) [DEFAULT]"
    echo "  prompt     - Focus on prompt processing (up to 8K prompts)"
    echo "  generation - Focus on text generation (multiple lengths)"
    echo "  extensive  - Extensive testing (all combinations)"
    echo "  custom     - Use your own parameters (provide as additional args)"
    echo ""
    echo "Examples:"
    echo "  $0                        # Run standard benchmark"
    echo "  $0 quick                  # Run quick benchmark"
    echo "  $0 prompt                 # Test prompt processing"
    echo "  $0 custom -p 1024 -n 256  # Custom benchmark"
    echo ""
    echo "Model path: $MODEL_PATH"
    echo "Output format: markdown (default), add -o csv for CSV output"
}

# Check if model file exists
if [ ! -f "$MODEL_PATH" ]; then
    echo "Error: Model file not found at: $MODEL_PATH"
    echo "Please update MODEL_PATH in this script or ensure the model exists."
    exit 1
fi

# Parse command line arguments; remaining args are passed through to llama-bench.
TEST_TYPE="${1:-standard}"
# Guard the shift: with zero arguments a bare `shift` returns nonzero.
if [ $# -gt 0 ]; then
    shift
fi

case "$TEST_TYPE" in
    "help"|"-h"|"--help")
        usage
        exit 0
        ;;
    "quick")
        TEST_PARAMS="$QUICK_TEST"
        echo "=== Running Quick Benchmark ==="
        ;;
    "standard")
        TEST_PARAMS="$STANDARD_TEST"
        echo "=== Running Standard Benchmark ==="
        ;;
    "prompt")
        TEST_PARAMS="$PROMPT_FOCUS"
        echo "=== Running Prompt Processing Focused Benchmark ==="
        ;;
    "generation")
        TEST_PARAMS="$GENERATION_FOCUS"
        echo "=== Running Text Generation Focused Benchmark ==="
        ;;
    "extensive")
        TEST_PARAMS="$EXTENSIVE_TEST"
        echo "=== Running Extensive Benchmark (this will take a while) ==="
        ;;
    "custom")
        TEST_PARAMS=""
        echo "=== Running Custom Benchmark ==="
        echo "Custom parameters: $*"
        ;;
    *)
        echo "Unknown test type: $TEST_TYPE"
        usage
        exit 1
        ;;
esac

# Display system info.
# NOTE(fix): the status lines below previously claimed flash attention was
# ENABLED and the KV cache was q8_0, contradicting the actual parameters
# (-fa 0, and -ctk/-ctv commented out above). They now report the truth.
echo "Model: $(basename "$MODEL_PATH")"
echo "GPU: MI50 (gfx906) - Device 0 only"
echo "Flash Attention: DISABLED (-fa 0; set -fa 1 in BENCH_PARAMS to enable)"
echo "KV Cache: default f16 (q8_0 quantization commented out)"
echo ""

# Display GPU info
echo "=== ROCm GPU Information ==="
rocm-smi --showproductname --showtemp --showmeminfo --showuse --showpower
echo ""

# Change to script directory
cd "$(dirname "$0")"

# Check if llama-bench exists
if [ ! -f "./build/bin/llama-bench" ]; then
    echo "Error: llama-bench not found. Please compile the project first:"
    echo "  ./SCRIPT_compile_MI50.sh"
    exit 1
fi

# Run the benchmark
echo "=== Starting llama-bench (MI50 / gfx906) ==="
echo "Command: ./build/bin/llama-bench ${BENCH_PARAMS[*]} $TEST_PARAMS $*"
echo ""

# TEST_PARAMS is intentionally unquoted: it holds multiple space-separated flags
# that must undergo word splitting when handed to llama-bench.
./build/bin/llama-bench "${BENCH_PARAMS[@]}" $TEST_PARAMS "$@"

BENCH_EXIT_CODE=$?

echo ""
echo "=== Benchmark Complete ==="
if [ $BENCH_EXIT_CODE -eq 0 ]; then
    echo "✓ Benchmark completed successfully"
    echo ""
    echo "Tip: Add '-o csv' to get CSV output for analysis"
    echo "Tip: Add '-r 10' to run more repetitions for better accuracy"
else
    echo "✗ Benchmark failed with exit code: $BENCH_EXIT_CODE"
fi

exit $BENCH_EXIT_CODE

ggml/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,11 +129,10 @@ endif()
129129
option(GGML_LASX "ggml: enable lasx" ON)
130130
option(GGML_LSX "ggml: enable lsx" ON)
131131
option(GGML_RVV "ggml: enable rvv" ON)
132-
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
133-
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
134-
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
132+
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
135133
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
136134
option(GGML_VXE "ggml: enable vxe" ON)
135+
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
137136

138137
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
139138
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -177,6 +176,7 @@ option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAtten
177176
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
178177
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
179178
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
179+
option(GGML_HIP_GFX906_OPTIMIZED "ggml: enable GFX906-specific optimizations" OFF)
180180
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
181181
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
182182
option(GGML_VULKAN "ggml: use Vulkan" OFF)

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ extern "C" {
101101
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102102
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103103
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104+
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104105
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105106
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106107

@@ -134,7 +135,6 @@ extern "C" {
134135
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
135136

136137
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
137-
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
138138
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
139139
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
140140
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

0 commit comments

Comments (0)