Skip to content

Commit 6144f46

Browse files
committed
First release of llamacpp fork for gfx906 and head dimension 128 models
1 parent b0d5299 commit 6144f46

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

75 files changed

+7805
-4073
lines changed

README.md

Lines changed: 173 additions & 523 deletions
Large diffs are not rendered by default.

SCRIPT_compile_MI50.sh

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
#!/bin/bash
#
# SCRIPT MI50 Compilation Script for llama.cpp
# Optimized build for AMD MI50 (gfx906) with ROCm/HIP support
#
# This script compiles llama.cpp with maximum optimizations for the MI50 GPU
# including server support, flash attention, and all performance features
#

set -e  # Exit on any error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

echo -e "${BLUE}======================================${NC}"
echo -e "${BLUE} SCRIPT MI50 llama.cpp Builder ${NC}"
echo -e "${BLUE}======================================${NC}"

# Sanity check: must be run from the llama.cpp source root.
if [[ ! -f "CMakeLists.txt" ]]; then
    echo -e "${RED}Error: Not in llama.cpp root directory${NC}"
    echo "Please run this script from the llama.cpp root directory"
    exit 1
fi

# Verify ROCm installation
echo -e "${YELLOW}Checking ROCm installation...${NC}"
if ! command -v rocm_agent_enumerator &> /dev/null; then
    echo -e "${RED}Error: ROCm not found. Please install ROCm first.${NC}"
    exit 1
fi

# Check for gfx906 support; let the user continue anyway (e.g. building on
# a box without the MI50 installed).
GPUS=$(rocm_agent_enumerator)
if [[ ! "$GPUS" =~ "gfx906" ]]; then
    echo -e "${RED}Warning: gfx906 (MI50) not detected in system${NC}"
    echo "Available GPUs: $GPUS"
    read -p "Continue anyway? (y/N): " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        exit 1
    fi
fi

echo -e "${GREEN}✓ ROCm installation verified${NC}"
echo -e "${GREEN}✓ Available GPUs: $GPUS${NC}"

# Set ROCm environment variables for optimal gfx906 compilation
echo -e "${YELLOW}Setting ROCm environment variables for gfx906...${NC}"
export ROCM_PATH=${ROCM_PATH:-/opt/rocm}
export HCC_AMDGPU_TARGET=gfx906     # legacy variable, kept for older toolchains
export HSA_OVERRIDE_GFX_VERSION=9.0.6
export AMDGPU_TARGETS=gfx906
export GPU_TARGETS=gfx906

# Clean previous build
echo -e "${YELLOW}Cleaning previous build...${NC}"
rm -rf build
mkdir -p build

# Configure with maximum optimizations
echo -e "${YELLOW}Configuring CMake with MI50 optimizations...${NC}"
cd build

# NOTE(fix): with `set -e` active, a plain `cmake ...` followed by
# `if [[ $? -ne 0 ]]` is dead code — the script would already have exited on
# failure before the check ran. The failure branch is wired through `if !`
# instead, so the diagnostic actually prints.
if ! cmake .. \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_C_COMPILER=gcc \
    -DCMAKE_CXX_COMPILER=g++ \
    -DCMAKE_HIP_COMPILER_FORCED=1 \
    -DCMAKE_HIP_ARCHITECTURES=gfx906 \
    -DCMAKE_C_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -ffast-math -fno-finite-math-only -ffp-contract=fast" \
    -DCMAKE_CXX_FLAGS="-O3 -march=native -mtune=native -DNDEBUG -DGGML_HIP_GFX906_OPTIMIZED -ffast-math -fno-finite-math-only -ffp-contract=fast" \
    -DCMAKE_HIP_FLAGS=" --offload-arch=gfx906 -DGGML_HIP_GFX906_OPTIMIZED -Wno-ignored-attributes -Wno-cuda-compat -Wno-unused-result -mllvm -amdgpu-simplify-libcall -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -ffast-math -ffp-contract=fast" \
    -DGGML_HIP=ON \
    -DGGML_HIP_MMQ_MFMA=ON \
    -DGGML_HIP_GRAPHS=ON \
    -DGGML_HIP_NO_VMM=ON \
    -DGGML_HIP_EXPORT_METRICS=ON \
    -DGGML_HIP_GFX906_OPTIMIZED=ON \
    -DGGML_NATIVE=ON \
    -DGGML_CUDA_FA=ON \
    -DGGML_CUDA_FA_ALL_QUANTS=ON \
    -DGGML_CUDA_FORCE_MMQ=OFF \
    -DGGML_CUDA_FORCE_CUBLAS=OFF \
    -DGGML_CUDA_NO_PEER_COPY=ON \
    -DLLAMA_BUILD_SERVER=ON \
    -DLLAMA_BUILD_EXAMPLES=ON \
    -DLLAMA_BUILD_TOOLS=ON \
    -DLLAMA_BUILD_TESTS=OFF \
    -DLLAMA_CURL=ON \
    -DLLAMA_STATIC=OFF; then
    echo -e "${RED}✗ CMake configuration failed${NC}"
    exit 1
fi

echo -e "${GREEN}✓ CMake configuration successful${NC}"

# Compile with all CPU cores and dump detailed logs
NPROC=$(nproc)
LOG_FILE="compilation_log.txt"
echo -e "${YELLOW}Compiling with $NPROC cores...${NC}"
echo -e "${YELLOW}This may take several minutes...${NC}"
echo -e "${YELLOW}Detailed compilation log will be saved to: $LOG_FILE${NC}"

# Clear previous log
: > "$LOG_FILE"

# Run make with detailed output and save to log file.
# The pipeline's overall status is tee's (success), so `set -e` does not fire
# on a make failure; make's own status is in PIPESTATUS[0]. Capture it
# immediately — PIPESTATUS is clobbered by the next command executed.
make -j"$NPROC" 2>&1 | tee "$LOG_FILE"
MAKE_STATUS=${PIPESTATUS[0]}

if [[ $MAKE_STATUS -ne 0 ]]; then
    echo -e "${RED}✗ Compilation failed${NC}"
    echo -e "${RED}Check $LOG_FILE for detailed error information${NC}"
    exit 1
fi

echo -e "${GREEN}✓ Compilation successful!${NC}"

# Verify the build
echo -e "${YELLOW}Verifying build...${NC}"

# Check if main executables (and the HIP backend shared library) were built
EXECUTABLES=(
    "bin/llama-cli"
    "bin/llama-server"
    "bin/llama-bench"
    "bin/libggml-hip.so"
)

ALL_GOOD=true
for exec in "${EXECUTABLES[@]}"; do
    if [[ -f "$exec" ]]; then
        echo -e "${GREEN}$exec built successfully${NC}"

        # Check HIP linking for executables (not libraries)
        if [[ "$exec" =~ ^bin/llama- && ! "$exec" =~ \.so$ ]]; then
            if ldd "$exec" | grep -q "libggml-hip.so"; then
                echo -e "${GREEN}  ✓ HIP backend linked${NC}"
            else
                echo -e "${RED}  ✗ HIP backend not linked${NC}"
                ALL_GOOD=false
            fi
        fi
    else
        echo -e "${RED}$exec not found${NC}"
        ALL_GOOD=false
    fi
done

if [[ "$ALL_GOOD" = false ]]; then
    echo -e "${RED}✗ Build verification failed${NC}"
    exit 1
fi

# Display ROCm libraries linked (head exits 0, so this is safe under set -e)
echo -e "${YELLOW}ROCm libraries linked:${NC}"
ldd bin/llama-cli | grep -E "(hip|roc)" | head -5

# Quick functionality test
echo -e "${YELLOW}Testing HIP backend availability...${NC}"
if ./bin/llama-cli --help 2>/dev/null | grep -q "backend"; then
    echo -e "${GREEN}✓ llama-cli responding correctly${NC}"
else
    echo -e "${RED}✗ llama-cli test failed${NC}"
fi

# Success message
echo
echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN} ✓ BUILD COMPLETED SUCCESSFULLY ${NC}"
echo -e "${GREEN}======================================${NC}"
echo
echo -e "${BLUE}Built executables:${NC}"
echo " • CLI: ./build/bin/llama-cli"
echo " • Server: ./build/bin/llama-server"
echo " • Bench: ./build/bin/llama-bench"
echo
echo -e "${BLUE}Optimizations enabled:${NC}"
echo " • Target GPU: AMD MI50 (gfx906)"
echo " • HIP/ROCm backend with MFMA support"
echo " • Flash Attention kernels"
echo " • All quantization formats"
echo " • Performance metrics export"
echo " • Native CPU optimizations"
echo " • Optimization 5: GFX906 compiler flags (-ffast-math, early-inline, function-calls=false)"
echo
echo -e "${BLUE}Ready to run:${NC}"
echo " ./SCRIPT_launch_server_MI50.sh <model.gguf>"
echo
echo -e "${YELLOW}Note: Make sure to set proper ROCm environment variables before running!${NC}"
echo
echo -e "${BLUE}For debugging with maximum HIP logging:${NC}"
echo " export AMD_LOG_LEVEL=8"
echo " export AMD_LOG_MASK=0xFFFFFFFF"
echo " export AMD_SERIALIZE_KERNEL=3"
echo " ./SCRIPT_launch_server_MI50.sh <model.gguf> 2>&1 | tee hip_debug.log"

SCRIPT_launch_server_MI50.sh

2.25 KB
Binary file not shown.
2.26 KB
Binary file not shown.

SCRIPT_llama_bench.sh

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
#!/bin/bash
#
# Run llama-bench with AMD MI50 ROCm support and GFX906 optimizations
# Built for gfx906 architecture - matches SCRIPT_launch_server configuration
#

# Set ROCm environment variables for MI50 ONLY (optimal configuration)
export HSA_OVERRIDE_GFX_VERSION=9.0.6
export HIP_VISIBLE_DEVICES=0    # ONLY MI50 (Device 0)
export CUDA_VISIBLE_DEVICES=0   # Additional CUDA compatibility
export ROCR_VISIBLE_DEVICES=0   # ROCr runtime device selection
export GGML_BACKEND_HIP=1
export HCC_AMDGPU_TARGET=gfx906

# Path to your model file - update this to your actual model path
MODEL_PATH="/home/iacopo/Downloads/Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf"

# Default benchmark parameters (matching server configuration)
BENCH_PARAMS=(
    -m "$MODEL_PATH"
    -ngl 99          # Offload all layers to GPU
    -b 1024          # Batch size (matches server)
    -t $(nproc)      # Use all CPU threads
    -fa 0            # NOTE(fix): 0 DISABLES flash attention; set to 1 to enable
    #-ctk q8_0       # q8_0 quantized K cache (currently disabled)
    #-ctv q8_0       # q8_0 quantized V cache (currently disabled)
    --main-gpu 0     # Force MI50 as main GPU
    --progress       # Show progress indicators
)

# Benchmark configurations
QUICK_TEST="-p 512 -n 128"                                 # Quick test
STANDARD_TEST="-p 512,1024,2048,4096 -n 128,256"           # Standard comprehensive test
PROMPT_FOCUS="-p 512,1024,2048,4096,8192 -n 64"            # Focus on prompt processing
GENERATION_FOCUS="-p 512 -n 128,256,512,1024"              # Focus on text generation
EXTENSIVE_TEST="-p 512,1024,2048,4096,8192 -n 128,256,512" # Extensive testing

# Print usage/help text and the currently configured model path.
usage() {
    echo "Usage: $0 [test_type] [additional_llama-bench_args...]"
    echo ""
    echo "Test types:"
    echo "  quick      - Quick test (512 prompt, 128 generation)"
    echo "  standard   - Standard test (multiple prompt sizes, 2 gen sizes) [DEFAULT]"
    echo "  prompt     - Focus on prompt processing (up to 8K prompts)"
    echo "  generation - Focus on text generation (multiple lengths)"
    echo "  extensive  - Extensive testing (all combinations)"
    echo "  custom     - Use your own parameters (provide as additional args)"
    echo ""
    echo "Examples:"
    echo "  $0                        # Run standard benchmark"
    echo "  $0 quick                  # Run quick benchmark"
    echo "  $0 prompt                 # Test prompt processing"
    echo "  $0 custom -p 1024 -n 256  # Custom benchmark"
    echo ""
    echo "Model path: $MODEL_PATH"
    echo "Output format: markdown (default), add -o csv for CSV output"
}

# Check if model file exists
if [ ! -f "$MODEL_PATH" ]; then
    echo "Error: Model file not found at: $MODEL_PATH"
    echo "Please update MODEL_PATH in this script or ensure the model exists."
    exit 1
fi

# Parse command line arguments; remaining args are passed through to llama-bench.
TEST_TYPE="${1:-standard}"
# Guard the shift: with zero arguments a bare `shift` returns nonzero.
if [ $# -gt 0 ]; then
    shift
fi

case "$TEST_TYPE" in
    "help"|"-h"|"--help")
        usage
        exit 0
        ;;
    "quick")
        TEST_PARAMS="$QUICK_TEST"
        echo "=== Running Quick Benchmark ==="
        ;;
    "standard")
        TEST_PARAMS="$STANDARD_TEST"
        echo "=== Running Standard Benchmark ==="
        ;;
    "prompt")
        TEST_PARAMS="$PROMPT_FOCUS"
        echo "=== Running Prompt Processing Focused Benchmark ==="
        ;;
    "generation")
        TEST_PARAMS="$GENERATION_FOCUS"
        echo "=== Running Text Generation Focused Benchmark ==="
        ;;
    "extensive")
        TEST_PARAMS="$EXTENSIVE_TEST"
        echo "=== Running Extensive Benchmark (this will take a while) ==="
        ;;
    "custom")
        TEST_PARAMS=""
        echo "=== Running Custom Benchmark ==="
        echo "Custom parameters: $*"
        ;;
    *)
        echo "Unknown test type: $TEST_TYPE"
        usage
        exit 1
        ;;
esac

# Display system info.
# NOTE(fix): the status lines below previously claimed flash attention was
# ENABLED and the KV cache was q8_0, contradicting the actual parameters
# (-fa 0, and -ctk/-ctv commented out above). They now report the truth.
echo "Model: $(basename "$MODEL_PATH")"
echo "GPU: MI50 (gfx906) - Device 0 only"
echo "Flash Attention: DISABLED (-fa 0; set -fa 1 in BENCH_PARAMS to enable)"
echo "KV Cache: default f16 (q8_0 quantization commented out)"
echo ""

# Display GPU info
echo "=== ROCm GPU Information ==="
rocm-smi --showproductname --showtemp --showmeminfo --showuse --showpower
echo ""

# Change to script directory
cd "$(dirname "$0")"

# Check if llama-bench exists
if [ ! -f "./build/bin/llama-bench" ]; then
    echo "Error: llama-bench not found. Please compile the project first:"
    echo "  ./SCRIPT_compile_MI50.sh"
    exit 1
fi

# Run the benchmark
echo "=== Starting llama-bench (MI50 / gfx906) ==="
echo "Command: ./build/bin/llama-bench ${BENCH_PARAMS[*]} $TEST_PARAMS $*"
echo ""

# TEST_PARAMS is intentionally unquoted: it holds multiple space-separated flags
# that must undergo word splitting when handed to llama-bench.
./build/bin/llama-bench "${BENCH_PARAMS[@]}" $TEST_PARAMS "$@"

BENCH_EXIT_CODE=$?

echo ""
echo "=== Benchmark Complete ==="
if [ $BENCH_EXIT_CODE -eq 0 ]; then
    echo "✓ Benchmark completed successfully"
    echo ""
    echo "Tip: Add '-o csv' to get CSV output for analysis"
    echo "Tip: Add '-r 10' to run more repetitions for better accuracy"
else
    echo "✗ Benchmark failed with exit code: $BENCH_EXIT_CODE"
fi

exit $BENCH_EXIT_CODE

ggml/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,11 +129,10 @@ endif()
129129
option(GGML_LASX "ggml: enable lasx" ON)
130130
option(GGML_LSX "ggml: enable lsx" ON)
131131
option(GGML_RVV "ggml: enable rvv" ON)
132-
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
133-
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
134-
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
132+
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
135133
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
136134
option(GGML_VXE "ggml: enable vxe" ON)
135+
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
137136

138137
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
139138
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -177,6 +176,7 @@ option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAtten
177176
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
178177
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
179178
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
179+
option(GGML_HIP_GFX906_OPTIMIZED "ggml: enable GFX906-specific optimizations" OFF)
180180
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
181181
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
182182
option(GGML_VULKAN "ggml: use Vulkan" OFF)

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ extern "C" {
101101
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102102
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103103
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104+
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104105
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105106
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106107

@@ -134,7 +135,6 @@ extern "C" {
134135
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
135136

136137
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
137-
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
138138
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
139139
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
140140
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

0 commit comments

Comments (0)