diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index aa2aa0312010e..9459f08c10c94 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -14,9 +14,9 @@ WORKDIR /app COPY . . RUN if [ "$TARGETARCH" = "amd64" ]; then \ - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \ + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \ elif [ "$TARGETARCH" = "arm64" ]; then \ - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \ + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \ else \ echo "Unsupported architecture"; \ exit 1; \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 8ae57d2e289f4..94f143397233f 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -21,7 +21,7 @@ COPY . . RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 091e1dc5d8b2c..c8839fe027c5a 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ echo "Building with dynamic libs" && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index 0eb1af87cb87b..ef43d78cd2a85 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH RUN echo "Building with static libs" && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \ cmake --build build --config Release --target llama-cli # TODO: use image with NNRT diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 261a2823a0e52..e0f1ad9728b09 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -35,7 +35,7 @@ COPY . . RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index a1b34723a46af..1c00f1b9c2cd3 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -40,7 +40,7 @@ WORKDIR /app COPY . . RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \ + cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \ && cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib \ diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index f8f3072e95768..fcd81ffa1e94e 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -16,7 +16,7 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \ +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index e8639913ea3a6..d104b8b12432e 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -4,18 +4,25 @@ on: workflow_call: jobs: - ubuntu-latest-riscv64-cpu-cross: - runs-on: ubuntu-latest + ubuntu-24-riscv64-cpu-cross: + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - name: Setup Riscv run: | sudo dpkg --add-architecture riscv64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update + + # Add arch-specific repositories for non-amd64 architectures + cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe + EOF + + sudo apt-get update || true ;# Prevent failure due to missing URLs. + sudo apt-get install -y --no-install-recommends \ build-essential \ gcc-14-riscv64-linux-gnu \ @@ -40,21 +47,25 @@ jobs: cmake --build build --config Release -j $(nproc) - ubuntu-latest-riscv64-vulkan-cross: - runs-on: ubuntu-latest + ubuntu-24-riscv64-vulkan-cross: + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Setup Riscv run: | sudo dpkg --add-architecture riscv64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update + + # Add arch-specific repositories for non-amd64 architectures + cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe + deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe + EOF + + sudo apt-get update || true ;# Prevent failure due to missing URLs. + sudo apt-get install -y --no-install-recommends \ build-essential \ glslc \ @@ -82,21 +93,25 @@ jobs: cmake --build build --config Release -j $(nproc) - ubuntu-latest-arm64-vulkan-cross: - runs-on: ubuntu-latest + ubuntu-24-arm64-vulkan-cross: + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Setup Arm64 run: | sudo dpkg --add-architecture arm64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update + + # Add arch-specific repositories for non-amd64 architectures + cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list + deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe + deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe + deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe + deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe + EOF + + sudo apt-get update || true ;# Prevent failure due to missing URLs. + sudo apt-get install -y --no-install-recommends \ build-essential \ glslc \ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 32c8b7717fec6..34417985d2399 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -601,9 +601,8 @@ jobs: -DGGML_SYCL_F16=ON cmake --build build --config Release -j $(nproc) -# Disabled for now due to sporadic issue syncing. -# build-linux-cross: -# uses: ./.github/workflows/build-linux-cross.yml + build-linux-cross: + uses: ./.github/workflows/build-linux-cross.yml macOS-latest-cmake-ios: runs-on: macos-latest diff --git a/cmake/build-info.cmake b/cmake/build-info.cmake index c1a456e179872..75c78222f2e7f 100644 --- a/cmake/build-info.cmake +++ b/cmake/build-info.cmake @@ -41,14 +41,20 @@ endif() if(MSVC) set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") - set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME}) + if (CMAKE_VS_PLATFORM_NAME) + set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME}) + else() + set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}") + endif() else() execute_process( - COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER} + COMMAND ${CMAKE_C_COMPILER} --version OUTPUT_VARIABLE OUT OUTPUT_STRIP_TRAILING_WHITESPACE ) + string(REGEX REPLACE " *\n.*" "" OUT "${OUT}") set(BUILD_COMPILER ${OUT}) + execute_process( COMMAND ${CMAKE_C_COMPILER} -dumpmachine OUTPUT_VARIABLE OUT diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 43533fc86abe2..f15e12a96b919 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -39,7 +39,9 @@ add_custom_command( COMMENT "Generating build details from Git" COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake" + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR} + -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.." DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} VERBATIM diff --git a/common/arg.cpp b/common/arg.cpp index 75e8e0bd51aee..e35417de7eb98 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -43,6 +43,25 @@ std::initializer_list mmproj_examples = { // TODO: add LLAMA_EXAMPLE_SERVER when it's ready }; +static std::string read_file(const std::string & fname) { + std::ifstream file(fname); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str())); + } + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + file.close(); + return content; +} + +static void write_file(const std::string & fname, const std::string & content) { + std::ofstream file(fname); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str())); + } + file << content; + file.close(); +} + common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; @@ -198,11 +217,11 @@ struct curl_slist_ptr { #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 -static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) { int remaining_attempts = max_attempts; while (remaining_attempts > 0) { - LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); CURLcode res = curl_easy_perform(curl); if (res == CURLE_OK) { @@ -213,6 +232,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); remaining_attempts--; + if (remaining_attempts == 0) break; std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); } @@ -231,8 +251,6 @@ static bool common_download_file_single(const std::string & url, const std::stri return false; } - bool force_download = false; - // Set the URL, allow to follow http redirection curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); @@ -256,7 +274,7 @@ static bool common_download_file_single(const std::string & url, const std::stri // If the file exists, check its JSON metadata companion file. std::string metadata_path = path + ".json"; - nlohmann::json metadata; + nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead std::string etag; std::string last_modified; @@ -266,14 +284,7 @@ static bool common_download_file_single(const std::string & url, const std::stri if (metadata_in.good()) { try { metadata_in >> metadata; - LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); - if (metadata.contains("url") && metadata.at("url").is_string()) { - auto previous_url = metadata.at("url").get(); - if (previous_url != url) { - LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); - return false; - } - } + LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); if (metadata.contains("etag") && metadata.at("etag").is_string()) { etag = metadata.at("etag"); } @@ -281,10 +292,10 @@ static bool common_download_file_single(const std::string & url, const std::stri last_modified = metadata.at("lastModified"); } } catch (const nlohmann::json::exception & e) { - LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); - return false; + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); } } + // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again) } else { LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); } @@ -296,7 +307,10 @@ static bool common_download_file_single(const std::string & url, const std::stri }; common_load_model_from_url_headers headers; + bool head_request_ok = false; + bool should_download = !file_exists; // by default, we should download if the file does not exist + // get ETag to see if the remote file has changed { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { @@ -325,23 +339,28 @@ static bool common_download_file_single(const std::string & url, const std::stri curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + // we only allow retrying once for HEAD requests + // this is for the use case of using running offline (no internet), retrying can be annoying + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD"); if (!was_perform_successful) { - return false; + head_request_ok = false; } long http_code = 0; curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code != 200) { - // HEAD not supported, we don't know if the file has changed - // force trigger downloading - force_download = true; - LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + if (http_code == 200) { + head_request_ok = true; + } else { + LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + head_request_ok = false; } } - bool should_download = !file_exists || force_download; - if (!should_download) { + // if head_request_ok is false, we don't have the etag or last-modified headers + // we leave should_download as-is, which is true if the file does not exist + if (head_request_ok) { + // check if ETag or Last-Modified headers are different + // if it is, we need to download the file again if (!etag.empty() && etag != headers.etag) { LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); should_download = true; @@ -350,6 +369,7 @@ static bool common_download_file_single(const std::string & url, const std::stri should_download = true; } } + if (should_download) { std::string path_temporary = path + ".downloadInProgress"; if (file_exists) { @@ -403,7 +423,7 @@ static bool common_download_file_single(const std::string & url, const std::stri // start the download LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET"); if (!was_perform_successful) { return false; } @@ -424,13 +444,15 @@ static bool common_download_file_single(const std::string & url, const std::stri {"etag", headers.etag}, {"lastModified", headers.last_modified} }); - std::ofstream(metadata_path) << metadata.dump(4); - LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + write_file(metadata_path, metadata.dump(4)); + LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); if (rename(path_temporary.c_str(), path.c_str()) != 0) { LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); return false; } + } else { + LOG_INF("%s: using cached file: %s\n", __func__, path.c_str()); } return true; @@ -605,16 +627,37 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_ // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response // User-Agent header is already set in common_remote_get_content, no need to set it here + // we use "=" to avoid clashing with other component, while still being allowed on windows + std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json"; + string_replace_all(cached_response_fname, "/", "_"); + std::string cached_response_path = fs_get_cache_file(cached_response_fname); + // make the request common_remote_params params; params.headers = headers; - auto res = common_remote_get_content(url, params); - long res_code = res.first; - std::string res_str(res.second.data(), res.second.size()); + long res_code = 0; + std::string res_str; + bool use_cache = false; + try { + auto res = common_remote_get_content(url, params); + res_code = res.first; + res_str = std::string(res.second.data(), res.second.size()); + } catch (const std::exception & e) { + LOG_WRN("error: failed to get manifest: %s\n", e.what()); + LOG_WRN("try reading from cache\n"); + // try to read from cache + try { + res_str = read_file(cached_response_path); + res_code = 200; + use_cache = true; + } catch (const std::exception & e) { + throw std::runtime_error("error: failed to get manifest (check your internet connection)"); + } + } std::string ggufFile; std::string mmprojFile; - if (res_code == 200) { + if (res_code == 200 || res_code == 304) { // extract ggufFile.rfilename in json, using regex { std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\""); @@ -631,6 +674,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_ mmprojFile = match[1].str(); } } + if (!use_cache) { + // if not using cached response, update the cache file + write_file(cached_response_path, res_str); + } } else if (res_code == 401) { throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); } else { @@ -1142,6 +1189,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e fprintf(stderr, "%s\n", ex.what()); ctx_arg.params = params_org; return false; + } catch (std::exception & ex) { + fprintf(stderr, "%s\n", ex.what()); + exit(1); // for other exceptions, we exit with status code 1 } return true; @@ -1442,13 +1492,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } + params.prompt = read_file(value); // store the external file name in params params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); if (!params.prompt.empty() && params.prompt.back() == '\n') { params.prompt.pop_back(); } @@ -1458,11 +1504,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-sysf", "--system-prompt-file"}, "FNAME", "a file containing the system prompt (default: none)", [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.system_prompt)); + params.system_prompt = read_file(value); if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') { params.system_prompt.pop_back(); } @@ -1886,23 +1928,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", + [](common_params & params, const std::string & value) { + params.sampling.grammar = read_file(value); + } + ).set_sparam()); + add_opt(common_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [](common_params & params, const std::string & value) { + params.sampling.grammar = json_schema_to_grammar(json::parse(value)); + } + ).set_sparam()); + add_opt(common_arg( + {"-jf", "--json-schema-file"}, "FILE", + "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } + std::string schema; std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.sampling.grammar) + std::back_inserter(schema) ); - } - ).set_sparam()); - add_opt(common_arg( - {"-j", "--json-schema"}, "SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](common_params & params, const std::string & value) { - params.sampling.grammar = json_schema_to_grammar(json::parse(value)); + params.sampling.grammar = json_schema_to_grammar(json::parse(schema)); } ).set_sparam()); add_opt(common_arg( @@ -2815,14 +2866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() ), [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(params.chat_template)); + params.chat_template = read_file(value); } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); add_opt(common_arg( diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b9cea7e4699c6..04ca646b503ca 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -16,6 +16,7 @@ from hashlib import sha256 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain +from transformers import AutoConfig import math import numpy as np @@ -66,8 +67,6 @@ class ModelBase: part_names: list[str] is_safetensors: bool hparams: dict[str, Any] - block_count: int - tensor_map: gguf.TensorNameMap tensor_names: set[str] | None gguf_writer: gguf.GGUFWriter model_name: str | None @@ -78,6 +77,10 @@ class ModelBase: # subclasses should define this! model_arch: gguf.MODEL_ARCH + # subclasses should initialize this! + block_count: int + tensor_map: gguf.TensorNameMap + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, @@ -113,8 +116,6 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: if not self.is_safetensors: self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name @@ -417,15 +418,13 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] @staticmethod def load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - architectures = hparams.get("architectures") - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - if architectures is not None: - # preserve "architectures" from root level config - hparams["architectures"] = architectures - return hparams + try: + return AutoConfig.from_pretrained(dir_model).to_dict() + except Exception as e: + logger.warning(f"Failed to load model config from {dir_model}: {e}") + logger.warning("Trying to load config.json instead") + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: @@ -454,6 +453,23 @@ def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type class TextModel(ModelBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "text_config" in self.hparams: + # move the text_config to the root level + self.hparams = {**self.hparams, **self.hparams["text_config"]} + + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + def set_vocab(self): self._set_vocab_gpt2() @@ -1070,9 +1086,9 @@ def __init__(self, *args, **kwargs): if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION") - # small hack to correct the number of layers - self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128) - self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"]) + # get n_embd of the text model + text_config = {**self.hparams, **self.hparams["text_config"]} + self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) assert self.n_embd_text > 0, "n_embd not found in hparams" if "vision_config" not in self.hparams: @@ -1081,6 +1097,9 @@ def __init__(self, *args, **kwargs): self.global_config = self.hparams self.hparams = self.hparams["vision_config"] + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]) + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count) + # load preprocessor config with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: self.preprocessor_config = json.load(f) @@ -1098,12 +1117,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"])) self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"])) self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_vision_block_count(self.block_count) self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"])) # preprocessor config self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) - self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) def write_vocab(self): raise ValueError("VisionModel does not support vocab writing") @@ -1719,23 +1738,12 @@ def prepare_tensors(self): "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", - "Idefics3ForConditionalGeneration", - "SmolVLMForConditionalGeneration", + "VLlama3ForCausalLM", "LlavaForConditionalGeneration") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # fix for SmolVLM2, missing `num_attention_heads` in config.json - if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration": - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) - # fix for Pixtral, missing `num_attention_heads` in config.json - if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \ - and self.hparams.get("model_type") == "mistral": - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) - def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -1891,31 +1899,50 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("LlavaForConditionalGeneration") +@ModelBase.register( + "LlavaForConditionalGeneration", # pixtral + "Mistral3ForConditionalGeneration", # mistral small 3.1 +) class LlavaVisionModel(VisionModel): img_break_tok_id = -1 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.hparams["model_type"] == "pixtral": - # fix missing config.json values - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) - self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24) - self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096) - self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024) + # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) - self.img_break_tok_id = 12 # see tokenizer_config.json + self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") + logger.info(f"Image break token id: {self.img_break_tok_id}") else: raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + def get_token_id(self, token: str) -> int: + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + added_tokens_decoder = json.load(f)['added_tokens_decoder'] + for id_, token_data in added_tokens_decoder.items(): + if token_data["content"] == token: + return int(id_) + raise ValueError(f"Token '{token}' not found in tokenizer config.") + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams if hparams["model_type"] == "pixtral": self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL) - # default values below are taken from HF tranformers code self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) - self.gguf_writer.add_vision_use_silu(True) + + # hidden_act + if hparams["hidden_act"] == "silu": + self.gguf_writer.add_vision_use_silu(True) + elif hparams["hidden_act"] == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + else: + raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") + + # spatial_merge_size + if "spatial_merge_size" in self.global_config: + self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -1944,13 +1971,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter class SmolVLMModel(VisionModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # fix for SmolVLM2, missing some keys in config.json - # default values are taken from transformers code if self.hparams["model_type"] == "smolvlm_vision": + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) - self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -3505,6 +3531,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("NomicBertModel") class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): hparams = kwargs.pop("hparams", None) if hparams is None: @@ -5849,6 +5877,19 @@ def split_str_to_n_bytes(split_str: str) -> int: return n +def get_model_architecture(dir_model: Path, model_type: ModelType, hparams: Any = None) -> str: + hparams = ModelBase.load_hparams(dir_model) if hparams is None else hparams + text_config = hparams.get("text_config", {}) + vision_config = hparams.get("vision_config", {}) + arch = hparams["architectures"][0] + # if "architectures" is found in the sub-config, use that instead + if model_type == ModelType.TEXT and text_config.get("architectures") is not None: + arch = text_config["architectures"][0] + elif model_type == ModelType.VISION and vision_config.get("architectures") is not None: + arch = vision_config["architectures"][0] + return arch + + def main() -> None: args = parse_args() @@ -5901,16 +5942,15 @@ def main() -> None: logger.info(f"Loading model: {dir_model.name}") - hparams = ModelBase.load_hparams(dir_model) - if args.mmproj: if "mmproj" not in fname_out.name: fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") with torch.inference_mode(): output_type = ftype_map[args.outtype] - model_architecture = hparams["architectures"][0] model_type = ModelType.VISION if args.mmproj else ModelType.TEXT + model_architecture = get_model_architecture(dir_model, model_type) + logger.info(f"Model architecture: {model_architecture}") try: model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) except NotImplementedError: diff --git a/examples/llava/README.md b/examples/llava/README.md index f58d9de7107e8..3b62627ce829f 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -34,6 +34,9 @@ llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF # Pixtral 12B llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF + +# Mistral Small 3.1 24B (IQ2_M quantization) +llama-mtmd-cli -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF --chat-template mistral-v7 ``` ## How it works and what is `mmproj`? @@ -73,3 +76,4 @@ For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` fla - SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) - SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) - [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint +- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 04bfcbb5e575f..b575ca4d7c2a9 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -2,8 +2,6 @@ #include "gguf.h" #include "clip.h" -#include "clip.h" - #include #include #include @@ -33,6 +31,7 @@ #define KEY_FEATURE_LAYER "clip.vision.feature_layer" #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" #define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl #define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl @@ -70,9 +69,11 @@ #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MM_INP_NORM "mm.input_norm.weight" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1 #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral // mimicpmv diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ad3e7df1d8a3a..7607d4e3ae3a4 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -172,6 +172,7 @@ struct clip_hparams { std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; + int32_t spatial_merge_size = 0; }; struct clip_layer { @@ -232,6 +233,7 @@ struct clip_vision_model { struct ggml_tensor * projection; // LLaVA projection + struct ggml_tensor * mm_input_norm_w = nullptr; struct ggml_tensor * mm_0_w = nullptr; struct ggml_tensor * mm_0_b = nullptr; struct ggml_tensor * mm_2_w = nullptr; @@ -311,6 +313,7 @@ struct clip_vision_model { // pixtral struct ggml_tensor * token_embd_img_break = nullptr; + struct ggml_tensor * mm_patch_merger_w = nullptr; }; struct clip_ctx { @@ -637,6 +640,7 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i const int d_head = hidden_size / n_head; const int n_layer = hparams.n_layer; const float eps = hparams.eps; + const int n_merge = hparams.spatial_merge_size; struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), @@ -721,7 +725,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i { ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur); ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); - gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu + if (ctx->use_silu) { + gate_proj = ggml_silu(ctx0, gate_proj); + } else if (ctx->use_gelu) { + gate_proj = ggml_gelu(ctx0, gate_proj); + } else { + GGML_ABORT("Pixtral: Unsupported activation"); + } cur = ggml_mul(ctx0, up_proj, gate_proj); cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); } @@ -732,14 +742,42 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i embeddings = cur; } - // LlavaMultiModalProjector (with GELU activation) + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.spatial_merge_size > 0); + + ggml_tensor * cur = embeddings; + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, hidden_size, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, hidden_size] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to hidden_size + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + embeddings = cur; + } + + // LlavaMultiModalProjector (always using GELU activation) { embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + if (model.mm_1_b) { + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + } embeddings = ggml_gelu(ctx0, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + if (model.mm_2_b) { + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } } // arrangement of the [IMG_BREAK] token @@ -749,11 +787,14 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows] + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; + const int p_total = p_x * p_y; const int n_embd_text = embeddings->ne[0]; - const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row + const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row - ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y); - ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y); + ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, p_x, p_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, p_y); tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor tok = ggml_add(ctx0, tok, model.token_embd_img_break); cur = ggml_concat(ctx0, cur, tok, 1); @@ -1734,6 +1775,7 @@ struct clip_model_loader { case PROJECTOR_TYPE_PIXTRAL: { hparams.rope_theta = 10000.0f; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); } break; case PROJECTOR_TYPE_QWEN25VL: { @@ -1957,11 +1999,14 @@ struct clip_model_loader { case PROJECTOR_TYPE_PIXTRAL: { vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); - vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); // [IMG_BREAK] token embedding vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + // for mistral small 3.1 + vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); } break; default: GGML_ASSERT(false && "unknown projector type"); @@ -2516,7 +2561,7 @@ struct llava_uhd { // no pinpoints, dynamically calculate the grid size (e.g. minicpmv) - auto best_size = get_best_resize(original_size, slice_size, patch_size, has_slices); + auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); res.overview_size = best_size; if (!has_slices) { @@ -2926,8 +2971,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { n_patches /= ctx->vision_model.hparams.proj_scale_factor; } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { - int n_patches_x = img->nx / params.patch_size; - int n_patches_y = img->ny / params.patch_size; + int n_merge = ctx->vision_model.hparams.spatial_merge_size; + int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1); n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } @@ -3484,7 +3530,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->vision_model.mm_model_peg_0_b->ne[0]; case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_PIXTRAL: - return ctx->vision_model.mm_2_b->ne[0]; + return ctx->vision_model.mm_2_w->ne[1]; case PROJECTOR_TYPE_MLP_NORM: return ctx->vision_model.mm_3_b->ne[0]; case PROJECTOR_TYPE_MINICPMV: diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 4d857ca64e0b4..aa52d92cab871 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -94,6 +94,7 @@ struct mtmd_cli_context { LOG_ERR("Model does not have chat template.\n"); LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n"); LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n"); + LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n"); exit(1); } diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index 75604315cfeba..4af370064086f 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -59,6 +59,7 @@ add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" # to test the big models, run: ./tests.sh big add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M" +add_test_big "llama-mtmd-cli" "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7" # these models always give the wrong answer, not sure why # add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M" diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 0277e25cb5ec2..b663a81191c44 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -304,8 +304,9 @@ int main(int argc, char * argv[]) { get_backend_memory(&free_mem, &total_mem); } const char * cache_dir = nullptr; - std::string cache_dir_str = fs_get_cache_directory() + "rpc/"; + std::string cache_dir_str; if (params.use_cache) { + cache_dir_str = fs_get_cache_directory() + "rpc/"; if (!fs_create_directory_with_parents(cache_dir_str)) { fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str()); return 1; diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 61fe15a15f074..b463cbd9b3c39 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -360,3 +360,27 @@ write_basic_package_version_file( install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml) + +if (MSVC) + set(MSVC_WARNING_FLAGS + /wd4005 # Macro redefinition + /wd4244 # Conversion from one type to another type, possible loss of data + /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data + ) + function(disable_msvc_warnings target_name) + if(TARGET ${target_name}) + target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS}) + endif() + endfunction() + + disable_msvc_warnings(ggml-base) + disable_msvc_warnings(ggml) + disable_msvc_warnings(ggml-cpu) + disable_msvc_warnings(ggml-cpu-x64) + disable_msvc_warnings(ggml-cpu-sse42) + disable_msvc_warnings(ggml-cpu-sandybridge) + disable_msvc_warnings(ggml-cpu-haswell) + disable_msvc_warnings(ggml-cpu-skylakex) + disable_msvc_warnings(ggml-cpu-icelake) + disable_msvc_warnings(ggml-cpu-alderlake) +endif() diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h index a12342c25debe..48aa79682b65d 100644 --- a/ggml/include/ggml-cpp.h +++ b/ggml/include/ggml-cpp.h @@ -24,7 +24,7 @@ typedef std::unique_ptr gguf_context_ptr; struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; -typedef std::unique_ptr ggml_gallocr_ptr; +typedef std::unique_ptr ggml_gallocr_ptr; // ggml-backend diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index a3d3f690133b0..5fd379f6a9461 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { size_t node_size = 0; if (!node->data && !node->view_src) { - GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API + // If we previously had data but don't now then reallocate + if (talloc->buffer_id < 0) { + return false; + } node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); } return talloc->size_max >= node_size; diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 6a652738c10a9..9a3085befc476 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -352,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") message(STATUS "z15 target") - list(APPEND ARCH_FLAGS -march=z15 -mtune=z15) + list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") message(STATUS "z16 target") - list(APPEND ARCH_FLAGS -march=z16 -mtune=z16) + list(APPEND ARCH_FLAGS -march=z16) + elseif (${S390X_M} MATCHES "9175|9176") + # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version. + message(STATUS "z17 target") + list(APPEND ARCH_FLAGS -march=z17) else() message(STATUS "Unknown target") message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.") diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 04d10cec266cc..45c31cf1faffe 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { #define GGML_F32_EPR 4 #define GGML_F32x4 vector float -#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_ZERO {0.0f} #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 8623214c78a72..f3cfdeaef513a 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -133,6 +133,7 @@ if (CUDAToolkit_FOUND) COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" OUTPUT_VARIABLE CUDA_CCVER ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE ) else() if (CUDA_CCFULLVER MATCHES Apple) @@ -143,7 +144,7 @@ if (CUDAToolkit_FOUND) string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) endif() - message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER}) list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 2d46176eab344..d027271fcd932 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -592,6 +592,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d; graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index; } +#else + GGML_UNUSED(disable_indirection_for_this_node); #endif if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 4cef53a98cfd6..ea8bf69160996 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -33,8 +33,8 @@ static __global__ void k_get_rows( dfloat2 v; dequantize_kernel(src0_row, ib, iqs, v); - dst_row[iybs + iqs + 0] = v.x; - dst_row[iybs + iqs + y_offset] = v.y; + dst_row[iybs + iqs + 0] = float(v.x); + dst_row[iybs + iqs + y_offset] = float(v.y); } template @@ -60,7 +60,7 @@ static __global__ void k_get_rows_float( dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); - dst_row[i00] = src0_row[i00]; + dst_row[i00] = float(src0_row[i00]); } template @@ -86,122 +86,161 @@ static __global__ void k_get_rows_back_float( dst[dst_row*ncols + col] = sum; } -template -static void get_rows_cuda( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - +template +static void get_rows_cuda_q( + const void * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); const dim3 block_nums(block_num_x, ne10, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); GGML_ASSERT(ne00 % 2 == 0); k_get_rows<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); } -template +template static void get_rows_cuda_float( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(ne13 == 1); - + const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; const dim3 block_nums(block_num_x, ne10, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); k_get_rows_float<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); } -void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - - const void * src0_d = (const void *) src0->data; - const int32_t * src1_d = (const int32_t *) src1->data; - float * dst_d = (float *) dst->data; - - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); - GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - - switch (src0->type) { +template +static void ggml_cuda_get_rows_switch_src0_type( + const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + switch (src0_type) { case GGML_TYPE_F16: - get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream); + get_rows_cuda_float((const half *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; case GGML_TYPE_F32: - get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream); + get_rows_cuda_float((const float *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; case GGML_TYPE_Q4_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; case GGML_TYPE_Q4_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; case GGML_TYPE_Q5_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; case GGML_TYPE_Q5_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; case GGML_TYPE_Q8_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); break; default: // TODO: k-quants - GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); + GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type)); break; } } +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream) { + switch (dst_type) { + case GGML_TYPE_F32: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_F16: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + default: + GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type)); + break; + } +} + +void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + cudaStream_t stream = ctx.stream(); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ne13 == 1); + + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + + get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); +} + void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass diff --git a/ggml/src/ggml-cuda/getrows.cuh b/ggml/src/ggml-cuda/getrows.cuh index a1ca643f1c530..3c5bea5f48c1c 100644 --- a/ggml/src/ggml-cuda/getrows.cuh +++ b/ggml/src/ggml-cuda/getrows.cuh @@ -3,6 +3,13 @@ #define CUDA_GET_ROWS_BLOCK_SIZE 256 #define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256 +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream); + void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index fba8cb6565bae..9fb2134f98d3d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1551,7 +1551,7 @@ static void ggml_cuda_op_mul_mat( if (src1_on_device && src1_is_contiguous) { quantize_src1( - dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10, + dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10, nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float), src1_padded_col_size, ne11, ne12, ne13, stream); CUDA_CHECK(cudaGetLastError()); @@ -1649,7 +1649,7 @@ static void ggml_cuda_op_mul_mat( if (quantize_src1 && !src1_is_contiguous) { quantize_src1( - src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10, + src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10, src1_padded_col_size, src1_ncols, 1, 1, stream); CUDA_CHECK(cudaGetLastError()); } @@ -1949,6 +1949,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); } else if (!split && use_mul_mat_vec_q) { ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); + } else if (!split && use_mul_mat_q) { + ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst); } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention @@ -1964,183 +1966,145 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } } -struct mmid_row_mapping { - int32_t i1; - int32_t i2; -}; - -static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous, - int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping, - const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0, - int64_t ne11, int64_t ne10, - size_t nb11, size_t nb12) { - int32_t iid1 = blockIdx.x; - int32_t id = blockIdx.y; - - const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0); - - if (row_id_i != i02) { - return; - } - - const int64_t i11 = id % ne11; - const int64_t i12 = iid1; - - __shared__ int src1_row; - if (threadIdx.x == 0) { - src1_row = atomicAdd(cur_src1_row, 1); - row_mapping[src1_row] = {id, iid1}; - } - __syncthreads(); - - const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12); - float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11); - - for (int i = threadIdx.x; i < ne10; i += blockDim.x) { - src1_row_contiguous[i] = src1_row_original[i]; - } -} - -static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous, - const mmid_row_mapping * __restrict__ row_mapping, - int64_t ne0, - size_t nb1, size_t nb2) { - int32_t i = blockIdx.x; - - const int32_t i1 = row_mapping[i].i1; - const int32_t i2 = row_mapping[i].i2; - - const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1); - float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2); - - for (int j = threadIdx.x; j < ne0; j += blockDim.x) { - dst_row_original[j] = dst_row_contiguous[j]; - } -} - static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; - GGML_TENSOR_BINARY_OP_LOCALS - - if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) { - if (ggml_is_quantized(src0->type)) { - ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); - } else { - ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst); - } - return; - } - + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); - cudaStream_t stream = ctx.stream(); + GGML_TENSOR_BINARY_OP_LOCALS - const int64_t n_as = ne02; - const int64_t n_ids = ids->ne[0]; + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - std::vector ids_host(ggml_nbytes(ids)); - const char * ids_dev = (const char *) ids->data; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (ne2 == 1) { + if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); + } else { + ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst); + } + return; + } - ggml_tensor src0_row = *src0; - ggml_tensor src1_row = *src1; - ggml_tensor dst_row = *dst; + if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) { + ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); + return; + } + } - char * src0_original = (char *) src0->data; - char * src1_original = (char *) src1->data; - char * dst_original = (char *) dst->data; + cudaStream_t stream = ctx.stream(); - src0_row.ne[2] = 1; - src0_row.ne[3] = 1; - src0_row.nb[3] = nb02; + GGML_ASSERT(nb12 % nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); - src1_row.ne[1] = 1; - src1_row.ne[2] = 1; - src1_row.ne[3] = 1; - src1_row.nb[2] = nb11; - src1_row.nb[3] = nb11; + const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc)) + || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type; + const ggml_type type_dst_sorted = GGML_TYPE_F32; + const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted); + const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted); - dst_row.ne[1] = 1; - dst_row.ne[2] = 1; - dst_row.ne[3] = 1; - dst_row.nb[2] = nb1; - dst_row.nb[3] = nb1; + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; - ggml_cuda_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); - ggml_cuda_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); + std::vector ids_to_sorted_host; + ids_to_sorted_host.reserve(2*ne_get_rows); + std::vector ids_from_sorted_host(ne_get_rows); - src1_row.data = src1_contiguous.get(); - dst_row.data = dst_contiguous.get(); + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool(), 2*ne_get_rows); - for (int64_t i02 = 0; i02 < n_as; i02++) { - int64_t num_src1_rows = 0; + std::vector tokens_per_expert(ne02); - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + ggml_cuda_pool_alloc src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted); + ggml_cuda_pool_alloc dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted); - GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); + std::vector ids_host(ggml_nbytes(ids)); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); - if (row_id_i != i02) { - continue; + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size(); + ids_to_sorted_host.push_back(i12*ne11 + iex % ne11); + tokens_per_expert[i02]++; + break; } - - num_src1_rows++; } } + } + GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows)); - if (num_src1_rows == 0) { - continue; - } - - ggml_cuda_pool_alloc dev_cur_src1_row(ctx.pool(), 1); - ggml_cuda_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); - CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); - - { - dim3 block_dims(std::min((unsigned int)ne10, 768u)); - dim3 grid_dims(ids->ne[1], n_ids); - k_copy_src1_to_contiguous<<>>( - src1_original, src1_contiguous.get(), - dev_cur_src1_row.get(), dev_row_mapping.get(), - ids_dev, i02, ids->nb[1], ids->nb[0], - ne11, ne10, - nb11, nb12); - CUDA_CHECK(cudaGetLastError()); - } + ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end()); - src0_row.data = src0_original + i02*nb02; + CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); - GGML_ASSERT(nb11 == sizeof(float)*ne10); - GGML_ASSERT(nb1 == sizeof(float)*ne0); + const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows; + const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows; - src1_row.ne[1] = num_src1_rows; - src1_row.nb[1] = nb11; - src1_row.nb[2] = num_src1_rows*nb11; - src1_row.nb[3] = num_src1_rows*nb11; + get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted, + ne10, nb11, nb12, nb13, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream); + CUDA_CHECK(cudaGetLastError()); - dst_row.ne[1] = num_src1_rows; - dst_row.nb[1] = nb1; - dst_row.nb[2] = num_src1_rows*nb1; - dst_row.nb[3] = num_src1_rows*nb1; + char * src1_data_cur = (char *) src1_sorted.ptr; + char * dst_data_cur = (char *) dst_sorted.ptr; + for (int64_t i02 = 0; i02 < ne02; ++i02) { + if (tokens_per_expert[i02] == 0) { + continue; + } - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + ggml_tensor src0_slice = *src0; + src0_slice.ne[2] = 1; + src0_slice.nb[3] = src0_slice.nb[2]; + src0_slice.data = (char *) src0->data + i02*nb02; + + ggml_tensor src1_slice; + memset(&src1_slice, 0, sizeof(src1_slice)); + src1_slice.buffer = src1->buffer; + src1_slice.type = type_src1_sorted; + src1_slice.ne[0] = ne10; + src1_slice.ne[1] = tokens_per_expert[i02]; + src1_slice.ne[2] = 1; + src1_slice.ne[3] = 1; + src1_slice.nb[0] = ts_src1_sorted; + src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0]; + src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1]; + src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2]; + src1_slice.data = src1_data_cur; + + ggml_tensor dst_slice; + memset(&dst_slice, 0, sizeof(dst_slice)); + dst_slice.buffer = dst->buffer; + dst_slice.type = type_dst_sorted; + dst_slice.ne[0] = ne0; + dst_slice.ne[1] = tokens_per_expert[i02]; + dst_slice.ne[2] = 1; + dst_slice.ne[3] = 1; + dst_slice.nb[0] = ts_dst_sorted; + dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0]; + dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1]; + dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2]; + dst_slice.data = dst_data_cur; + + ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice); + CUDA_CHECK(cudaGetLastError()); - { - dim3 block_dims(std::min((unsigned int)ne0, 768u)); - dim3 grid_dims(num_src1_rows); - k_copy_dst_from_contiguous<<>>( - dst_original, dst_contiguous.get(), - dev_row_mapping.get(), - ne0, - nb1, nb2); - CUDA_CHECK(cudaGetLastError()); - } + src1_data_cur += src1_slice.nb[2]; + dst_data_cur += dst_slice.nb[2]; } + + get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type, + ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + nb1, nb2, nb3, stream); } static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index b36b43d5417ba..f397a7e038469 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -1,37 +1,10 @@ #include "mmq.cuh" +#include "quantize.cuh" -void ggml_cuda_op_mul_mat_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { - - const int64_t ne00 = src0->ne[0]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - GGML_ASSERT(ne10 % QK8_1 == 0); +#include - const int64_t ne0 = dst->ne[0]; - - const int64_t row_diff = row_high - row_low; - const int64_t stride00 = ne00 / ggml_blck_size(src0->type); - - int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - - // The stream-k decomposition is only faster for recent NVIDIA GPUs. - // Also its fixup needs to allocate a temporary buffer in the memory pool. - // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && - ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; - - switch (src0->type) { +static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { + switch (args.type_x) { case GGML_TYPE_Q4_0: mul_mat_q_case(ctx, args, stream); break; @@ -90,10 +63,195 @@ void ggml_cuda_op_mul_mat_q( GGML_ABORT("fatal error"); break; } +} + +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. + + GGML_TENSOR_BINARY_OP_LOCALS; + + cudaStream_t stream = ctx.stream(); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); + + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + + const char * src0_d = (const char *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; + + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; + + if (!ids) { + const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, + ne00, ne01, ne1, s01, s1, + ne02, ne12, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); + return; + } + + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(nb12 % nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); + + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; + + std::vector ids_host(ggml_nbytes(ids)); + std::vector ids_src1_host; + ids_src1_host.reserve(ne_get_rows); + std::vector ids_dst_host; + ids_dst_host.reserve(ne_get_rows); + std::vector tokens_per_expert_host(ne02); + std::vector expert_bounds_host(ne02 + 1); + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool()); + + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11); + ids_dst_host.push_back(i12*ne1 + iex); + tokens_per_expert_host[i02]++; + break; + } + } + } + } + + int32_t cumsum = 0; + for (int64_t i = 0; i < ne02; ++i) { + expert_bounds_host[i] = cumsum; + cumsum += tokens_per_expert_host[i]; + } + expert_bounds_host[ne02] = cumsum; + + std::vector ids_buf_host; + ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size()); + ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end()); + ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end()); + ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end()); + ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device. + CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + const int32_t * ids_src1_dev = ids_buf_dev.ptr; + const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size(); + const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size(); + + const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + const int64_t ne11_flat = ne12*n_expert_used; + const int64_t ne12_flat = 1; + const int64_t ne13_flat = 1; + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[2] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid. + const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, + ne00, ne01, ne_get_rows, s01, s1, + ne02, ne02, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); +} + +void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + const int64_t stride01 = ne00 / ggml_blck_size(src0->type); + + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; + + // The stream-k decomposition is only faster for recent NVIDIA GPUs. + // Also its fixup needs to allocate a temporary buffer in the memory pool. + // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && + ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; + const mmq_args args = { + src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, + ne00, row_diff, src1_ncols, stride01, nrows_dst, + 1, 1, 0, 0, 0, + 1, 1, 0, 0, 0, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_ddf_i); + GGML_UNUSED(src1_padded_row_size); } bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 3cb2015520ba1..8c93e8326e20b 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -13,9 +13,10 @@ using namespace ggml_cuda_mma; #define MMQ_ITER_K 256 #define MMQ_NWARPS 8 -typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride); -typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00); -typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max); +typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride); +typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00); +typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted, + float * __restrict__ dst, const int stride, const int i_max, const int j_max); enum mmq_q8_1_ds_layout { MMQ_Q8_1_DS_LAYOUT_D4, @@ -233,7 +234,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */ // ------------------------------------------------------------ template static __device__ __forceinline__ void load_tiles_q4_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -289,7 +290,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y); const int * x_qs = (const int *) x; @@ -328,7 +329,7 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q4_1( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -384,7 +385,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y); const int * x_qs = (const int *) x; @@ -423,7 +424,7 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -495,7 +496,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q5_1( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -565,7 +566,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q8_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -621,7 +622,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y); const int * x_qs = (const int *) x; @@ -651,7 +652,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ -732,7 +733,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); const int * x_qs = (const int *) x; @@ -762,7 +763,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ -839,7 +840,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16; const int * x_qs = (const int *) x; @@ -871,7 +872,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -955,7 +956,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q2_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1011,7 +1012,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y); const int * x_qs = (const int *) x; @@ -1074,7 +1075,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1201,7 +1202,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q3_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1298,7 +1299,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y); const int * x_qs = (const int *) x; @@ -1340,7 +1341,7 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co } template static __device__ __forceinline__ void load_tiles_q4_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1437,7 +1438,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y); const int * x_qs = (const int *) x; @@ -1469,7 +1470,7 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1578,7 +1579,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y); const int * x_qs = (const int *) x; @@ -1610,7 +1611,7 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q6_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1693,7 +1694,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y); const int * x_qs = (const int *) x; @@ -1726,7 +1727,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1835,7 +1836,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_iq4_nl( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1893,7 +1894,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xxs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1951,7 +1952,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2007,7 +2008,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2070,7 +2071,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_xxs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2126,7 +2127,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2189,7 +2190,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq1_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2245,7 +2246,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq4_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2306,8 +2307,8 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void mmq_write_back_dp4a( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -2324,15 +2325,15 @@ static __device__ __forceinline__ void mmq_write_back_dp4a( continue; } - dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst[j]*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } template static __device__ __forceinline__ void mmq_write_back_mma( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { typedef tile<16, 8, int> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); @@ -2362,7 +2363,7 @@ static __device__ __forceinline__ void mmq_write_back_mma( continue; } - dst[j*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; + dst[ids_dst[j]*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; } } } @@ -2518,17 +2519,18 @@ struct mmq_type_traits { }; template -static __device__ void mul_mat_q_process_tile( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0, - const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) { +static __device__ __forceinline__ void mul_mat_q_process_tile( + const char * __restrict__ x, const int offset_x, const int * __restrict__ y, + const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; - extern __shared__ char data_mul_mat_q[]; - int * tile_y = (int *) data_mul_mat_q; + extern __shared__ int data_mul_mat_q[]; + int * tile_y = data_mul_mat_q + mmq_x; int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE); #ifdef NEW_MMA_AVAILABLE @@ -2543,16 +2545,11 @@ static __device__ void mul_mat_q_process_tile( float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int tile_x_max_i = ne01 - it*mmq_y - 1; - const int tile_y_max_j = ne11 - jt*mmq_x - 1; - - const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); - for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { - load_tiles(x, tile_x, stride01*it*mmq_y + kb0, tile_x_max_i, stride01); + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2568,7 +2565,7 @@ static __device__ void mul_mat_q_process_tile( __syncthreads(); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2585,12 +2582,10 @@ static __device__ void mul_mat_q_process_tile( } if (fixup) { - write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); + write_back(sum, ids_dst, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); } else { - write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j); + write_back(sum, ids_dst, dst, stride_col_dst, tile_x_max_i, tile_y_max_j); } - - GGML_UNUSED(ne00); GGML_UNUSED(ne10); } @@ -2609,8 +2604,11 @@ template #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { + const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst, + const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) { @@ -2621,26 +2619,85 @@ static __global__ void mul_mat_q( constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); + const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x + const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y + + // Initialize the ids for writing back data with just the index. + // For regular matrix multiplications this is never changed. + // For MoE the correct indices are loaded from ids_dst. + extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory. +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA { + const int wt = blockIdx.z / nchannels_y; + const int zt = blockIdx.z - wt*nchannels_y; + const int jt = blockIdx.y; + const int it = blockIdx.x; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; + constexpr bool fixup = false; mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - blockIdx.x, blockIdx.y, 0, ne00/qk); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, 0, ncols_x/qk); return; } #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x - const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y - // kbc == k block continuous, current index in continuous ijk space. - int64_t kbc = (int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x; - int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x; + int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter; @@ -2649,13 +2706,64 @@ static __global__ void mul_mat_q( int kb0_start = kbc % blocks_per_ne00; int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc); while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) { - const int jt = kbc / (blocks_per_ne00*nty); // j index of current tile. - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile. + int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + kbc += blocks_per_ne00; + kbc -= kbc % blocks_per_ne00; + + kb0_start = 0; + kb0_stop = min(blocks_per_ne00, kbc_stop - kbc); + + continue; + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); kbc += blocks_per_ne00; kbc -= kbc % blocks_per_ne00; @@ -2668,55 +2776,106 @@ static __global__ void mul_mat_q( return; } - const int jt = kbc / (blocks_per_ne00*nty); - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; + int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + + // The memory layout for the fixup buffer is always contiguous, therefore reset ids: +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); } template static __global__ void mul_mat_q_stream_k_fixup( - float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) { - + const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, + const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst, + const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; - const int nty = (ne01 + mmq_y - 1) / mmq_y; - - bool any_fixup = false; + const int ntx = (ncols_y + mmq_x - 1) / mmq_x; + const int nty = (nrows_x + mmq_y - 1) / mmq_y; - const int bidx_start = ((blockIdx.y*nty + blockIdx.x) * block_num_mmq) / (gridDim.y*gridDim.x); - const int bidx_stop = ((blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq + gridDim.y*gridDim.x - 1) / (gridDim.y*gridDim.x); + const int bidx0 = blockIdx.x; - int64_t kbc_0; - int64_t kbc_stop_0 = (int64_t) bidx_start*blocks_per_ne00*ntx*nty / block_num_mmq; - - for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) { - kbc_0 = kbc_stop_0; - kbc_stop_0 = (int64_t) (bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq; + // kbc == k block continuous, current index in continuous ijk space. + int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; - const int64_t kbc = kbc_0 - (kbc_0 % blocks_per_ne00) % blocks_per_iter; - const int64_t kbc_stop = kbc_stop_0 - (kbc_stop_0 % blocks_per_ne00) % blocks_per_iter; + kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter; + kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter; - // Skip fixup tile if the MMQ CUDA block never wrote anything to it: - if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) { - continue; - } + const bool did_not_have_any_data = kbc0 == kbc0_stop; + const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0; + const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0; + if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { + return; + } - const int jt = kbc_stop / (blocks_per_ne00*nty); - const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; + bool any_fixup = false; - // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block: - if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) { + // Iterate over previous blocks and sum up partial sums written to fixup buffer. + // All CUDA blocks that get here must have a previous block that needs a fixup. + int64_t bidx = bidx0 - 1; + int64_t kbc_stop = kbc0; + while(true) { + int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; + + if (kbc == kbc_stop) { // Did not have any data. + bidx--; + kbc_stop = kbc; continue; } @@ -2733,16 +2892,71 @@ static __global__ void mul_mat_q_stream_k_fixup( sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i]; } } + + // If this block started in a previous tile we are done and don't need to combine additional partial results. + if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) { + break; + } + bidx--; + kbc_stop = kbc; } if (!any_fixup) { return; } - dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y; + int tmp = kbc0; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; - const int i_max = ne01 - blockIdx.x*mmq_y - 1; - const int j_max = ne11 - blockIdx.y*mmq_x - 1; + if (!ids_dst) { + const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = ncols_y - jt*mmq_x - 1; + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j > j_max) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + if (need_check && i > i_max) { + continue; + } + + dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + } + } + return; + } + + __shared__ int ids_dst_shared[mmq_x]; + const int col_low = expert_bounds[zt + 0]; + const int col_high = expert_bounds[zt + 1]; + const int col_diff = col_high - col_low; + + for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) { + ids_dst_shared[j] = ids_dst[col_low + j]; + } + + const int offset_dst = it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = col_diff - jt*mmq_x - 1; #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { @@ -2760,26 +2974,27 @@ static __global__ void mul_mat_q_stream_k_fixup( continue; } - dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } struct mmq_args { - const char * x; const char * y; float * dst; - int64_t ne00; int64_t ne01; int64_t stride01; - int64_t ne10; int64_t ne11; int64_t stride11; - int64_t ne0; + const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst; + int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst; + int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst; + int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst; bool use_stream_k; }; template -static int mmq_get_shmem(const int mmq_x, const int mmq_y, const int cc) { +static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc) { const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y); const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type); - const int shmem_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); - const int shmem_y = mmq_x*sizeof(block_q8_1_mmq); - return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); + const size_t nbs_ids = mmq_x*sizeof(int); + const size_t nbs_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); + const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq); + return nbs_ids + nbs_x + GGML_PAD(nbs_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); } template @@ -2791,86 +3006,114 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1); - const int shmem = mmq_get_shmem(mmq_x, mmq_y, cc); + const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc); #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; - if (!shmem_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - shmem_limit_raised[id] = true; + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + shared_memory_limit_raised[id] = true; } #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - const int nty = (args.ne01 + mmq_y - 1) / mmq_y; - const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; - const dim3 block_nums_xy_tiling(nty, ntx, 1); + const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; + const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x; + const int ntzw = args.nchannels_y * args.nsamples_y; + const dim3 block_nums_xy_tiling(nty, ntx, ntzw); + + GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0); + GGML_ASSERT(args.nsamples_y % args.nsamples_x == 0); + const int channel_ratio = args.nchannels_y / args.nchannels_x; + const int sample_ratio = args.nsamples_y / args.nsamples_x; if (!args.use_stream_k) { - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } return; } - const dim3 block_nums_mmq(nsm, 1, 1); + const dim3 block_nums_stream_k(nsm, 1, 1); + const bool fixup_needed = ntx*nty*ntzw % nsm != 0; ggml_cuda_pool & pool = ctx.pool(id); - ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y); + ggml_cuda_pool_alloc tmp_fixup(pool); + if (fixup_needed) { + tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y); + } - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); + + if (!fixup_needed) { + return; + } - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); + + if (!fixup_needed) { + return; + } - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } } template void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const int smpbo = ggml_cuda_info().devices[id].smpbo; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); - const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; int mmq_x_best = 0; - int nparts_best = INT_MAX; + int ntiles_x_best = INT_MAX; - for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) { + for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) { const int granularity = mmq_get_granularity_host(mmq_x, cc); - if (mmq_x % granularity != 0 || mmq_get_shmem(mmq_x, mmq_y, cc) > smpbo) { + if (mmq_x % granularity != 0 || mmq_get_nbytes_shared(mmq_x, mmq_y, cc) > smpbo) { continue; } - const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x; - const int nwaves_xy_tiling = ntiles_x*block_num_y; - const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling; + const int ntiles_x = (args.ncols_y + mmq_x - 1) / mmq_x; - if (nparts < nparts_best) { - mmq_x_best = mmq_x; - nparts_best = nparts; + if (ntiles_x < ntiles_x_best) { + mmq_x_best = mmq_x; + ntiles_x_best = ntiles_x; } } @@ -2954,6 +3197,9 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); // ------------------------------------------------------------------------------------------------------------------------- +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index d846e35a6a26d..132c466fd1aa6 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -158,7 +158,7 @@ static __global__ void mul_mat_vec_q( const int blocks_per_row_x = ncols_x / qk; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; - // The MUL_MAT_ID code path with ids != nullptr is only implemetned for ncols_dst == 1. + // The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1. const int channel_dst = blockIdx.y; const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio; const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst; @@ -507,7 +507,7 @@ void ggml_cuda_mul_mat_vec_q( GGML_ASSERT( nb0 == ts_dst); GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); - GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. const float * src1_d = (const float *) src1->data; const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; @@ -519,7 +519,7 @@ void ggml_cuda_mul_mat_vec_q( const int64_t s11 = src1->nb[1] / ts_src1; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s13 = src1->nb[3] / ts_src1; - quantize_row_q8_1_cuda(src1_d, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); } const int64_t s01 = src0->nb[1] / ts_src0; diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 3bab47d56a22e..931a45ad347dc 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -49,29 +49,38 @@ static __global__ void quantize_q8_1( template static __global__ void quantize_mmq_q8_1( - const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) { + const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32; constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32; - const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; + const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const float4 * x4 = (const float4 *) x; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; - const int64_t ix1 = kx1*blockIdx.z + blockIdx.y; + const int64_t i00 = i0; + const int64_t i01 = ids ? ids[i1] : i1; + const int64_t i02 = i2; + const int64_t i03 = i3; + + const float4 * x4 = (const float4 *) x; block_q8_1_mmq * y = (block_q8_1_mmq *) vy; const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel - const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel - const int64_t iqs = ix0 % (4*QK8_1); // quant index in block + const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel + const int64_t iqs = i0 % (4*QK8_1); // quant index in block // Load 4 floats per thread and calculate max. abs. value between them: - const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); + const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); float amax = fabsf(xi.x); amax = fmaxf(amax, fabsf(xi.y)); amax = fmaxf(amax, fabsf(xi.z)); @@ -87,7 +96,7 @@ static __global__ void quantize_mmq_q8_1( if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) { sum = xi.x + xi.y + xi.z + xi.w; - // Exchange calculate sum across vals_per_sum/4 threads. + // Calculate sums across vals_per_sum/4 threads. #pragma unroll for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); @@ -137,9 +146,10 @@ static __global__ void quantize_mmq_q8_1( } void quantize_row_q8_1_cuda( - const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, - const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { - + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(!ids); GGML_ASSERT(ne0 % QK8_1 == 0); const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; @@ -150,9 +160,9 @@ void quantize_row_q8_1_cuda( } void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, - const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { - + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { GGML_ASSERT(ne0 % (4*QK8_1) == 0); const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); @@ -161,21 +171,18 @@ void quantize_mmq_q8_1_cuda( switch (mmq_get_q8_1_ds_layout(type_src0)) { case MMQ_Q8_1_DS_LAYOUT_D4: quantize_mmq_q8_1 - <<>>(x, vy, ne00, ne1, ne0); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_DS4: quantize_mmq_q8_1 - <<>>(x, vy, ne00, ne1, ne0); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_D2S6: quantize_mmq_q8_1 - <<>>(x, vy, ne00, ne1, ne0); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; default: GGML_ABORT("fatal error"); break; } - GGML_UNUSED(s01); - GGML_UNUSED(s02); - GGML_UNUSED(s03); } diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh index b627c4e4008b4..725ab52443c0e 100644 --- a/ggml/src/ggml-cuda/quantize.cuh +++ b/ggml/src/ggml-cuda/quantize.cuh @@ -12,13 +12,16 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); typedef void (*quantize_cuda_t)( - const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, - const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); void quantize_row_q8_1_cuda( - const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, - const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, - const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream); diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 140a775f9806f..e662cc6eb3f48 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -518,6 +518,11 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { result.view_src = reinterpret_cast(tensor->view_src); result.view_offs = tensor->view_offs; result.data = reinterpret_cast(tensor->data); + + // Avoid sending uninitialized data over the wire + memset(result.name, 0, sizeof(result.name)); + memset(result.padding, 0, sizeof(result.padding)); + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); return result; } diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 9d028f718d0fe..31816219c06fd 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -71,6 +71,22 @@ if (Vulkan_FOUND) add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) endif() + # Compile a test shader to determine whether GL_EXT_bfloat16 is supported. + # If it's not, there will be an error to stderr. + # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_bfloat16_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) + + if (${glslc_error} MATCHES ".*extension not supported: GL_EXT_bfloat16.*") + message(STATUS "GL_EXT_bfloat16 not supported by glslc") + set(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT OFF) + else() + message(STATUS "GL_EXT_bfloat16 supported by glslc") + set(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT ON) + add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + endif() + target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan) target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) @@ -142,6 +158,7 @@ if (Vulkan_FOUND) -DGGML_VULKAN_COOPMAT_GLSLC_SUPPORT=${GGML_VULKAN_COOPMAT_GLSLC_SUPPORT} -DGGML_VULKAN_COOPMAT2_GLSLC_SUPPORT=${GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT} -DGGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT=${GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT} + -DGGML_VULKAN_BFLOAT16_GLSLC_SUPPORT=${GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT} BUILD_COMMAND ${CMAKE_COMMAND} --build . INSTALL_COMMAND ${CMAKE_COMMAND} --install . INSTALL_DIR ${CMAKE_BINARY_DIR} diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c0bdb9e17a7b4..eac0b422bc67b 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -51,6 +51,24 @@ #include "ggml-vulkan-shaders.hpp" +// remove this once it's more widely available in the SDK +#if !defined(VK_KHR_shader_bfloat16) + +#define VK_KHR_shader_bfloat16 1 +#define VK_KHR_SHADER_BFLOAT16_SPEC_VERSION 1 +#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16" +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000) +#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000) + +typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 shaderBFloat16Type; + VkBool32 shaderBFloat16DotProduct; + VkBool32 shaderBFloat16CooperativeMatrix; +} VkPhysicalDeviceShaderBfloat16FeaturesKHR; +#endif + #define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1)) #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } @@ -266,8 +284,9 @@ struct vk_device_struct { bool subgroup_require_full_support; bool coopmat_support; - bool coopmat_acc_f32_support; - bool coopmat_acc_f16_support; + bool coopmat_acc_f32_support {}; + bool coopmat_acc_f16_support {}; + bool coopmat_bf16_support {}; uint32_t coopmat_m; uint32_t coopmat_n; uint32_t coopmat_k; @@ -293,6 +312,7 @@ struct vk_device_struct { vk_matmul_pipeline pipeline_matmul_f32 {}; vk_matmul_pipeline pipeline_matmul_f32_f16 {}; + vk_matmul_pipeline pipeline_matmul_bf16 {}; vk_matmul_pipeline2 pipeline_matmul_f16; vk_matmul_pipeline2 pipeline_matmul_f16_f32; @@ -301,6 +321,7 @@ struct vk_device_struct { vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_COUNT]; vk_matmul_pipeline pipeline_matmul_id_f32 {}; + vk_matmul_pipeline pipeline_matmul_id_bf16 {}; vk_matmul_pipeline2 pipeline_matmul_id_f16; vk_matmul_pipeline2 pipeline_matmul_id_f16_f32; @@ -333,8 +354,8 @@ struct vk_device_struct { vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32; - vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; - vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16; + vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f32_bf16; + vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f32_bf16; vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_norm_f32; @@ -1791,6 +1812,12 @@ static void ggml_vk_load_shaders(vk_device& device) { if (!device->pipeline_matmul_id_f32) { device->pipeline_matmul_id_f32 = std::make_shared(); } + if (!device->pipeline_matmul_bf16) { + device->pipeline_matmul_bf16 = std::make_shared(); + } + if (!device->pipeline_matmul_id_bf16) { + device->pipeline_matmul_id_bf16 = std::make_shared(); + } std::vector> compiles; auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, @@ -1900,6 +1927,11 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3) +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + } +#endif CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) @@ -1921,6 +1953,11 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + } +#endif CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) @@ -1974,6 +2011,11 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ) + } +#endif if (device->coopmat_acc_f16_support) { CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); @@ -2022,6 +2064,11 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (device->coopmat_bf16_support) { + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + } +#endif if (device->coopmat_acc_f16_support) { CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); @@ -2104,6 +2151,8 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); @@ -2139,6 +2188,8 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); @@ -2191,6 +2242,8 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); @@ -2226,6 +2279,8 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); @@ -2246,8 +2301,26 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); -#undef CREATE_MM } + // reusing CREATE_MM from the fp32 path + if ((device->coopmat2 || device->coopmat_support) +#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) + && !device->coopmat_bf16_support +#endif + ) { + // use scalar tile sizes + l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; + m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, 4, 2, 1, subgroup_size_8 }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, 1, subgroup_size_8 }; + + l_wg_denoms = {128, 128, 1 }; + m_wg_denoms = { 64, 64, 1 }; + s_wg_denoms = { 32, 32, 1 }; + + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4, _id); + } +#undef CREATE_MM // mul mat vec @@ -2266,6 +2339,7 @@ static void ggml_vk_load_shaders(vk_device& device) { for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1), mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32_"+std::to_string(i+1), mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32_"+std::to_string(i+1), mul_mat_vec_bf16_f32_f32_len, mul_mat_vec_bf16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); @@ -2288,6 +2362,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32_"+std::to_string(i+1), mul_mat_vec_bf16_f16_f32_len, mul_mat_vec_bf16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true); @@ -2311,6 +2386,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", mul_mat_vec_id_bf16_f32_len, mul_mat_vec_id_bf16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); @@ -2356,6 +2432,7 @@ static void ggml_vk_load_shaders(vk_device& device) { // get_rows ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_BF16], "get_rows_bf16", get_rows_bf16_len, get_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); @@ -2373,6 +2450,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_BF16], "get_rows_bf16_f32", get_rows_bf16_f32_len, get_rows_bf16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); @@ -2399,7 +2477,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true); } } - ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9 * sizeof(uint32_t), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); @@ -2410,10 +2488,13 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + if (device->float_controls_rte_fp16) { ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); @@ -2578,6 +2659,7 @@ static vk_device ggml_vk_get_device(size_t idx) { bool coopmat2_support = false; device->coopmat_support = false; device->integer_dot_product = false; + bool bfloat16_support = false; for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { @@ -2608,6 +2690,9 @@ static vk_device ggml_vk_get_device(size_t idx) { !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { device->integer_dot_product = true; #endif + } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_BFLOAT16")) { + bfloat16_support = true; } } @@ -2794,6 +2879,17 @@ static vk_device ggml_vk_get_device(size_t idx) { } #endif +#if defined(VK_KHR_shader_bfloat16) + VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; + bfloat16_features.pNext = nullptr; + bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; + if (bfloat16_support) { + last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; + last_struct = (VkBaseOutStructure *)&bfloat16_features; + device_extensions.push_back("VK_KHR_shader_bfloat16"); + } +#endif + VkPhysicalDeviceMaintenance4Features maint4_features {}; maint4_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES; if (maintenance4_support) { @@ -2991,6 +3087,25 @@ static vk_device ggml_vk_get_device(size_t idx) { device->coopmat_int_n = prop.NSize; device->coopmat_int_k = prop.KSize; } +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && + (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup + ) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_bf16_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_bf16_support = true; + } + } +#endif } if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) { @@ -2998,11 +3113,19 @@ static vk_device ggml_vk_get_device(size_t idx) { GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n"); device->coopmat_support = false; } + if (getenv("GGML_VK_DISABLE_BFLOAT16")) { + device->coopmat_bf16_support = false; + } } if (device->coopmat_support) { device_extensions.push_back("VK_KHR_cooperative_matrix"); } +#if defined(VK_KHR_shader_bfloat16) + if (device->coopmat_bf16_support) { + device_extensions.push_back("VK_KHR_shader_bfloat16"); + } +#endif #endif device->name = GGML_VK_NAME + std::to_string(idx); @@ -3459,6 +3582,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f32_f16; } + if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { + return ctx->device->pipeline_matmul_bf16; + } if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f16_f32.f16acc; @@ -3530,6 +3656,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * switch (a_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -3562,6 +3689,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_id_f32; } + if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { + return ctx->device->pipeline_matmul_id_bf16; + } if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_id_f16_f32.f16acc; @@ -3615,6 +3745,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context switch (a_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -4350,6 +4481,13 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_cpy_f16_f16; } } + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_BF16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_bf16; + } else { + return ctx->device->pipeline_cpy_f32_bf16; + } + } if (src->type == GGML_TYPE_F32) { switch (to) { case GGML_TYPE_Q4_0: @@ -4477,8 +4615,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || + (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || !ggml_vk_dim01_contiguous(src1); + // If src0 is BF16, try to use a BF16 x BF16 multiply + ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; + const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0; @@ -4488,25 +4630,25 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub if (mmp == nullptr) { // Fall back to f16 dequant mul mat - mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]); + mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); quantize_y = false; } const bool qx_needs_dequant = mmp == nullptr || x_non_contig; - const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig); + const bool qy_needs_dequant = !quantize_y && ((src1->type != f16_type && !y_f32_kernel) || y_non_contig); if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat - mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]); + mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); } // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type))); + const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? f16_type : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type))); const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && ne11 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type)); + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? f16_type : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type)); // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) : ne11; @@ -4527,12 +4669,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline to_q8_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, f16_type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -4949,6 +5091,8 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint64_t nb01 = src0->nb[1]; const uint64_t nb02 = src0->nb[2]; + const uint64_t nb12 = src1->nb[2]; + // const uint64_t ne10 = src1->ne[0]; const uint64_t ne11 = src1->ne[1]; const uint64_t ne12 = src1->ne[2]; @@ -4974,6 +5118,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t); const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t); + const uint32_t channel_stride_y = nb12 / sizeof(float); const uint64_t qx_sz = ggml_nbytes(src0); const uint64_t qy_sz = ggml_nbytes(src1); @@ -5004,7 +5149,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; // compute - const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; + const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); @@ -5029,7 +5174,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four) // when ne12 and ne13 are one. } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) && - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun); } else { ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun); @@ -5097,27 +5242,31 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || + (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || !ggml_vk_dim01_contiguous(src1); + // If src0 is BF16, try to use a BF16 x BF16 multiply + ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; + const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]); + vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); const bool qx_needs_dequant = mmp == nullptr || x_non_contig; - const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; + const bool qy_needs_dequant = (src1->type != f16_type && !y_f32_kernel) || y_non_contig; if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat - mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]); + mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); } // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? GGML_TYPE_F16 : src0->type)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? f16_type : src0->type)); const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type); + vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? f16_type : src0->type); // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11; @@ -5136,12 +5285,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, f16_type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -9227,6 +9376,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm switch (src0_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -9262,10 +9412,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (a->ne[3] != b->ne[3]) { return false; } - if (!(ggml_vk_dim01_contiguous(op->src[0]) || op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) || + if (!(ggml_vk_dim01_contiguous(op->src[0]) || op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_BF16) || !(ggml_vk_dim01_contiguous(op->src[1]) || op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) { return false; } + if (op->src[0]->type == GGML_TYPE_BF16 && op->src[1]->type == GGML_TYPE_F16) { + // We currently don't have a bf16 x f16 shader, or an fp16->bf16 copy shader. + // So don't support this combination for now. + return false; + } return true; } break; @@ -9338,6 +9493,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm switch (op->src[0]->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -9368,6 +9524,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm switch (src1_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index d6e0b2a5a5dd6..ad13f69b3fb1a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -12,6 +12,9 @@ endif() if (GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) endif() +if (GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) +endif() set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp index dd828c232628c..6567a8c54cf49 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp @@ -18,7 +18,11 @@ void main() { // fast path for when all four iterations are in-bounds if (idx + (num_iter-1)*num_threads < p.ne) { [[unroll]] for (uint i = 0; i < num_iter; ++i) { -#ifndef OPTIMIZATION_ERROR_WORKAROUND + +#if defined(DATA_D_BF16) + float f = float(data_a[get_aoffset() + idx]); + data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f)); +#elif !defined(OPTIMIZATION_ERROR_WORKAROUND) data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]); #else data_d[get_doffset() + idx] = data_a[get_aoffset() + idx]; @@ -31,7 +35,10 @@ void main() { continue; } -#ifndef OPTIMIZATION_ERROR_WORKAROUND +#if defined(DATA_D_BF16) + float f = float(data_a[get_aoffset() + idx]); + data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f)); +#elif !defined(OPTIMIZATION_ERROR_WORKAROUND) data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]); #else data_d[get_doffset() + idx] = data_a[get_aoffset() + idx]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp index 29c9064942d93..f476a2e3dd83e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp @@ -12,7 +12,10 @@ void main() { return; } -#ifndef OPTIMIZATION_ERROR_WORKAROUND +#if defined(DATA_D_BF16) + float f = float(data_a[get_aoffset() + src0_idx(idx)]); + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(fp32_to_bf16(f)); +#elif !defined(OPTIMIZATION_ERROR_WORKAROUND) data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]); #else data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index 2a162a2c81543..0d9739d40609a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -23,6 +23,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { } #endif +#if defined(DATA_A_BF16) +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + return vec2(bf16_to_fp32(data_a[a_offset + ib]), bf16_to_fp32(data_a[a_offset + ib + 1])); +} +#endif + #if defined(DATA_A_Q4_0) vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); @@ -428,7 +434,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) { } #endif -#if defined(DATA_A_F32) || defined(DATA_A_F16) +#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16) vec2 get_dm(uint ib, uint a_offset) { return vec2(0, 0); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp index 962d2353f88a5..9cb7da2daab5d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -482,7 +482,7 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo const uint ib8 = (idx & 0x18) >> 3; // 0..3 const uint iqs = 8 * ib32 + ib8; - const uint8_t qs = bl.block.qs[iqs]; + const uint qs = bl.block.qs[iqs]; const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3])); const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28)); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp index e877ed7796a8f..ee6b86a18ddf2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp @@ -20,9 +20,14 @@ void main() { const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03; const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23; +#if defined(DATA_A_BF16) + FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00])); +#else + FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]); +#endif #ifndef OPTIMIZATION_ERROR_WORKAROUND - data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]); + data_d[d_offset + i00] = D_TYPE(v); #else - data_d[d_offset + i00] = data_a[a_offset + i00]; + data_d[d_offset + i00] = D_TYPE(v); #endif } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 775b48cd05e1a..bb429dd594588 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -6,7 +6,7 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -#if !defined(DATA_A_F32) && !defined(DATA_A_F16) +#if !defined(DATA_A_F32) && !defined(DATA_A_F16) && !defined(DATA_A_BF16) #define K_PER_ITER 8 #else #define K_PER_ITER 2 diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp index 48376637fb3e7..bc633369f9bb5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp @@ -21,7 +21,9 @@ layout (push_constant) uniform parameter uint nrows_x; uint row_stride_x; uint channel_stride_x; + uint channel_stride_y; uint channel_x_divisor; + uint ne12; uint b_offset; uint d_offset; } p; @@ -33,6 +35,7 @@ void main() { const uint row_x = gl_GlobalInvocationID.y; const uint channel = gl_GlobalInvocationID.z; const uint channel_x = channel / p.channel_x_divisor; + const uint channel_y = channel % p.ne12; const uint nrows_y = p.ncols_x; const uint nrows_dst = p.nrows_x; @@ -56,7 +59,7 @@ void main() { const uint row_y = col_x; const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x; - const uint iy = channel*nrows_y + row_y; + const uint iy = channel_y*p.channel_stride_y + row_y; const vec4 av4 = vec4(data_a_v4[ix / 4]); const vec4 bv4 = vec4(data_b_v4[iy / 4]); @@ -72,7 +75,7 @@ void main() { const uint row_y = col_x; const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x; - const uint iy = channel*nrows_y + row_y; + const uint iy = channel_y*p.channel_stride_y + row_y; const vec4 av4 = vec4(data_a_v4[ix / 4]); const vec4 bv4 = vec4(data_b_v4[iy / 4]); @@ -89,7 +92,7 @@ void main() { const uint row_y = col_x; const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x; - const uint iy = channel*nrows_y + row_y; + const uint iy = channel_y*p.channel_stride_y + row_y; const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 23ce8ceec332b..529ac4d44fecc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -10,6 +10,10 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #endif +#if defined(DATA_A_BF16) && defined(COOPMAT) +#extension GL_EXT_bfloat16 : enable +#endif + #ifdef COOPMAT #extension GL_KHR_cooperative_matrix : enable #extension GL_KHR_memory_scope_semantics : enable @@ -29,6 +33,10 @@ #define LOAD_VEC_B 1 #endif +#if !defined(TO_FLOAT_TYPE) +#define TO_FLOAT_TYPE FLOAT_TYPE +#endif + layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; @@ -202,8 +210,8 @@ void main() { #endif #ifdef COOPMAT - coopmat cache_a; - coopmat cache_b; + coopmat cache_a; + coopmat cache_b; coopmat sums[cms_per_row * cms_per_col]; [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) { @@ -248,6 +256,21 @@ void main() { buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f); } #endif +#elif defined(DATA_A_BF16) +#if LOAD_VEC_A == 4 + const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; + buf_a[buf_idx ] = TO_FLOAT_TYPE(data_a[idx].x); + buf_a[buf_idx + 1] = TO_FLOAT_TYPE(data_a[idx].y); + buf_a[buf_idx + 2] = TO_FLOAT_TYPE(data_a[idx].z); + buf_a[buf_idx + 3] = TO_FLOAT_TYPE(data_a[idx].w); +#else + if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) { + buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); + } else { + buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(uint16_t(0)); + } +#endif #elif defined(DATA_A_Q4_0) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a; @@ -695,13 +718,13 @@ void main() { const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; #endif const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; - buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx].x); - buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx].y); - buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx].z); - buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx].w); + buf_b[buf_idx + 0] = TO_FLOAT_TYPE(data_b[idx].x); + buf_b[buf_idx + 1] = TO_FLOAT_TYPE(data_b[idx].y); + buf_b[buf_idx + 2] = TO_FLOAT_TYPE(data_b[idx].z); + buf_b[buf_idx + 3] = TO_FLOAT_TYPE(data_b[idx].w); #elif !MUL_MAT_ID if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) { - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); } else { buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); } @@ -709,7 +732,7 @@ void main() { const uint row_i = ic * BN + loadc_b + l; if (row_i < _ne1) { const u16vec2 row_idx = row_ids[row_i]; - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); } else { buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp index 06b7ab09ea51a..344b466101beb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -14,6 +14,9 @@ #extension GL_EXT_buffer_reference : enable #extension GL_KHR_shader_subgroup_ballot : enable #extension GL_KHR_shader_subgroup_vote : enable +#ifdef DATA_A_BF16 +#extension GL_EXT_bfloat16 : enable +#endif #include "types.comp" @@ -80,6 +83,12 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; #define store_scales(a) #endif +#if defined(DATA_A_BF16) +#define MAT_TYPE bfloat16_t +#else +#define MAT_TYPE FLOAT_TYPE +#endif + #ifdef MUL_MAT_ID layout (binding = 3) readonly buffer IDS {int data_ids[];}; @@ -271,8 +280,8 @@ void main() { // Manually partial unroll [[unroll]] for (uint j = 0; j < unroll_count; ++j) { - coopmat mat_a; - coopmat mat_b; + coopmat mat_a; + coopmat mat_b; coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose); @@ -286,8 +295,8 @@ void main() { store_scales(tid); } while (block_k < end_k) { - coopmat mat_a; - coopmat mat_b; + coopmat mat_a; + coopmat mat_b; coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose); @@ -310,8 +319,8 @@ void main() { // Manually partial unroll [[unroll]] for (uint j = 0; j < unroll_count; ++j) { - coopmat mat_a; - coopmat mat_b; + coopmat mat_a; + coopmat mat_b; coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose); @@ -325,8 +334,8 @@ void main() { store_scales(tid); } while (block_k < end_k) { - coopmat mat_a; - coopmat mat_b; + coopmat mat_a; + coopmat mat_b; coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose); @@ -350,8 +359,8 @@ void main() { // Manually partial unroll [[unroll]] for (uint j = 0; j < unroll_count; ++j) { - coopmat mat_a; - coopmat mat_b; + coopmat mat_a; + coopmat mat_b; coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose); @@ -365,8 +374,8 @@ void main() { store_scales(tid); } while (block_k < end_k) { - coopmat mat_a; - coopmat mat_b; + coopmat mat_a; + coopmat mat_b; coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose); @@ -405,8 +414,8 @@ void main() { fetch_scales(ir * BM, pos_a, stride_a, block_k + BK, tid, false); } - coopmat mat_a; - coopmat mat_b; + coopmat mat_a; + coopmat mat_b; coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); #ifdef MUL_MAT_ID diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp b/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp new file mode 100644 index 0000000000000..fd0ba401feb0c --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp @@ -0,0 +1,7 @@ +#version 460 + +#extension GL_EXT_bfloat16 : require + +void main() +{ +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index f5b29bfb13a66..3bde717832b45 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -33,6 +33,19 @@ #endif #endif +#if defined(DATA_A_BF16) +#define QUANT_K 1 +#define QUANT_R 1 + +#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 +#define A_TYPE uint16_t +#elif LOAD_VEC_A == 4 +#define A_TYPE u16vec4 +#elif LOAD_VEC_A == 8 +#error unsupported +#endif +#endif + #define QUANT_K_Q4_0 32 #define QUANT_R_Q4_0 2 @@ -1343,4 +1356,18 @@ void init_iq_shmem(uvec3 wgsize) } #endif +// returns the bfloat value in the low 16b. +// See ggml_compute_fp32_to_bf16 +uint32_t fp32_to_bf16(float f) +{ + uint32_t u = floatBitsToUint(f); + u = (u + (0x7fff + ((u >> 16) & 1))) >> 16; + return u; +} + +float bf16_to_fp32(uint32_t u) +{ + return uintBitsToFloat(u << 16); +} + #endif // !defined(GGML_TYPES_COMP) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index cf74625cc56d5..3b28578545ed5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -63,7 +63,8 @@ const std::vector type_names = { "iq3_xxs", "iq3_s", "iq4_xs", - "iq4_nl" + "iq4_nl", + "bf16", }; namespace { @@ -296,7 +297,6 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; std::map base_dict = { - {"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}, {"FLOAT_TYPE_VEC2", (coopmat2 || fp16) ? "f16vec2" : "vec2"}, }; std::string shader_name = "matmul"; @@ -318,12 +318,45 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool const std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp"; + auto const &FLOAT_TYPE = [&](const std::string &t) -> std::string { + if (t == "bf16") { + // scalar path promotes to float + if (!coopmat && !coopmat2) { + return "float"; + } + return "bfloat16_t"; + } + if (coopmat2 || fp16) { + return "float16_t"; + } + return "float"; + }; + // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + + string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + + // bf16 + { + std::string load_vec_a_unaligned = "1"; + // For aligned matmul loads + std::string load_vec_a = coopmat2 ? "1" : "4"; + + // scalar path promotes to float + std::string to_float_type = (coopmat || coopmat2) ? "uintBitsToBFloat16EXT" : "bf16_to_fp32"; - string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + // If bfloat16 is not supported, then only compile the scalar (promote to fp32) shader +#if !defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (!(coopmat || coopmat2)) +#endif + { + string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_bf16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc); + } + } for (const auto& tname : type_names) { std::string load_vec_quant = "2"; @@ -332,26 +365,30 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl")) load_vec_quant = "4"; + if (tname == "bf16") { + continue; + } + std::string data_a_key = "DATA_A_" + to_uppercase(tname); // For unaligned, load one at a time for f32/f16, or two at a time for quants - std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : load_vec_quant; + std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? "1" : load_vec_quant; // For aligned matmul loads - std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : load_vec_quant; + std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? load_vec : load_vec_quant; // don't generate f32 variants for coopmat2 if (!coopmat2) { - string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); } if (tname != "f16" && tname != "f32") { - string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); } #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (!coopmat && !coopmat2 && !matmul_id && (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "q8_0")) { - string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc); } #endif } @@ -393,6 +430,7 @@ void process_shaders() { if (tname == "f32") { continue; } + if (tname == "bf16") continue; if (tname == "f16") { string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", @@ -417,12 +455,12 @@ void process_shaders() { string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); // Dequant shaders - if (tname != "f16") { + if (tname != "f16" && tname != "bf16") { string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}})); } if (!string_ends_with(tname, "_k")) { - shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp"; + shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? "get_rows.comp" : "get_rows_quant.comp"; if (tname == "f16") { string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); @@ -447,9 +485,11 @@ void process_shaders() { string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + string_to_spv("cpy_f32_bf16","copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}}); string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + string_to_spv("contig_cpy_f32_bf16","contig_copy.comp",{{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}}); for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 326ccdb071a79..a2540bd93fd91 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -231,6 +231,7 @@ class ClipVision: BLOCK_COUNT = "clip.vision.block_count" IMAGE_MEAN = "clip.vision.image_mean" IMAGE_STD = "clip.vision.image_std" + SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size" USE_GELU = "clip.use_gelu" USE_SILU = "clip.use_silu" @@ -491,6 +492,7 @@ class MODEL_TENSOR(IntEnum): V_ENC_FFN_DOWN = auto() V_PRE_NORM = auto() V_POST_NORM = auto() + V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 V_MM_SOFT_EMB_NORM = auto() # gemma3 V_RESMPL_POS_EMBD_K = auto() # minicpmv @@ -505,6 +507,7 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_PROJ = auto() # minicpmv V_RESMPL_QUERY = auto() # minicpmv V_TOK_EMBD_IMG_BREAK = auto() # pixtral + V_MM_PATCH_MERGER = auto() # mistral small 3.1 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -747,6 +750,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", MODEL_TENSOR.V_POST_NORM: "v.post_ln", MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", @@ -760,6 +764,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral + MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -783,6 +788,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_PRE_NORM, MODEL_TENSOR.V_POST_NORM, MODEL_TENSOR.V_MM_INP_PROJ, + MODEL_TENSOR.V_MM_INP_NORM, MODEL_TENSOR.V_MM_SOFT_EMB_NORM, MODEL_TENSOR.V_RESMPL_POS_EMBD_K, MODEL_TENSOR.V_RESMPL_ATTN_Q, @@ -796,6 +802,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_PROJ, MODEL_TENSOR.V_RESMPL_QUERY, MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, + MODEL_TENSOR.V_MM_PATCH_MERGER, ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index f22a6d4a3472b..a30c49e32b351 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -972,6 +972,9 @@ def add_vision_image_mean(self, values: Sequence[float]) -> None: def add_vision_image_std(self, values: Sequence[float]) -> None: self.add_array(Keys.ClipVision.IMAGE_STD, values) + def add_vision_spatial_merge_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SPATIAL_MERGE_SIZE, value) + def add_vision_use_gelu(self, value: bool) -> None: self.add_bool(Keys.ClipVision.USE_GELU, value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 311d1ff69c799..2f6326104ffa7 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1001,6 +1001,10 @@ class TensorNameMap: "multi_modal_projector.mm_input_projection", ), + MODEL_TENSOR.V_MM_INP_NORM: ( + "multi_modal_projector.norm", + ), + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( "multi_modal_projector.mm_soft_emb_norm", ), @@ -1052,6 +1056,10 @@ class TensorNameMap: MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: ( "v.token_embd.img_break", # for pixtral, this is a generated vector ), + + MODEL_TENSOR.V_MM_PATCH_MERGER: ( + "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 + ), } # architecture-specific block mappings diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 6205fe88d7239..8c599cf9eab49 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -19,9 +19,9 @@ # Properties by which to differentiate results per commit: KEY_PROPERTIES = [ - "cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch", - "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload", - "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen" + "cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type", + "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", + "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth" ] # Properties that are boolean and are converted to Yes/No for the table: @@ -30,11 +30,11 @@ # Header names for the table: PRETTY_NAMES = { "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers", - "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]", - "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", - "embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll", - "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU", - "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap", + "tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]", + "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings", + "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll", "n_threads": "Threads", "type_k": "K type", "type_v": "V type", + "use_mmap": "Use mmap", "no_kv_offload": "NKVO", "split_mode": "Split mode", "main_gpu": "Main GPU", "tensor_split": "Tensor split", + "flash_attn": "FlashAttention", } DEFAULT_SHOW = ["model_type"] # Always show these properties by default. @@ -281,12 +281,12 @@ def get_rows(properties): The returned rows are unique in terms of property combinations. """ select_string = ", ".join( - [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"]) + [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"]) equal_string = " AND ".join( [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [ f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"] ) - group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt"]) + group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"]) query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} " f"GROUP BY {group_order_string} ORDER BY {group_order_string};") return cursor.execute(query).fetchall() @@ -309,7 +309,7 @@ def get_rows(properties): rows_full = get_rows(KEY_PROPERTIES) properties_different = [] for i, kp_i in enumerate(KEY_PROPERTIES): - if kp_i in DEFAULT_SHOW or kp_i == "n_prompt" or kp_i == "n_gen": + if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]: continue for row_full in rows_full: if row_full[i] != rows_full[0][i]: @@ -340,17 +340,20 @@ def get_rows(properties): table = [] for row in rows_show: - n_prompt = int(row[-4]) - n_gen = int(row[-3]) + n_prompt = int(row[-5]) + n_gen = int(row[-4]) + n_depth = int(row[-3]) if n_prompt != 0 and n_gen == 0: test_name = f"pp{n_prompt}" elif n_prompt == 0 and n_gen != 0: test_name = f"tg{n_gen}" else: test_name = f"pp{n_prompt}+tg{n_gen}" + if n_depth != 0: + test_name = f"{test_name}@d{n_depth}" # Regular columns test name avg t/s values Speedup # VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV - table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])]) + table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])]) # Some a-posteriori fixes to make the table contents prettier: for bool_property in BOOL_PROPERTIES: @@ -376,7 +379,7 @@ def get_rows(properties): for gns in GPU_NAME_STRIP: row_table[ip] = row_table[ip].replace(gns, "") - gpu_names = row_table[ip].split("/") + gpu_names = row_table[ip].split(", ") num_gpus = len(gpu_names) all_names_the_same = len(set(gpu_names)) == 1 if len(gpu_names) >= 2 and all_names_the_same: diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 41feffca9212d..433cfab7ffae9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -13bcf9ce50651a8b4238ec6d136f46f2c1b23b6f +f3a375f20bf56860b30e7c511d03593a1e393345 diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 735d2619c928f..9b56614951879 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -454,7 +454,7 @@ int32_t llm_chat_apply_template( ss << "<|" << role << "|>" << "\n" << message->content; } if (add_ass) { - ss << "<|assistant|>"; + ss << "<|assistant|>\n"; } } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e49225aa22433..5a2eef9b784a1 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -114,7 +114,7 @@ llama_context::llama_context( } if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", __func__, n_ctx_per_seq, hparams.n_ctx_train); } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 822e2bb2cf018..51092a128c5c6 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_335M: return "335M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; + case LLM_TYPE_475M: return "475M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; @@ -707,7 +708,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { - type = LLM_TYPE_137M; + if (arch == LLM_ARCH_NOMIC_BERT) { + type = LLM_TYPE_137M; + } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { + type = LLM_TYPE_475M; + } } } break; case LLM_ARCH_BLOOM: diff --git a/src/llama-model.h b/src/llama-model.h index 95eca00266a4b..34aac337cff27 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -36,6 +36,7 @@ enum llm_type { LLM_TYPE_335M, LLM_TYPE_410M, LLM_TYPE_450M, + LLM_TYPE_475M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d70acb7719435..767498ef39dd5 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1981,7 +1981,7 @@ struct test_mul_mat : public test_case { const std::array bs; // dims 3 and 4 const std::array nr; // repeat in dims 3 and 4 const std::array per; // permutation of dimensions - const bool v; // whether a is a non-contiguous view + const bool v; // whether a and b are non-contiguous views std::string vars() override { return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v); @@ -2042,12 +2042,15 @@ struct test_mul_mat : public test_case { } else { if (v) { - a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]); - a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0); + a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]); + b = ggml_new_tensor_4d(ctx, type_b, k*2, n, bs[0]*nr[0], bs[1]*nr[1]); + + a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0); + b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0); } else { a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]); + b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); } - b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); if (!ggml_is_quantized(type_a)) { if (bs[1] == 1 && nr[1] == 1) { ggml_set_param(ctx, a); @@ -4184,6 +4187,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + + // test cases with large ne00/ne10 to cover stream-k fixup + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 1024, {3, 2}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 1024, {3, 2}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, {3, 2}, {1, 1})); } } for (ggml_type type_a : other_types) { diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 85d89843d6d96..d217069575cdf 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -181,8 +181,8 @@ int main(void) { }, { /* .name= */ "ChatGLM4", - /* .template_str= */ U8C("[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}"), - /* .expected_output= */ "[gMASK]<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", + /* .template_str= */ U8C("[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}"), + /* .expected_output= */ "[gMASK]<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>\n", /* .expected_output_jinja= */ "", /* .bos_token= */ "", /* .eos_token= */ "", diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index db01059119e9b..a284a1f0c5e31 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "ggml-cpu.h" #include "llama.h" #include "common.h"