2 changes: 1 addition & 1 deletion common/arg.cpp
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
43 changes: 34 additions & 9 deletions convert_hf_to_gguf.py
@@ -29,12 +29,29 @@
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
from mistral_common.tokens.tokenizers.base import TokenizerVersion
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer,
)

try:
from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
SentencePieceTokenizer,
)

_mistral_common_installed = True
_mistral_import_error_msg = ""
except ImportError:
_MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
_MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

_mistral_common_installed = False
TokenizerVersion = None
Tekkenizer = None
SentencePieceTokenizer = None
_mistral_import_error_msg = (
"Mistral format requires `mistral-common` to be installed. Please run "
"`pip install mistral-common[image,audio]` to install it."
)


logger = logging.getLogger("hf-to-gguf")
@@ -107,6 +124,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
type(self) is MmprojModel:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

if self.is_mistral_format and not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)

self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
@@ -1363,8 +1383,8 @@ def set_gguf_parameters(self):
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))

# preprocessor config
image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]

self.gguf_writer.add_vision_image_mean(image_mean)
self.gguf_writer.add_vision_image_std(image_std)
@@ -2033,6 +2053,9 @@ def __init__(self, *args, **kwargs):
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

def _set_vocab_mistral(self):
if not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)

vocab = MistralVocab(self.dir_model)
logger.info(
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
@@ -9212,7 +9235,7 @@ class MistralModel(LlamaModel):

@staticmethod
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
assert TokenizerVersion is not None, "mistral_common is not installed"
assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
)
@@ -9594,6 +9617,8 @@ def main() -> None:
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")

is_mistral_format = args.mistral_format
if is_mistral_format and not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
disable_mistral_community_chat_template = args.disable_mistral_community_chat_template

with torch.inference_mode():
6 changes: 3 additions & 3 deletions ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);

if (ops.size() == topk_moe_ops_with_norm.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx+8];

@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}

if (ops.size() == topk_moe_ops.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx+4];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}

if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx + 5];

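Note on the three hunks above: each fusion branch now passes the caller-supplied `ops` list to ggml_can_fuse_subgraph() instead of the canonical pattern constant it only compares sizes with, so the subgraph check runs against the ops that were actually requested. A hedged sketch of one branch factored into a helper — the helper name and the extra element-wise comparison are illustrative additions, not PR code:

// Assumes the declarations visible inside ggml-cuda.cu (ggml_cgraph, ggml_op,
// ggml_can_fuse_subgraph from ggml's internal headers).
#include <algorithm>
#include <initializer_list>

static bool can_fuse_topk_moe(const ggml_cgraph * cgraph, int node_idx,
                              std::initializer_list<enum ggml_op> ops,      // what the caller asked to fuse
                              std::initializer_list<enum ggml_op> pattern,  // e.g. topk_moe_ops_with_norm
                              std::initializer_list<int> out_nodes) {       // e.g. { node_idx + 3, node_idx + 8 }
    return ops.size() == pattern.size() &&
           std::equal(ops.begin(), ops.end(), pattern.begin()) &&     // same ops, not just the same count
           ggml_can_fuse_subgraph(cgraph, node_idx, ops, out_nodes);  // validate the caller's list, as in the fix
}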
156 changes: 115 additions & 41 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -30,6 +30,9 @@
#include <regex>

#include <sycl/sycl.hpp>
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
#endif
#include <sycl/half_type.hpp>

#include "ggml-sycl.h"
@@ -54,6 +57,7 @@ int g_ggml_sycl_disable_optimize = 0;
int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;

static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@@ -237,7 +241,20 @@ static void ggml_check_sycl() try {
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/

// Currently, we only use async malloc / free when graphs are enabled, as async allocations are required for
// the calls to be recorded properly. As this SYCL extension matures, it may be beneficial to enable it as the
// default path and use it in other places as well.
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
if (g_ggml_sycl_use_async_mem_op) {
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
g_ggml_sycl_use_async_mem_op = 0;
break;
}
}
}
#endif
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
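The detection block added above turns the async path on only when graphs are enabled and every device reports the async-alloc aspect. A minimal standalone sketch of that capability probe, written against plain SYCL rather than dpct::dev_mgr (the function name is illustrative; it needs a DPC++ compiler that ships the sycl_ext_oneapi_async_memory_alloc extension):

#include <sycl/sycl.hpp>

// Returns true only if every visible device can use stream-ordered (async) USM
// allocations, mirroring the per-device loop in ggml_check_sycl() above.
static bool all_devices_support_async_alloc() {
    for (const sycl::device & dev : sycl::device::get_devices()) {
        if (!dev.has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
            return false;  // a single device without the aspect disables the async path
        }
    }
    return true;
}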
@@ -3031,19 +3048,51 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
}
}

// Helper functions to unify device memory allocation for both async and sync paths
static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size) {
bool use_async = g_ggml_sycl_use_async_mem_op;
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
if (use_async) {
return syclex::async_malloc(*stream, sycl::usm::alloc::device, size);
}
#else
// If the async allocation extension is not available, use_async should always be false.
GGML_ASSERT(!use_async);
#endif
return sycl::malloc(size, *stream, sycl::usm::alloc::device);
}

static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
bool use_async = g_ggml_sycl_use_async_mem_op;
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
if (use_async) {
syclex::async_free(*stream, ptr);
return;
}
#else
// If the async allocation extension is not available, use_async should always be false.
GGML_ASSERT(!use_async);
#endif
sycl::free(ptr, *stream);
}

static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
dpct::queue_ptr stream) {
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
.wait()));
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}

GGML_ASSERT((size % sizeof(block_q4_0) == 0));
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
int offset_blks = offset / sizeof(block_q4_0);
auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

stream->parallel_for(
auto reorder_event = stream->parallel_for(
size / sizeof(block_q4_0),
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
const block_q4_0* x = (const block_q4_0*)tmp_buf;
@@ -3054,9 +3103,11 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
}
*(d_ptr + ib) = x[ib].d;
}).wait_and_throw();

sycl::free(tmp_buf, *stream);
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
}
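The reorder functions in this file all follow the same shape after this change: allocate scratch through sycl_ext_malloc_device(), enqueue the copy and the reorder kernel, wait only on the legacy (non-async) path, then release through sycl_ext_free(). Condensed into a hedged, generic sketch — the wrapper below is illustrative, not PR code:

// Uses the helpers and the g_ggml_sycl_use_async_mem_op flag introduced above.
// 'submit' is any callable that enqueues the memcpy + reorder kernel and returns
// the last event of that chain.
template <typename SubmitFn>
static void with_reorder_scratch(dpct::queue_ptr stream, size_t size, SubmitFn submit) {
    uint8_t * tmp = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
    sycl::event last = submit(tmp);
    if (!g_ggml_sycl_use_async_mem_op) {
        last.wait_and_throw();        // legacy path: tmp must be idle before the blocking free
    }
    sycl_ext_free(stream, tmp);       // async path: the free is stream-ordered, no host wait needed
}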

static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
Expand All @@ -3065,14 +3116,19 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d

const int nblocks = size / sizeof(block_q4_K);

auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}

auto * qs_ptr = data_device;
auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);

stream->parallel_for(nblocks, [=](auto i) {
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q4_K * x = (const block_q4_K *) tmp_buf;
const int ib = i;

@@ -3085,9 +3141,11 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
}

dm_ptr[ib] = x[ib].dm;
}).wait_and_throw();

sycl::free(tmp_buf, *stream);
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
}

static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
Expand All @@ -3096,42 +3154,46 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d

const int nblocks = size / sizeof(block_q6_K);

auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}

auto * ql_ptr = data_device;
auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);

stream
->parallel_for(nblocks,
[=](auto i) {
const block_q6_K * x = (const block_q6_K *) tmp_buf;
const int ib = i;

const uint8_t * ql = x[ib].ql;
const uint8_t * qh = x[ib].qh;
uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q6_K * x = (const block_q6_K *) tmp_buf;
const int ib = i;

for (int j = 0; j < QK_K / 2; ++j) {
base_ql_ptr[j] = ql[j];
}
for (int j = 0; j < QK_K / 4; ++j) {
base_qh_ptr[j] = qh[j];
}
const uint8_t * ql = x[ib].ql;
const uint8_t * qh = x[ib].qh;
uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;

for (int j = 0; j < QK_K / 16; ++j) {
base_scales_ptr[j] = x[ib].scales[j];
}
for (int j = 0; j < QK_K / 2; ++j) {
base_ql_ptr[j] = ql[j];
}
for (int j = 0; j < QK_K / 4; ++j) {
base_qh_ptr[j] = qh[j];
}

dm_ptr[ib] = x[ib].d;
})
.wait_and_throw();
for (int j = 0; j < QK_K / 16; ++j) {
base_scales_ptr[j] = x[ib].scales[j];
}

sycl::free(tmp_buf, *stream);
dm_ptr[ib] = x[ib].d;
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
}

static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
@@ -4056,6 +4118,18 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
ggml_op_name(node_op));
return false;
case GGML_OP_MUL_MAT:
// We cannot use graphs with ggml_sycl_mul_mat() when the SYCL async memory allocation extension is not
// available: the reorder path then falls back to plain SYCL malloc / free and host-side waits, none of which
// are supported while recording to a graph.
if (!g_ggml_sycl_use_async_mem_op) {
GGML_LOG_INFO(
"%s: disabling SYCL graphs due to unsupported node type when using a compiler without the "
"oneAPI async memory allocation extension "
"%s\n",
__func__, ggml_op_name(node_op));
return false;
}
}
}
return true;
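The new GGML_OP_MUL_MAT case ties back to the reorder path above: without the async-alloc extension it falls back to plain sycl::malloc / sycl::free and host-side waits, none of which may be issued while a queue is being recorded into a SYCL graph. A hedged sketch of the two shapes (function names are illustrative, not backend code):

#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

// Not graph-safe: host-blocking waits plus immediate malloc/free from the host thread.
// This is what the reorder path uses without the extension, hence the new MUL_MAT case.
static void reorder_scratch_sync(sycl::queue & q, const void * src, size_t size) {
    void * tmp = sycl::malloc_device(size, q);
    q.memcpy(tmp, src, size).wait();   // host wait: not allowed while recording
    /* ... enqueue reorder kernels reading tmp ... */
    q.wait();                          // make sure tmp is idle
    sycl::free(tmp, q);                // immediate free from the host: also not recordable
}

// Graph-safe: stream-ordered allocation and free, no host synchronization.
static void reorder_scratch_async(sycl::queue & q, const void * src, size_t size) {
    void * tmp = syclex::async_malloc(q, sycl::usm::alloc::device, size);
    q.memcpy(tmp, src, size);          // ordering is carried by the queue / graph dependencies
    /* ... enqueue reorder kernels reading tmp ... */
    syclex::async_free(q, tmp);
}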
8 changes: 4 additions & 4 deletions gguf-py/gguf/vocab.py
@@ -14,12 +14,12 @@
SentencePieceProcessor = None

try:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from mistral_common.tokens.tokenizers.utils import (
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
_filter_valid_tokenizer_files,
)
from mistral_common.tokens.tokenizers.sentencepiece import (
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
SentencePieceTokenizer,
)
except ImportError:
2 changes: 0 additions & 2 deletions requirements/requirements-convert_hf_to_gguf.txt
@@ -1,5 +1,3 @@
mistral-common>=1.8.3

-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu

5 changes: 5 additions & 0 deletions tools/imatrix/CMakeLists.txt
@@ -6,3 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

if (CMAKE_SYSTEM_NAME MATCHES "AIX")
# AIX's flock() function comes from libbsd.a
target_link_libraries(${TARGET} PRIVATE -lbsd)
endif()
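For context on the AIX link flag: flock() is the BSD advisory-lock call, and on AIX the symbol lives in libbsd.a rather than libc. A hedged sketch of the kind of call site that pulls it in — the actual imatrix locking code is not part of this diff:

#include <fcntl.h>
#include <sys/file.h>
#include <unistd.h>

// Take a non-blocking exclusive advisory lock on a file; on AIX, flock() resolves
// against libbsd.a, which is why the CMake snippet above adds -lbsd.
static bool lock_file_exclusive(const char * path, int & fd_out) {
    int fd = open(path, O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        return false;
    }
    if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
        close(fd);
        return false;
    }
    fd_out = fd;  // keep the descriptor open for as long as the lock must be held
    return true;
}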