
Commit ab72358

Merge pull request #302 from menloresearch/update-dev-from-master-2025-10-24-00-31
Sync master with upstream release b6829
2 parents 4f596b7 + 0bf47a1 commit ab72358

10 files changed, +202 -76 lines changed

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"

convert_hf_to_gguf.py

Lines changed: 34 additions & 9 deletions
@@ -29,12 +29,29 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab
-from mistral_common.tokens.tokenizers.base import TokenizerVersion
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-from mistral_common.tokens.tokenizers.sentencepiece import (
-    SentencePieceTokenizer,
-)
+
+try:
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
+        SentencePieceTokenizer,
+    )
+
+    _mistral_common_installed = True
+    _mistral_import_error_msg = ""
+except ImportError:
+    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+    _mistral_common_installed = False
+    TokenizerVersion = None
+    Tekkenizer = None
+    SentencePieceTokenizer = None
+    _mistral_import_error_msg = (
+        "Mistral format requires `mistral-common` to be installed. Please run "
+        "`pip install mistral-common[image,audio]` to install it."
+    )


 logger = logging.getLogger("hf-to-gguf")

@@ -107,6 +124,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                 type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

+        if self.is_mistral_format and not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         self.dir_model = dir_model
         self.ftype = ftype
         self.fname_out = fname_out

@@ -1363,8 +1383,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))

         # preprocessor config
-        image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
-        image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
+        image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
+        image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]

         self.gguf_writer.add_vision_image_mean(image_mean)
         self.gguf_writer.add_vision_image_std(image_std)

@@ -2033,6 +2053,9 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

     def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         vocab = MistralVocab(self.dir_model)
         logger.info(
             f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."

@@ -9212,7 +9235,7 @@ class MistralModel(LlamaModel):

     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
-        assert TokenizerVersion is not None, "mistral_common is not installed"
+        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
         assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
             f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
         )

@@ -9594,6 +9617,8 @@ def main() -> None:
         fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")

     is_mistral_format = args.mistral_format
+    if is_mistral_format and not _mistral_common_installed:
+        raise ImportError(_mistral_import_error_msg)
     disable_mistral_community_chat_template = args.disable_mistral_community_chat_template

     with torch.inference_mode():

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 3 additions & 3 deletions
@@ -2826,7 +2826,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);

     if (ops.size() == topk_moe_ops_with_norm.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+8];

@@ -2836,7 +2836,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }

     if (ops.size() == topk_moe_ops.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
         ggml_tensor * weights = cgraph->nodes[node_idx+4];
         if (ggml_cuda_should_use_topk_moe(softmax, weights)) {

@@ -2845,7 +2845,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     }

     if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
-        ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_delayed_softmax, { node_idx + 2, node_idx + 5 })) {
+        ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
         ggml_tensor * weights = cgraph->nodes[node_idx + 5];

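Note on this fix: `ops` is the candidate op list selected a few lines earlier (with norm, without norm, or with delayed softmax), but the old code size-checked `ops` while fuse-checking a hard-coded template list, so the two checks could consult different lists. Below is a standalone sketch of that pattern in plain C++ — `can_fuse` is a hypothetical stand-in for `ggml_can_fuse_subgraph`, and the op sequences are illustrative, not the real ggml op lists:

#include <cstdio>
#include <vector>

enum class Op { SOFT_MAX, ARGSORT, VIEW, GET_ROWS, RESHAPE, SUM_ROWS, DIV };

// Stand-in for ggml_can_fuse_subgraph(): succeeds only if the graph slice
// starting at idx matches the candidate op sequence exactly.
static bool can_fuse(const std::vector<Op> & graph, size_t idx, const std::vector<Op> & ops) {
    if (idx + ops.size() > graph.size()) {
        return false;
    }
    for (size_t i = 0; i < ops.size(); ++i) {
        if (graph[idx + i] != ops[i]) {
            return false;
        }
    }
    return true;
}

int main() {
    // Two templates of equal length, so a size check alone cannot tell them apart.
    const std::vector<Op> with_norm = { Op::SOFT_MAX, Op::ARGSORT, Op::VIEW, Op::GET_ROWS, Op::SUM_ROWS, Op::DIV };
    const std::vector<Op> no_norm   = { Op::SOFT_MAX, Op::ARGSORT, Op::VIEW, Op::GET_ROWS, Op::RESHAPE, Op::DIV };

    const std::vector<Op> graph = no_norm;   // ops actually present in the graph
    const std::vector<Op> & ops = no_norm;   // candidate list selected earlier

    // Old pattern: size-check `ops`, then fuse-check a fixed template.
    bool buggy = ops.size() == with_norm.size() && can_fuse(graph, 0, with_norm);
    // Fixed pattern: both checks consult the same `ops` list.
    bool fixed = ops.size() == with_norm.size() && can_fuse(graph, 0, ops);

    printf("buggy: %d, fixed: %d\n", buggy, fixed);   // prints "buggy: 0, fixed: 1"
    return 0;
}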
ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 115 additions & 41 deletions
@@ -30,6 +30,9 @@
 #include <regex>

 #include <sycl/sycl.hpp>
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+#    include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
+#endif
 #include <sycl/half_type.hpp>

 #include "ggml-sycl.h"

@@ -54,6 +57,7 @@ int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;
+int g_ggml_sycl_use_async_mem_op = 0;

 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};

@@ -237,7 +241,20 @@ static void ggml_check_sycl() try {
     fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */
-
+    // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be
+    // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in
+    // other places.
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
+    if (g_ggml_sycl_use_async_mem_op) {
+        for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
+            if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
+                g_ggml_sycl_use_async_mem_op = 0;
+                break;
+            }
+        }
+    }
+#endif
     if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                             dpct::dev_mgr::instance().device_count()) != 0) {
         initialized = true;

@@ -3031,19 +3048,51 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }

+// Helper functions to unify device memory allocation for both async and sync paths
+static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size) {
+    bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        return syclex::async_malloc(*stream, sycl::usm::alloc::device, size);
+    }
+#else
+    // If async allocation extension is not available, use_async should always be false.
+    GGML_ASSERT(!use_async);
+#endif
+    return sycl::malloc(size, *stream, sycl::usm::alloc::device);
+}
+
+static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
+    bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        syclex::async_free(*stream, ptr);
+        return;
+    }
+#else
+    // If async allocation extension is not available, use_async should always be false.
+    GGML_ASSERT(!use_async);
+#endif
+    sycl::free(ptr, *stream);
+}
+
 static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
                             dpct::queue_ptr stream) {
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-                            .wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
+
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
     auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

-    stream->parallel_for(
+    auto reorder_event = stream->parallel_for(
         size / sizeof(block_q4_0),
         [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             const block_q4_0* x = (const block_q4_0*)tmp_buf;

@@ -3054,9 +3103,11 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
                 *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
             }
             *(d_ptr + ib) = x[ib].d;
-        }).wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+        });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_ext_free(stream, tmp_buf);
 }

 static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {

@@ -3065,14 +3116,19 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
     const int nblocks = size / sizeof(block_q4_K);

-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }

     auto * qs_ptr = data_device;
     auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
     auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);

-    stream->parallel_for(nblocks, [=](auto i) {
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
         const block_q4_K * x = (const block_q4_K *) tmp_buf;
         const int ib = i;

@@ -3085,9 +3141,11 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
         }

         dm_ptr[ib] = x[ib].dm;
-    }).wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_ext_free(stream, tmp_buf);
 }

 static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {

@@ -3096,42 +3154,46 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
     const int nblocks = size / sizeof(block_q6_K);

-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }

     auto * ql_ptr = data_device;
     auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
     auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
     sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);

-    stream
-        ->parallel_for(nblocks,
-                       [=](auto i) {
-                           const block_q6_K * x = (const block_q6_K *) tmp_buf;
-                           const int ib = i;
-
-                           const uint8_t * ql = x[ib].ql;
-                           const uint8_t * qh = x[ib].qh;
-                           uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
-                           uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
-                           uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
-
-                           for (int j = 0; j < QK_K / 2; ++j) {
-                               base_ql_ptr[j] = ql[j];
-                           }
-                           for (int j = 0; j < QK_K / 4; ++j) {
-                               base_qh_ptr[j] = qh[j];
-                           }
-
-                           for (int j = 0; j < QK_K / 16; ++j) {
-                               base_scales_ptr[j] = x[ib].scales[j];
-                           }
-
-                           dm_ptr[ib] = x[ib].d;
-                       })
-        .wait_and_throw();
-
-    sycl::free(tmp_buf, *stream);
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
+        const block_q6_K * x = (const block_q6_K *) tmp_buf;
+        const int ib = i;
+
+        const uint8_t * ql = x[ib].ql;
+        const uint8_t * qh = x[ib].qh;
+        uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
+        uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
+        uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            base_ql_ptr[j] = ql[j];
+        }
+        for (int j = 0; j < QK_K / 4; ++j) {
+            base_qh_ptr[j] = qh[j];
+        }
+
+        for (int j = 0; j < QK_K / 16; ++j) {
+            base_scales_ptr[j] = x[ib].scales[j];
+        }
+
+        dm_ptr[ib] = x[ib].d;
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
+    sycl_ext_free(stream, tmp_buf);
 }

 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {

@@ -4056,6 +4118,18 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
                 GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
                               ggml_op_name(node_op));
                 return false;
+            case GGML_OP_MUL_MAT:
+                // We cannot use graphs with ggml_sycl_mul_mat() when SYCL async memory allocation extensions are not available,
+                // as SYCL malloc / free and host wait calls are not supported when recording to a graph which are all present
+                // in reordering.
+                if (!g_ggml_sycl_use_async_mem_op) {
+                    GGML_LOG_INFO(
+                        "%s: disabling SYCL graphs due to unsupported node type when using a compiler without the "
+                        "oneAPI async memory allocation extension "
+                        "%s\n",
+                        __func__, ggml_op_name(node_op));
+                    return false;
+                }
         }
     }
     return true;

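All three reorder_qw_* functions above now share one shape: allocate a temporary device buffer, enqueue the copy and the reorder kernel, and block on the host only when not recording to a graph, since host-side waits (and plain sycl::malloc/free) are not allowed during graph capture. A condensed sketch of that shape, assuming a recent DPC++ compiler with the oneAPI async-alloc extension; `use_async` stands in for `g_ggml_sycl_use_async_mem_op`, and `scale2x` is an illustrative kernel, not code from this PR:

#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

// data must be USM device- or shared-allocated so the kernel can access it.
void scale2x(sycl::queue & q, float * data, size_t n, bool use_async) {
    // Temporary buffer: graph-recordable async allocation, or immediate USM.
    float * tmp = use_async
        ? static_cast<float *>(syclex::async_malloc(q, sycl::usm::alloc::device, n * sizeof(float)))
        : sycl::malloc_device<float>(n, q);

    sycl::event copy = q.memcpy(tmp, data, n * sizeof(float));
    if (!use_async) {
        copy.wait();   // a host wait is illegal while recording to a graph
    }

    // Chain the kernel on the copy event so the async path stays ordered
    // even on an out-of-order queue.
    sycl::event kernel = q.parallel_for(sycl::range<1>(n), copy, [=](sycl::id<1> i) {
        data[i] = tmp[i] * 2.0f;
    });
    if (!use_async) {
        kernel.wait();
    }

    if (use_async) {
        syclex::async_free(q, tmp);   // enqueued, so it records into the graph
    } else {
        sycl::free(tmp, q);
    }
}

The check_graph_compatibility() hunk is the other half of the same contract: when the compiler lacks the extension, MUL_MAT nodes may still need the synchronous reorder path, so SYCL graphs are disabled rather than recording an invalid wait.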
gguf-py/gguf/vocab.py

Lines changed: 4 additions & 4 deletions
@@ -14,12 +14,12 @@
     SentencePieceProcessor = None

 try:
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import (
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
         _filter_valid_tokenizer_files,
     )
-    from mistral_common.tokens.tokenizers.sentencepiece import (
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
         SentencePieceTokenizer,
     )
 except ImportError:

requirements/requirements-convert_hf_to_gguf.txt

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-mistral-common>=1.8.3
-
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu

tools/imatrix/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -6,3 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # AIX's flock() function comes from libbsd.a
+    target_link_libraries(${TARGET} PRIVATE -lbsd)
+endif()
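For context on the new AIX branch: per the comment in the hunk, the imatrix target calls the BSD flock() function, and on AIX that symbol is provided by libbsd.a rather than libc, hence the extra target_link_libraries. A minimal sketch of the kind of call that creates the link dependency (illustrative, not the tool's actual locking code):

#include <cstdio>
#include <fcntl.h>      // open(), O_CREAT, O_RDWR
#include <sys/file.h>   // flock(), LOCK_EX, LOCK_UN
#include <unistd.h>     // close()

int main() {
    int fd = open("imatrix.lock", O_CREAT | O_RDWR, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (flock(fd, LOCK_EX) == 0) {   // on AIX this symbol resolves from -lbsd
        puts("lock acquired");
        flock(fd, LOCK_UN);
    }
    close(fd);
    return 0;
}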
