
Commit 5cd800b

Merge branch 'master' of https://github.com/ggml-org/llama.cpp into phi4_tools_clean

2 parents: a5d014b + 0fd8487

7 files changed: +98 −40 lines


.github/workflows/build.yml

Lines changed: 29 additions & 0 deletions
@@ -676,6 +676,35 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

+  macOS-latest-cmake-visionos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=visionOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
   macOS-latest-swift:
     runs-on: macos-latest

build-xcframework.sh

Lines changed: 4 additions & 4 deletions
@@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xros \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos --config Release -- -quiet

@@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xrsimulator \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos-sim --config Release -- -quiet

convert_hf_to_gguf.py

Lines changed: 16 additions & 21 deletions
@@ -180,7 +180,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"

@@ -1099,13 +1100,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

         tensors.append((self.map_tensor_name(name), data_torch))

-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors

@@ -2423,10 +2417,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

         tensors.append((new_name, data_torch))

-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors

@@ -2756,21 +2746,26 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)

+    _has_tok_embd = False
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

-        new_name = self.map_tensor_name(name)
-
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)

-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
+        new_name = self.map_tensor_name(name)

-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True

-        return tensors
+        return [(new_name, data_torch)]


 @Model.register("InternLM2ForCausalLM")

ggml/src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -325,6 +325,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
325325
target_link_libraries(ggml-base PRIVATE dl)
326326
endif()
327327

328+
if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
329+
target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
330+
endif()
331+
328332
if (BUILD_SHARED_LIBS)
329333
foreach (target ggml-base ggml)
330334
set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 21 additions & 11 deletions
@@ -8436,8 +8436,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

+    uint64_t total_mat_mul_bytes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+        if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
+            total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
+        }
     }
     if (ctx->device->need_compiles) {
         ggml_vk_load_shaders(ctx->device);

@@ -8458,17 +8462,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     bool first_node_in_batch = true; // true if next node will be first node in a batch
     int submit_node_idx = 0; // index to first node in a batch

-    // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
-    // Start with a smaller count to get work submitted right away, and increase it after each submit.
-    int nodes_per_submit = 20;
+    // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
+    // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
+    // (and scaled down based on model size, so smaller models submit earlier).
+    // Also submit at least every 100 nodes, in case there are workloads without as much matmul.
+    int nodes_per_submit = 100;
     int submitted_nodes = 0;
     int submit_count = 0;
+    uint64_t mul_mat_bytes = 0;
+    uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), total_mat_mul_bytes / 40u);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (first_node_in_batch) {
             submit_node_idx = i;
         }

-        bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
+        if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
+            mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
+        }
+
+        bool submit = (submitted_nodes >= nodes_per_submit) ||
+                      (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
+                      (i == last_node);

         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);

@@ -8485,13 +8499,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         if (submit) {
             first_node_in_batch = true;
             submitted_nodes = 0;
-            switch (submit_count) {
-                case 0:
-                    nodes_per_submit = 50;
-                    break;
-                default:
-                    nodes_per_submit = 100;
-                    break;
+            mul_mat_bytes = 0;
+            if (submit_count < 3) {
+                mul_mat_bytes_per_submit *= 2;
             }
             submit_count++;
         }
gguf-py/gguf/vocab.py

Lines changed: 6 additions & 1 deletion
@@ -154,7 +154,12 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
                 return True
             with open(tokenizer_config_file, encoding = 'utf-8') as f:
                 tokenizer_config = json.load(f)
-            chat_template = tokenizer_config.get('chat_template')
+            chat_template_alt = None
+            chat_template_file = path / 'chat_template.json'
+            if chat_template_file.is_file():
+                with open(chat_template_file, encoding = 'utf-8') as f:
+                    chat_template_alt = json.load(f).get('chat_template')
+            chat_template = tokenizer_config.get('chat_template', chat_template_alt)
             if chat_template is None or isinstance(chat_template, (str, list)):
                 self.chat_template = chat_template
             else:
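Some Hugging Face repos ship the chat template in a separate chat_template.json rather than in tokenizer_config.json; the change above picks it up as a fallback. Note that dict.get('chat_template', alt) only falls back when the key is absent, so a template in tokenizer_config.json still wins. A self-contained sketch of the same lookup order (hypothetical helper, not the gguf-py API):

import json
from pathlib import Path

# Sketch of the lookup order: prefer tokenizer_config.json, fall back to chat_template.json.
def load_chat_template(model_dir: Path):
    alt = None
    alt_file = model_dir / 'chat_template.json'
    if alt_file.is_file():
        alt = json.loads(alt_file.read_text(encoding='utf-8')).get('chat_template')
    cfg_file = model_dir / 'tokenizer_config.json'
    cfg = json.loads(cfg_file.read_text(encoding='utf-8')) if cfg_file.is_file() else {}
    return cfg.get('chat_template', alt)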

src/llama-model.cpp

Lines changed: 18 additions & 3 deletions
@@ -2020,7 +2020,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];

@@ -2381,7 +2386,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];

@@ -2407,7 +2417,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_CODESHELL:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if tok embd is NULL, init from output
+                    if (tok_embd == NULL) {
+                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
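On the loading side, output.weight becomes optional for these architectures: if it is absent from the GGUF (as it now will be for converted models with tied embeddings), the loader re-uses the token embedding as the output projection, and for CodeShell the reverse. A conceptual sketch of that fallback in Python pseudocode, since the real code goes through create_tensor with the TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED flags:

# Conceptual sketch only; the real loader uses create_tensor() with
# TENSOR_NOT_REQUIRED and TENSOR_DUPLICATED flags rather than a dict lookup.
def resolve_output(tensors: dict):
    output = tensors.get("output.weight")       # optional: may be missing in the GGUF
    if output is None:
        output = tensors["token_embd.weight"]   # fall back to the shared embedding
    return output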
