
Commit 92a0e6c

Merge branch 'master' into cmake-vulkan-test-shader-func

2 parents 9af47ae + 1e2809b commit 92a0e6c

153 files changed: +7807 additions, -2802 deletions


.github/workflows/build.yml

Lines changed: 1 addition & 2 deletions
@@ -15,7 +15,6 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
   LLAMA_LOG_COLORS: 1

@@ -308,7 +307,7 @@ jobs:
         run: |
           cd build
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 2700
+          ctest -L main --verbose --timeout 3600
 
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04

.github/workflows/docker.yml

Lines changed: 1 addition & 2 deletions
@@ -42,8 +42,7 @@ jobs:
           - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
           - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
           - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          # Note: the intel images are failing due to an out of disk space error
-          # - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }

.github/workflows/release.yml

Lines changed: 11 additions & 12 deletions
@@ -16,11 +16,6 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
 
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  contents: write # for creating release
-
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

@@ -416,28 +411,27 @@ jobs:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
           cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
 
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
 
       - name: Copy and pack Cuda runtime
-        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
         run: |
           echo "Cuda install location: ${{ env.CUDA_PATH }}"
          $dst='.\build\bin\cudart\'
           robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
           robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
+          7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
 
       - name: Upload Cuda runtime
         uses: actions/upload-artifact@v4
         with:
-          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
 
   windows-sycl:
     runs-on: windows-latest

@@ -646,6 +640,11 @@ jobs:
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 
+    # Fine-grant permission
+    # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+    permissions:
+      contents: write # for creating release
+
     runs-on: ubuntu-latest
 
     needs:

CMakeLists.txt

Lines changed: 0 additions & 17 deletions
@@ -252,20 +252,3 @@ configure_file(cmake/llama.pc.in
 
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
-    endforeach()
-endif()
-

README.md

Lines changed: 2 additions & 1 deletion
@@ -16,8 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
+- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim

build-xcframework.sh

Lines changed: 1 addition & 0 deletions
@@ -117,6 +117,7 @@ setup_framework_structure() {
     # Copy all required headers (common for all platforms)
     cp include/llama.h ${header_path}
     cp ggml/include/ggml.h ${header_path}
+    cp ggml/include/ggml-opt.h ${header_path}
     cp ggml/include/ggml-alloc.h ${header_path}
     cp ggml/include/ggml-backend.h ${header_path}
     cp ggml/include/ggml-metal.h ${header_path}

common/CMakeLists.txt

Lines changed: 26 additions & 2 deletions
@@ -119,8 +119,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.10:
-        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
+        # v0.7.19 (+ fancy-regex build fix):
+        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE

@@ -144,3 +144,27 @@ endif ()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Check if running in GitHub Actions
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # Copy all files from licenses/ to build/bin/
+    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            POST_BUILD
+            TARGET ${TARGET}
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
+    endforeach()
+endif()

common/arg.cpp

Lines changed: 21 additions & 6 deletions
@@ -40,7 +40,7 @@ using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_LLAVA,
-    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+    LLAMA_EXAMPLE_SERVER,
 };
 
 static std::string read_file(const std::string & fname) {

@@ -2204,32 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see tools/mtmd/README.md",
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
         {"--no-mmproj"},
         "explicitly disable multimodal projector, useful when using -hf",
         [](common_params & params) {
             params.no_mmproj = true;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
     add_opt(common_arg(
         {"--no-mmproj-offload"},
         "do not offload multimodal projector to GPU",
         [](common_params & params) {
             params.mmproj_use_gpu = false;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",

@@ -2436,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",

@@ -2627,6 +2635,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),

common/chat.cpp

Lines changed: 3 additions & 1 deletion
@@ -125,7 +125,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
             msgs.push_back(msg);
         }
     } catch (const std::exception & e) {
-        throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
+        // @ngxson : disable otherwise it's bloating the API response
+        // printf("%s\n", std::string("; messages = ") + messages.dump(2));
+        throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
     }
 
     return msgs;

common/common.cpp

Lines changed: 18 additions & 0 deletions
@@ -1113,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn  = params.flash_attn;
     cparams.no_perf     = params.no_perf;
+    cparams.op_offload  = !params.no_op_offload;
 
     if (params.reranking) {
         cparams.embeddings = true;

@@ -1564,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 
     return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result  = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
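The new `common_opt_dataset_init()` helper turns a flat token stream into (input, label) pairs: each datapoint is a window of `n_ctx` tokens, its label is the same window shifted by one token (next-token prediction), and consecutive windows start `stride` tokens apart. A minimal usage sketch follows; it is an assumption-labeled illustration (not from the commit) that relies on the existing `common_tokenize()` helper and omits model/context setup.

// Hedged usage sketch, not from this commit.
#include "common.h"

#include <string>
#include <vector>

static ggml_opt_dataset_t build_dataset(llama_context * ctx, const std::string & text) {
    // tokenize the whole training corpus (add BOS, no special-token parsing)
    std::vector<llama_token> tokens = common_tokenize(ctx, text, /*add_special=*/ true, /*parse_special=*/ false);

    // a stride of n_ctx/2 yields overlapping windows; the corpus must be long
    // enough that at least one full window of n_ctx+1 tokens fits
    const int64_t stride = llama_n_ctx(ctx) / 2;
    return common_opt_dataset_init(ctx, tokens, stride);
}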
