Commit 8b13f2d

fix conflict

2 parents: d9c6bf1 + 274ec65

60 files changed (+785, -684 lines)

.github/workflows/server.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -79,7 +79,7 @@ jobs:
       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4
         with:
-          node-version: 22
+          node-version: '22.11.0'
 
       - name: Verify bundled index.html
         id: verify_server_index_html
```

CMakeLists.txt

Lines changed: 3 additions & 5 deletions

```diff
@@ -46,11 +46,9 @@ if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()
 
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
+if (MSVC)
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
 endif()
 
 #
```
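
Side note on this change: MSVC's `/utf-8` flag is documented shorthand for `/source-charset:utf-8` plus `/execution-charset:utf-8`, so the two remaining options cover all four removed ones, and `if (MSVC)` also matches clang-cl, which accepts the same flag. A minimal sketch of why the execution charset matters; the literal and printed value here are illustrative, not from the commit:

```cpp
#include <cstdio>
#include <cstring>

int main() {
    // U+00E9 ('é') is non-ASCII; with /utf-8 the execution charset is UTF-8,
    // so the literal is stored as the two bytes 0xC3 0xA9.
    const char * s = "h\u00e9llo";
    printf("strlen = %zu\n", strlen(s));  // prints 6 under UTF-8, not 5
    return 0;
}
```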

CMakePresets.json

Lines changed: 12 additions & 0 deletions

```diff
@@ -31,6 +31,13 @@
     { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
     { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
 
+    {
+        "name": "x64-windows-llvm", "hidden": true,
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
+        }
+    },
+
     {
         "name": "arm64-windows-msvc", "hidden": true,
         "architecture": { "value": "arm64", "strategy": "external" },
@@ -70,6 +77,11 @@
     { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
     { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
 
+    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
+    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
+    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
+    { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
+
     { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
     { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
```

CODEOWNERS

Lines changed: 3 additions & 1 deletion

```diff
@@ -1,3 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
 
-ci/ @ggerganov
+/ci/ @ggerganov
+/.devops/ @ngxson
+/examples/server/ @ngxson
```

cmake/x64-windows-llvm.cmake

Lines changed: 11 additions & 0 deletions

```diff
@@ -0,0 +1,11 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR x86_64 )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( arch_c_flags "-march=native" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
+
```

common/arg.cpp

Lines changed: 15 additions & 8 deletions

```diff
@@ -591,7 +591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1711,6 +1711,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--no-webui"},
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2076,35 +2083,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2124,14 +2131,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }
```
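
Each new `.set_env()` call registers an environment-variable fallback for its flag, e.g. `LLAMA_ARG_DRAFT_MAX` or `LLAMA_ARG_MODEL_DRAFT` for `llama-server`. A hedged sketch of the general precedence pattern, not llama.cpp's actual implementation (`resolve_option` is an illustrative name):

```cpp
#include <cstdlib>
#include <optional>
#include <string>

// Illustrative only: an explicit CLI value wins, then the registered
// environment variable, then the built-in default.
static std::string resolve_option(const std::optional<std::string> & cli_value,
                                  const char * env_name,
                                  const std::string & def) {
    if (cli_value) {
        return *cli_value;
    }
    if (const char * env = std::getenv(env_name)) {
        return env;
    }
    return def;
}

int main() {
    // with no -md flag, the draft model may now come from the environment:
    //   LLAMA_ARG_MODEL_DRAFT=/models/draft.gguf llama-server ...
    std::string model_draft = resolve_option(std::nullopt, "LLAMA_ARG_MODEL_DRAFT", "");
    return 0;
}
```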

docs/build.md

Lines changed: 7 additions & 0 deletions

````diff
@@ -57,6 +57,13 @@ cmake --build build --config Release
 ```
 Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
 
+For building with ninja generator and clang compiler as default:
+-set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
+```bash
+cmake --preset x64-windows-llvm-release
+cmake --build build-x64-windows-llvm-release
+```
+
 ## BLAS Build
 
 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:
````

examples/CMakeLists.txt

Lines changed: 10 additions & 2 deletions

```diff
@@ -20,7 +20,12 @@ else()
     add_subdirectory(batched)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(gbnf-validator)
+
+    if (NOT WIN32)
+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
+        add_subdirectory(gbnf-validator)
+    endif()
+
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -51,7 +56,10 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(cvector-generator)
     add_subdirectory(export-lora)
-    add_subdirectory(quantize-stats)
+    if (NOT WIN32)
+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
+        add_subdirectory(quantize-stats)
+    endif()
     add_subdirectory(llava)
     if (GGML_RPC)
         add_subdirectory(rpc)
```
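
The comment in the diff is the whole story: on Windows, a DLL only exposes symbols that were explicitly exported, so examples calling internal (non-`LLAMA_API`) functions link on ELF platforms but not against llama.dll. A sketch of the usual export-macro pattern; it mirrors what llama.h does, but treat the exact macro logic here as an assumption:

```cpp
// Sketch of a typical Windows export macro (assumed, illustrative).
#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define LLAMA_API __declspec(dllexport)   // building the DLL
#        else
#            define LLAMA_API __declspec(dllimport)   // consuming the DLL
#        endif
#    else
#        define LLAMA_API __attribute__((visibility("default")))
#    endif
#else
#    define LLAMA_API
#endif

LLAMA_API int llama_public_fn(void);   // visible to users of llama.dll
int           llama_internal_fn(void); // unexported: Windows link error
```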

examples/gguf-split/gguf-split.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -287,7 +287,7 @@ struct split_strategy {
     }
 
     void print_info() {
-        printf("n_split: %ld\n", ctx_outs.size());
+        printf("n_split: %zu\n", ctx_outs.size());
         int i_split = 0;
         for (auto & ctx_out : ctx_outs) {
             // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@@ -297,7 +297,7 @@ struct split_strategy {
                 total_size += ggml_nbytes(t);
             }
             total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
             i_split++;
         }
     }
```
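
These printf fixes address a portability bug rather than style: `ctx_outs.size()` returns `size_t`, and on 64-bit Windows (LLP64) `long` is only 32 bits, so `%ld` does not match a 64-bit `size_t` there; `%zu` is the standard `size_t` specifier everywhere. A minimal illustration (the same reasoning applies to the llama-bench changes below):

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> v(3);
    // size() returns size_t. On LLP64 Windows, long is 32-bit, so "%ld"
    // mismatches the 64-bit size_t; "%zu" is correct on every platform.
    printf("n = %zu\n", v.size());
    return 0;
}
```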

examples/llama-bench/llama-bench.cpp

Lines changed: 5 additions & 5 deletions

```diff
@@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
     for (const auto & inst : params_instances) {
         params_idx++;
         if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
         }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
             }
             test_gen(ctx, 1, t.n_threads);
         }
@@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
 
         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                         i + 1, params.reps);
             }
             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                         i + 1, params.reps);
             }
             test_gen(ctx, t.n_gen, t.n_threads);
```
