
Commit 52e69b5

Merge remote-tracking branch 'origin/master' into sl/dl-backend-6
2 parents 478194b + cc98896

39 files changed: +5409 −27730 lines

.github/workflows/server.yml

Lines changed: 16 additions & 10 deletions
@@ -76,20 +76,26 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt
 
-      - name: Verify server deps
-        id: verify_server_deps
+      # Setup nodejs (to be used for verifying bundled index.html)
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Verify bundled index.html
+        id: verify_server_index_html
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd examples/server
-          git ls-files --others --modified
+          cd examples/server/webui
           git status
-          ./deps.sh
+          npm ci
+          npm run build
           git status
-          not_ignored_files="$(git ls-files --others --modified)"
-          echo "Modified files: ${not_ignored_files}"
-          if [ -n "${not_ignored_files}" ]; then
-            echo "Repository is dirty or server deps are not built as expected"
-            echo "${not_ignored_files}"
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Repository is dirty or server/webui is not built as expected"
+            echo "Hint: You may need to follow Web UI build guide in server/README.md"
+            echo "${modified_files}"
             exit 1
           fi
 
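The same verification can be reproduced locally before pushing. A minimal sketch of what the new CI step does, assuming Node.js 22 and npm are installed:

```bash
# Rebuild the bundled web UI exactly as the CI step does
cd examples/server/webui
npm ci
npm run build
# A non-empty listing here means the committed build artifacts are out of date
git status -s
```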

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -104,6 +104,10 @@ examples/server/*.mjs.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh
 
+# Server Web UI temporary files
+node_modules
+examples/server/webui/dist
+
 # Python
 
 /.venv

Makefile

Lines changed: 10 additions & 9 deletions
@@ -1145,8 +1145,15 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
 # Include dependency files
 -include $(DEP_FILES)
 
+# Clean generated server assets
+clean-server-assets:
+	find examples/server -type f -name "*.js.hpp" -delete
+	find examples/server -type f -name "*.mjs.hpp" -delete
+	find examples/server -type f -name "*.css.hpp" -delete
+	find examples/server -type f -name "*.html.hpp" -delete
+
 # Clean rule
-clean:
+clean: clean-server-assets
 	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -rvf *.a *.dll *.so *.dot
 	find ggml src common tests examples pocs -type f -name "*.o" -delete

@@ -1354,20 +1361,14 @@ llama-server: \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
 	examples/server/index.html.hpp \
-	examples/server/completion.js.hpp \
 	examples/server/loading.html.hpp \
-	examples/server/deps_daisyui.min.css.hpp \
-	examples/server/deps_markdown-it.js.hpp \
-	examples/server/deps_tailwindcss.js.hpp \
-	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
-	common/stb_image.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% Makefile
+examples/server/%.hpp: examples/server/public/% FORCE Makefile
 	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 		echo "unsigned char $${NAME}[] = {" && \
 		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \

@@ -1542,7 +1543,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server
+.PHONY: FORCE main quantize perplexity embedding server
 
 # Define the object file target
 examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
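A brief usage note on the new target (only the targets defined above are assumed): `clean-server-assets` can be invoked on its own, and `clean` now depends on it, so a full clean also removes the generated server headers.

```bash
# Remove only the generated *.hpp assets under examples/server
make clean-server-assets

# clean now runs clean-server-assets first, then the usual cleanup
make clean
```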

common/arg.cpp

Lines changed: 17 additions & 3 deletions
@@ -348,6 +348,18 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
     return true;
 }
 
+static std::string list_builtin_chat_templates() {
+    std::vector<const char *> supported_tmpl;
+    int32_t res = llama_chat_builtin_templates(nullptr, 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    std::ostringstream msg;
+    for (auto & tmpl : supported_tmpl) {
+        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // load dynamic backends
     ggml_backend_load_all();

@@ -1814,9 +1826,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
-        "set custom jinja chat template (default: template taken from model's metadata)\n"
-        "if suffix/prefix are specified, template will be disabled\n"
-        "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
+        string_format(
+            "set custom jinja chat template (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
         [](common_params & params, const std::string & value) {
             if (!common_chat_verify_template(value)) {
                 throw std::runtime_error(string_format(
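Since the built-in template list is now embedded in the `--chat-template` help text, it can be inspected directly from the command line. A hedged sketch, assuming `llama-server` has been built (the grep pattern and context length are illustrative):

```bash
# Show the --chat-template help text, which now ends with the built-in template list
./llama-server --help | grep -A 4 'chat-template'
```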

docs/build.md

Lines changed: 8 additions & 8 deletions
@@ -26,17 +26,17 @@ cmake --build build --config Release
 
 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
 
-    ```bash
-    cmake -B build -DCMAKE_BUILD_TYPE=Debug
-    cmake --build build
-    ```
+    ```bash
+    cmake -B build -DCMAKE_BUILD_TYPE=Debug
+    cmake --build build
+    ```
 
 2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
 
-    ```bash
-    cmake -B build -G "Xcode"
-    cmake --build build --config Debug
-    ```
+    ```bash
+    cmake -B build -G "Xcode"
+    cmake --build build --config Debug
+    ```
 
 For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
 

examples/infill/README.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ In this section, we cover the most commonly used options for running the `infill
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
 - `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 ## Input Prompts
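A hedged invocation sketch reflecting the updated default (the model path, prefix, and suffix are placeholders, not files or prompts shipped with the repo):

```bash
# Raise the context above the 4096 default for a long-context model
./llama-infill -m models/7B/ggml-model.gguf -c 8192 \
    --in-prefix "def remove_non_ascii(s: str) -> str:\n    " \
    --in-suffix "\n    return result\n"
```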

examples/main/README.md

Lines changed: 3 additions & 2 deletions
@@ -66,7 +66,7 @@ In this section, we cover the most commonly used options for running the `llama-
 - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
 - `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
 - `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
 - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.

@@ -131,7 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th
 
 ### Context Size
 
-- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference.
 
 ### Extended Context Size
 

@@ -348,6 +348,7 @@ These options provide extra functionality and customization when running the LLa
 
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
+- `--no-display-prompt`: Don't print prompt at generation.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
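A hedged example combining the updated `-c` default with the newly documented `--no-display-prompt` flag (the model path and prompt are placeholders):

```bash
# Use a context larger than the 4096 default; the prompt itself is not echoed back
./llama-cli -m models/7B/ggml-model.gguf -c 8192 -n 256 \
    --no-display-prompt -p "Summarize the purpose of the server web UI build step."
```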

examples/server/CMakeLists.txt

Lines changed: 9 additions & 5 deletions
@@ -16,12 +16,7 @@ set(TARGET_SRCS
 )
 set(PUBLIC_ASSETS
     index.html
-    completion.js
     loading.html
-    deps_daisyui.min.css
-    deps_markdown-it.js
-    deps_tailwindcss.js
-    deps_vue.esm-browser.js
 )
 
 foreach(asset ${PUBLIC_ASSETS})

@@ -33,11 +28,20 @@ foreach(asset ${PUBLIC_ASSETS})
         OUTPUT "${output}"
         COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
     )
+    set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
 endforeach()
 
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 
+# clean up generated files in pre-build step
+foreach(asset ${PUBLIC_ASSETS})
+    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+    add_custom_command(TARGET ${TARGET} PRE_BUILD
+        COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
+    )
+endforeach()
+
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 
 if (LLAMA_SERVER_SSL)
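With only `index.html` and `loading.html` left in `PUBLIC_ASSETS`, the embedded headers are regenerated in the build tree whenever the server target is built. A minimal sketch of the standard CMake invocation from docs/build.md, restricted to the server target:

```bash
# Configure once, then build just the server binary in Release mode
cmake -B build
cmake --build build --config Release -t llama-server
```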
