
Commit a90612f

Merge branch 'master' into jinja-system-prompt

2 parents: 9371988 + 45a8e76
7 files changed: +36, -14 lines

common/arg.cpp

Lines changed: 8 additions & 3 deletions
@@ -813,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ struct common_params {
     std::string hf_repo = ""; // HF repo // NOLINT
     std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT

examples/main/main.cpp

Lines changed: 8 additions & 4 deletions
@@ -217,6 +217,10 @@ int main(int argc, char ** argv) {
     // print chat template example in conversation mode
     if (params.conversation_mode) {
         if (params.enable_chat_template) {
+            if (!params.prompt.empty()) {
+                LOG_WRN("*** User-specified prompt in conversation mode will be ignored, did you mean to set --system-prompt (-sys) instead?\n");
+            }
+
             LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
         } else {
             LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
@@ -261,7 +265,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    bool waiting_for_first_input = params.conversation_mode && params.enable_chat_template && params.prompt.empty();
+    bool waiting_for_first_input = params.conversation_mode && params.enable_chat_template && params.system_prompt.empty();
     auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
         common_chat_msg new_msg;
         new_msg.role = role;
@@ -273,9 +277,9 @@ int main(int argc, char ** argv) {
     };
 
     {
-        auto prompt = (params.enable_chat_template && !params.prompt.empty())
-            // format the user prompt or system prompt if in conversation mode
-            ? chat_add_and_format(params.conversation_mode ? "system" : "user", params.prompt)
+        auto prompt = (params.conversation_mode && params.enable_chat_template)
+            // format the system prompt in conversation mode (will use template default if empty)
+            ? (params.system_prompt.empty() ? params.system_prompt : chat_add_and_format("system", params.system_prompt))
             // otherwise use the prompt as is
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
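
To make the new control flow easier to follow, here is a minimal standalone C++ sketch of the prompt-selection rule above (the flag values and the bracketed placeholder are illustrative assumptions; the real code routes the system prompt through chat_add_and_format and the loaded chat template):

#include <iostream>
#include <string>

int main() {
    // Illustrative values standing in for parsed CLI flags (assumptions).
    bool conversation_mode    = true;                 // -cnv
    bool enable_chat_template = true;
    std::string prompt        = "";                   // -p
    std::string system_prompt = "You are terse.";     // -sys

    std::string initial;
    if (conversation_mode && enable_chat_template) {
        // Conversation mode: only the system prompt is used here; if it is
        // empty, the chat template's default applies and the program then
        // waits for the first user input (waiting_for_first_input).
        initial = system_prompt.empty()
            ? system_prompt
            : "[system message formatted via chat template: " + system_prompt + "]";
    } else {
        // Plain completion: the -p prompt is used as-is.
        initial = prompt;
    }

    std::cout << initial << "\n";
    return 0;
}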
Binary file (7 Bytes) changed; contents not shown.

examples/server/webui/src/components/SettingDialog.tsx

Lines changed: 7 additions & 7 deletions
@@ -148,13 +148,13 @@ const SETTING_SECTIONS: SettingSection[] = [
     fields: [
       {
         type: SettingInputType.CHECKBOX,
-        label: 'Expand though process by default for generating message',
+        label: 'Expand thought process by default when generating messages',
         key: 'showThoughtInProgress',
       },
       {
         type: SettingInputType.CHECKBOX,
         label:
-          'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
+          'Exclude thought process when sending requests to API (Recommended for DeepSeek-R1)',
         key: 'excludeThoughtOnReq',
       },
     ],
@@ -247,7 +247,7 @@ const SETTING_SECTIONS: SettingSection[] = [
             This feature uses{' '}
             <OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
             downloaded from CDN. To use this feature, ask the LLM to generate
-            python code inside a markdown code block. You will see a "Run"
+            Python code inside a Markdown code block. You will see a "Run"
             button on the code block, near the "Copy" button.
           </small>
         </>
@@ -274,7 +274,7 @@ export default function SettingDialog({
   );
 
   const resetConfig = () => {
-    if (window.confirm('Are you sure to reset all settings?')) {
+    if (window.confirm('Are you sure you want to reset all settings?')) {
       setLocalConfig(CONFIG_DEFAULT);
     }
   };
@@ -296,9 +296,9 @@ export default function SettingDialog({
         return;
       }
     } else if (mustBeNumeric) {
-      const trimedValue = value.toString().trim();
-      const numVal = Number(trimedValue);
-      if (isNaN(numVal) || !isNumeric(numVal) || trimedValue.length === 0) {
+      const trimmedValue = value.toString().trim();
+      const numVal = Number(trimmedValue);
+      if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
         alert(`Value for ${key} must be numeric`);
         return;
       }

ggml/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -155,6 +155,9 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
+set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
+    "ggml: cuda link binary compression mode; requires cuda 12.8+")
+set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
 
 option(GGML_HIP "ggml: use HIP" OFF)
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
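
As a usage note (the configure invocation below is an assumption about a typical out-of-tree build, not part of this commit), the new cache variable could be set at configure time, for example: cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_COMPRESSION_MODE=balance. Valid values are none, speed, balance, and size; the default chosen here is size, and the flag only takes effect with CUDA 12.8 or newer, as the guarded block in ggml/src/ggml-cuda/CMakeLists.txt below shows.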

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -102,6 +102,15 @@ if (CUDAToolkit_FOUND)
 
     set(CUDA_FLAGS -use_fast_math)
 
+    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
+        # Options are:
+        # - none (not recommended)
+        # - speed (nvcc's default)
+        # - balance
+        # - size
+        list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
+    endif()
+
     if (GGML_FATAL_WARNINGS)
         list(APPEND CUDA_FLAGS -Werror all-warnings)
     endif()
