
Commit 4235644

Merge branch 'ggml-org:master' into master
2 parents: 850d301 + f4c3dd5

File tree: 7 files changed (+185 lines, -89 lines)


common/arg.cpp

Lines changed: 15 additions & 1 deletion
@@ -853,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -1875,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
            params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),

examples/run/run.cpp

Lines changed: 89 additions & 44 deletions
@@ -79,6 +79,7 @@ class Opt {
         ctx_params = llama_context_default_params();
         model_params = llama_model_default_params();
         context_size_default = ctx_params.n_batch;
+        n_threads_default = ctx_params.n_threads;
         ngl_default = model_params.n_gpu_layers;
         common_params_sampling sampling;
         temperature_default = sampling.temp;
@@ -104,6 +105,7 @@ class Opt {

         ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
         ctx_params.n_ctx = ctx_params.n_batch;
+        ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
         model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
         temperature = temperature >= 0 ? temperature : temperature_default;

@@ -116,12 +118,12 @@ class Opt {
     std::string chat_template_file;
     std::string user;
     bool use_jinja = false;
-    int context_size = -1, ngl = -1;
+    int context_size = -1, ngl = -1, n_threads = -1;
     float temperature = -1;
     bool verbose = false;

   private:
-    int context_size_default = -1, ngl_default = -1;
+    int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
     float temperature_default = -1;
     bool help = false;

@@ -159,53 +161,94 @@ class Opt {
         return 0;
     }

+    int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+            if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+                return 1;
+            }
+        } else if (options_parsing &&
+                   (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+            if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
+            if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+            if (handle_option_with_value(argc, argv, i, temperature) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
+            if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
+                return 1;
+            }
+            use_jinja = true;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_options(const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+            verbose = true;
+        } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+            use_jinja = true;
+        } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+            help = true;
+            return 0;
+        } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+            options_parsing = false;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
+        if (positional_args_i == 0) {
+            if (!argv[i][0] || argv[i][0] == '-') {
+                return 1;
+            }
+
+            ++positional_args_i;
+            model_ = argv[i];
+        } else if (positional_args_i == 1) {
+            ++positional_args_i;
+            user = argv[i];
+        } else {
+            user += " " + std::string(argv[i]);
+        }
+
+        return 0;
+    }
+
     int parse(int argc, const char ** argv) {
         bool options_parsing = true;
         for (int i = 1, positional_args_i = 0; i < argc; ++i) {
-            if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
-                if (handle_option_with_value(argc, argv, i, context_size) == 1) {
-                    return 1;
-                }
-            } else if (options_parsing &&
-                       (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
-                if (handle_option_with_value(argc, argv, i, ngl) == 1) {
-                    return 1;
-                }
-            } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
-                if (handle_option_with_value(argc, argv, i, temperature) == 1) {
-                    return 1;
-                }
-            } else if (options_parsing &&
-                       (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
-                verbose = true;
-            } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
-                use_jinja = true;
-            } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
-                if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
-                    return 1;
-                }
-                use_jinja = true;
-            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
-                help = true;
-                return 0;
-            } else if (options_parsing && strcmp(argv[i], "--") == 0) {
-                options_parsing = false;
-            } else if (positional_args_i == 0) {
-                if (!argv[i][0] || argv[i][0] == '-') {
-                    return 1;
-                }
-
-                ++positional_args_i;
-                model_ = argv[i];
-            } else if (positional_args_i == 1) {
-                ++positional_args_i;
-                user = argv[i];
-            } else {
-                user += " " + std::string(argv[i]);
+            int ret = parse_options_with_value(argc, argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
+
+            ret = parse_options(argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
+
+            if (parse_positional_args(argv, i, positional_args_i)) {
+                return 1;
             }
         }

-        if (model_.empty()){
+        if (model_.empty()) {
             return 1;
         }

@@ -232,6 +275,8 @@ class Opt {
             "      Number of GPU layers (default: %d)\n"
             "  --temp <value>\n"
             "      Temperature (default: %.1f)\n"
+            "  -t, --threads <value>\n"
+            "      Number of threads to use during generation (default: %d)\n"
             "  -v, --verbose, --log-verbose\n"
             "      Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
             "  -h, --help\n"
@@ -260,7 +305,7 @@ class Opt {
             "  llama-run file://some-file3.gguf\n"
             "  llama-run --ngl 999 some-file4.gguf\n"
             "  llama-run --ngl 999 some-file5.gguf Hello World\n",
-            context_size_default, ngl_default, temperature_default);
+            context_size_default, ngl_default, temperature_default, n_threads_default);
     }
 };

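
The run.cpp change splits the long if/else chain in parse() into three helpers, parse_options_with_value(), parse_options() and parse_positional_args(), which also makes room for the new -t / --threads option. Each helper returns 0 when it consumed the argument, 1 on a hard error and 2 when the argument is not one of its options, and the caller only falls through to the next helper on 2. A compressed sketch of that return-code contract follows; the ToyOpts struct and the tiny option set are assumptions for illustration, not the real llama-run surface.

    // Dispatch pattern: 0 = consumed, 1 = error, 2 = not handled here.
    #include <cstring>
    #include <string>

    struct ToyOpts {
        int         ngl     = -1;
        bool        verbose = false;
        std::string model;
    };

    static int parse_value_opts(int argc, const char ** argv, int & i, ToyOpts & o) {
        if (strcmp(argv[i], "--ngl") == 0) {
            if (i + 1 >= argc) { return 1; }   // missing value -> hard error
            o.ngl = std::stoi(argv[++i]);
            return 0;
        }
        return 2;                              // let the next parser try
    }

    static int parse_flags(const char ** argv, int & i, ToyOpts & o) {
        if (strcmp(argv[i], "-v") == 0) { o.verbose = true; return 0; }
        return 2;
    }

    static int parse(int argc, const char ** argv, ToyOpts & o) {
        for (int i = 1; i < argc; ++i) {
            int ret = parse_value_opts(argc, argv, i, o);
            if (ret == 0) { continue; }
            if (ret == 1) { return 1; }

            ret = parse_flags(argv, i, o);
            if (ret == 0) { continue; }
            if (ret == 1) { return 1; }

            o.model = argv[i];                 // everything else is positional
        }
        return o.model.empty() ? 1 : 0;
    }

With the new flag wired through ctx_params.n_threads and ctx_params.n_threads_batch, an invocation along the lines of "llama-run -t 8 model.gguf Hello" should pin generation to 8 threads (the example command is an assumption based on the added help text).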

examples/tts/tts.cpp

Lines changed: 12 additions & 9 deletions
@@ -87,11 +87,11 @@ struct wav_header {
     uint32_t data_size;
 };

-static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
+static bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
     std::ofstream file(fname, std::ios::binary);
     if (!file) {
-        LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
-        return;
+        LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
+        return false;
     }

     wav_header header;
@@ -108,7 +108,7 @@ static void save_wav16(const std::string & fname, const std::vector<float> & dat
         file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
     }

-    file.close();
+    return file.good();
 }

 static void fill_hann_window(int length, bool periodic, float * output) {
@@ -536,6 +536,7 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
 int main(int argc, char ** argv) {
     common_params params;

+    params.out_file = "output.wav";
     params.prompt = "";

     params.n_predict = 4096;
@@ -1060,8 +1061,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     }
 #endif

-    const std::string fname = "output.wav";
-
     const int n_sr = 24000; // sampling rate

     // zero out first 0.25 seconds
@@ -1072,11 +1071,15 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
     LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

-    save_wav16(fname, audio, n_sr);
+    int retval = 0;

-    LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
+    if (save_wav16(params.out_file, audio, n_sr)) {
+        LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str());
+    } else {
+        retval = ENOENT;
+    }

     llama_backend_free();

-    return 0;
+    return retval;
 }
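
The tts.cpp change makes save_wav16() report success (it now returns file.good() instead of void), moves the hard-coded "output.wav" into params.out_file so -o / --output-file applies (per the LLAMA_EXAMPLE_TTS addition in common/arg.cpp above), and has main() return ENOENT instead of 0 when the write fails. A reduced sketch of that error-propagation pattern, with write_samples and the raw 16-bit output as illustrative assumptions:

    // Writer reports success; the caller maps failure to a nonzero exit code.
    #include <cerrno>
    #include <cstdint>
    #include <cstdio>
    #include <fstream>
    #include <string>
    #include <vector>

    static bool write_samples(const std::string & fname, const std::vector<float> & data) {
        std::ofstream file(fname, std::ios::binary);
        if (!file) {
            return false;                          // could not open for writing
        }
        for (const float v : data) {
            const int16_t pcm = (int16_t) (v * 32767.0f);
            file.write(reinterpret_cast<const char *>(&pcm), sizeof(pcm));
        }
        return file.good();                        // also catches failed writes
    }

    int main() {
        int retval = 0;
        if (!write_samples("samples.raw", {0.0f, 0.5f, -0.5f})) {
            fprintf(stderr, "failed to write samples.raw\n");
            retval = ENOENT;                       // same errno-style code the patch uses
        }
        return retval;
    }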

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 6 additions & 2 deletions
@@ -2790,10 +2790,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
         output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
         output_ne_offset);
+    int64_t antiquantGroupSize = 0;
+    if (src0->ne[0] > QK8_0) {
+        antiquantGroupSize = QK8_0;
+    }

     ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
         acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-        nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
+        nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
         &workspaceSize, &executor));
     if (workspaceAddr == nullptr) {
         workspaceAddr = workspace_allocator.alloc(workspaceSize);
@@ -2833,7 +2837,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

     ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
         acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
-        nullptr, nullptr, nullptr, nullptr, QK8_0,
+        nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
         acl_output_tensor, &workspaceSize, &executor));
     ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
         workspaceAddr, workspaceSize, executor, ctx.stream()));
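
The CANN change computes the antiquant group size at runtime instead of always passing QK8_0: when the weight row width src0->ne[0] is no larger than one Q8_0 block, a group size of 0 (per-channel) is passed to aclnnWeightQuantBatchMatmulV2GetWorkspaceSize, which is why the k <= QK8_0 rejection in ggml-cann.cpp below can be dropped. A minimal sketch of the selection rule; the helper name is an assumption, and QK8_0 is ggml's Q8_0 block size of 32.

    #include <cstdint>

    constexpr int64_t QK8_0 = 32; // elements per Q8_0 block in ggml

    // The removed supports_op comment notes the group size must stay below k,
    // so rows of at most one block fall back to 0 (no grouping) instead of
    // rejecting the matmul.
    static int64_t antiquant_group_size(int64_t k) {
        return k > QK8_0 ? QK8_0 : 0;
    }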

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 0 additions & 5 deletions
@@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
                 case GGML_TYPE_Q8_0:
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
-                    if (op->src[0]->ne[0] <= QK8_0) {
-                        return false;
-                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0:
