
Commit 4a9a3c8

Merge branch 'ggerganov:master' into sgemm_ppc_mma

2 parents 46dcd2b + 07028f9

4 files changed (+14, -6 lines)

examples/server/server.cpp

Lines changed: 1 addition & 0 deletions

@@ -1880,6 +1880,7 @@ struct server_context {
         if (slot.state == SLOT_STATE_STARTED) {
             slot.t_start_process_prompt = ggml_time_us();
             slot.t_start_generation = 0;
+
             slot.n_past = 0;
             slot.n_prompt_tokens = prompt_tokens.size();
             slot.state = SLOT_STATE_PROCESSING_PROMPT;

examples/server/utils.hpp

Lines changed: 4 additions & 2 deletions

@@ -266,8 +266,10 @@ static llama_tokens format_infill(
     }

     // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-    const int n_suffix_take = std::min<int>(tokens_suffix.size(), (n_batch/4));
-    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
+    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
+    const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
+
+    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));

     // fill the rest of the context with extra chunks
     const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
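The reordered budget above gives the prefix its full 3/4 share of the batch first, and the suffix only receives what remains of the final quarter after reserving two FIM special tokens and the prompt itself, clamped at zero. A minimal standalone sketch of that arithmetic, using hypothetical sizes in place of the real llama_tokens vectors and assuming nothing beyond the expressions shown in the diff:

#include <algorithm>
#include <cstdio>

int main() {
    // hypothetical inputs; format_infill derives these from token vectors
    const int n_batch        = 2048; // batch budget for the FIM context
    const int n_prefix_avail = 9000; // tokens available before the cursor
    const int n_suffix_avail = 4000; // tokens available after the cursor
    const int n_prompt       = 32;   // tokens of the prompt itself

    // prefix may take up to 3/4 of the batch
    const int n_prefix_take = std::min(n_prefix_avail, 3*(n_batch/4));
    // suffix takes what remains of the last quarter after 2 special tokens
    // and the prompt, never going negative
    const int n_suffix_take = std::min(n_suffix_avail,
                                       std::max(0, (n_batch/4) - (2 + n_prompt)));

    std::printf("n_prefix_take = %d, n_suffix_take = %d, total = %d\n",
                n_prefix_take, n_suffix_take, n_prefix_take + n_suffix_take);
    return 0;
}

With these example numbers the prefix takes 1536 tokens and the suffix 478, so the combined take stays within the 2048-token batch.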

flake.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default.

ggml/src/ggml-cuda.cu

Lines changed: 6 additions & 1 deletion

@@ -1484,14 +1484,19 @@ static void ggml_cuda_op_mul_mat(
             const size_t nbytes_data    = ggml_nbytes(src0);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+            // TODO: remove this for MUSA once the Guilty Lockup issue is resolved
+#ifndef GGML_USE_MUSA
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
+#else // GGML_USE_MUSA
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
+#endif // !GGML_USE_MUSA
         }

         // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
             const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
-            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
         }

         if (src1_on_device && src1_is_contiguous) {
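The change above keeps the full-buffer clear (data plus row padding) on stock CUDA, but on MUSA builds only the padding region past the data is zeroed, as a workaround for the lockup noted in the TODO. A rough standalone sketch of the two clearing strategies, with made-up buffer sizes and a simplified CUDA_CHECK standing in for ggml's macro:

#include <cuda_runtime.h>
#include <cstdio>

// simplified stand-in for ggml's CUDA_CHECK
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { std::printf("CUDA error: %s\n", cudaGetErrorString(err_)); return 1; } } while (0)

int main() {
    // hypothetical sizes standing in for ggml_nbytes(src0) and ggml_row_size(...)
    const size_t nbytes_data    = 1 << 20;
    const size_t nbytes_padding = 256;

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    char * src0_dd = nullptr;
    CUDA_CHECK(cudaMallocAsync((void **) &src0_dd, nbytes_data + nbytes_padding, stream));

#ifndef GGML_USE_MUSA
    // default path: zero data and padding in one memset
    CUDA_CHECK(cudaMemsetAsync(src0_dd, 0, nbytes_data + nbytes_padding, stream));
#else
    // MUSA workaround: zero only the padding that follows the data
    CUDA_CHECK(cudaMemsetAsync(src0_dd + nbytes_data, 0, nbytes_padding, stream));
#endif

    CUDA_CHECK(cudaFreeAsync(src0_dd, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}

Either way the row padding ends up zeroed, which is what matters for the quantized matrix-multiplication path; the MUSA branch simply skips re-zeroing the data region.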
