
Commit d256aa0

tmp
1 parent 7e4cae5 commit d256aa0

4 files changed (+5 −5 lines changed)


ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -3248,7 +3248,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    const int min_batch_size = 9999999;
 
     return get_op_batch_size(op) >= min_batch_size;
 }
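In effect, raising min_batch_size from 32 to 9999999 makes the batch-size check unsatisfiable, so the device never approves offloading an op. A minimal standalone sketch of the patched predicate (illustrative only, not the real ggml backend code):

#include <cstdint>
#include <cstdio>

// The patched predicate in isolation: no realistic batch size reaches the threshold.
static bool offload_op(int64_t op_batch_size) {
    const int min_batch_size = 9999999; // patched value; upstream uses 32
    return op_batch_size >= min_batch_size;
}

int main() {
    std::printf("batch 32   -> %d\n", offload_op(32));   // 0 (upstream: 1)
    std::printf("batch 4096 -> %d\n", offload_op(4096)); // 0
}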

src/llama-kv-cache.cpp

Lines changed: 2 additions & 2 deletions
@@ -96,8 +96,8 @@ bool llama_kv_cache_init(
            return false;
        }
 
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, 1);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, 1);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
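The patch shrinks each layer's K and V cache tensors from n_embd_{k,v}_gqa*kv_size elements to a single placeholder element. A rough per-layer sizing sketch of what the original line allocates (the dimension values below are assumptions for illustration, not taken from any particular model):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_k_gqa = 1024; // assumed K embedding width per layer
    const int64_t kv_size      = 4096; // assumed cache length in tokens
    const int64_t elem_bytes   = 2;    // e.g. a 16-bit cache type

    std::printf("original: %lld bytes per layer\n",
                (long long)(n_embd_k_gqa * kv_size * elem_bytes)); // 8 MiB
    std::printf("patched : %lld bytes per layer\n",
                (long long)(1 * elem_bytes));                      // 2 bytes
}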

src/llama-quant.cpp

Lines changed: 1 addition & 1 deletion
@@ -776,7 +776,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
        // get more optimal quantization type based on the tensor shape, layer, etc.
        if (!params->pure && ggml_is_quantized(default_type)) {
-            new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+            new_type = name.find("_exps") != std::string::npos ? name.find("ffn_down") != std::string::npos ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K : GGML_TYPE_BF16;
        }
        if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
            new_type = params->token_embedding_type;
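The new right-hand side is a nested ternary keyed on the tensor name: MoE expert tensors (name containing "_exps") get Q6_K for ffn_down projections and Q5_K otherwise, while every other quantized tensor is forced to BF16. A readable restatement of that mapping (sketch with stand-in enum values, not the GGML type enum; the tensor names in main are illustrative):

#include <cstdio>
#include <string>

enum sketch_type { Q5_K, Q6_K, BF16 }; // stand-ins for GGML_TYPE_*

static sketch_type pick_type(const std::string & name) {
    if (name.find("_exps") != std::string::npos) {
        // MoE expert tensors: down-projections get the larger Q6_K
        return name.find("ffn_down") != std::string::npos ? Q6_K : Q5_K;
    }
    return BF16; // everything else bypasses llama_tensor_get_type entirely
}

int main() {
    std::printf("%d %d %d\n",
                pick_type("blk.0.ffn_down_exps.weight"), // Q6_K
                pick_type("blk.0.ffn_gate_exps.weight"), // Q5_K
                pick_type("blk.0.attn_q.weight"));       // BF16
}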

src/llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -6407,7 +6407,7 @@ struct llm_build_context {
 
        // whether to use n_tokens as the matrix dimension during multiplication or n_head
        // n_tokens is higher during prompt processing, this allows to optimize for this case
-        bool pp_opt = n_tokens > n_head;
+        bool pp_opt = true;
 
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;
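Upstream chooses pp_opt per batch: true while processing a prompt (n_tokens > n_head), false during single-token decoding. Pinning it to true keeps the prompt-processing matrix layout in both phases. A small sketch of the difference (the head count is an assumed example value):

#include <cstdio>

int main() {
    const int n_head = 32;                   // assumed head count
    const int token_counts[] = {1, 512};     // decode step vs. prompt batch

    for (int n_tokens : token_counts) {
        bool upstream = n_tokens > n_head;   // false for decode, true for prompt
        bool patched  = true;                // always the prompt-processing path
        std::printf("n_tokens=%3d upstream=%d patched=%d\n",
                    n_tokens, upstream, patched);
    }
}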
