
Commit c4d6705

pwilkin and CISC authored
Apply suggestions from code review
Co-authored-by: Sigbjørn Skjæret <[email protected]>
1 parent 2a82bdf commit c4d6705

File tree

2 files changed: 8 additions & 7 deletions


scripts/server-bench.py

Lines changed: 1 addition & 1 deletion
@@ -236,7 +236,7 @@ def benchmark(

     logger.info("")
     logger.info(f"Benchmark duration: {token_t_last:.2f} s")
-    logger.info(f"Request throughput: {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last / 60):.2f} requests/min")
+    logger.info(f"Request throughput: {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last/60):.2f} requests/min")
     logger.info(f"Total prompt length: {np.sum(prompt_n)} tokens")
     logger.info(f"Average prompt length: {np.mean(prompt_n):.2f} tokens")
     logger.info(f"Average prompt latency: {1e3 * np.mean(prompt_t):.2f} ms")

src/llama-model.cpp

Lines changed: 7 additions & 6 deletions
@@ -3985,7 +3985,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
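Aside, for readers of this diff: the TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED pair above is the usual tied-embeddings fallback in llama.cpp's loader. If the model file ships no separate output head, the token embedding matrix is reused as the LM head. Below is a minimal self-contained C++ sketch of that pattern; Tensor, model_file, and this create_tensor are illustrative stand-ins, not the real loader API.

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for llama.cpp's tensor loader.
struct Tensor { std::string name; };

// A model file that ties its embeddings: it has no separate "output.weight".
static std::map<std::string, Tensor> model_file = {
    {"token_embd.weight", {"token_embd.weight"}},
};

// required == false mirrors TENSOR_NOT_REQUIRED: return nullptr instead of
// failing when the tensor is absent from the file.
static Tensor * create_tensor(const std::string & name, bool required) {
    auto it = model_file.find(name);
    if (it == model_file.end()) {
        if (required) {
            throw std::runtime_error("missing tensor: " + name);
        }
        return nullptr;
    }
    return &it->second;
}

int main() {
    Tensor * output = create_tensor("output.weight", /*required=*/false);
    if (output == nullptr) {
        // fall back to the input embedding matrix, as the diff above does
        // with TENSOR_DUPLICATED
        output = create_tensor("token_embd.weight", /*required=*/true);
    }
    std::printf("LM head tensor: %s\n", output->name.c_str());
    return 0;
}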
@@ -18007,9 +18011,6 @@ struct llm_build_seed_oss : public llm_graph_context {

             // self-attention
             {
-                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
@@ -18037,13 +18038,13 @@ struct llm_build_seed_oss : public llm_graph_context {
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                 Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, rope_factors,
+                        ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );

                 Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, rope_factors,
+                        ctx0, Kcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
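Aside, for readers of this diff: the fourth tensor argument of ggml_rope_ext is an optional per-dimension frequency-factors tensor, and passing nullptr (as the seed-oss builder now does) selects plain RoPE with unscaled rotation frequencies. Below is a minimal self-contained C++ sketch of what that optional argument does; rope_inplace is a hypothetical toy, not ggml's kernel.

#include <cmath>
#include <cstdio>
#include <vector>

// Toy RoPE: rotate each (even, odd) pair of a query/key row by
// pos * theta_i, where theta_i decays with the pair index. An optional
// freq_factors array rescales theta_i per pair (used by long-context
// schemes such as Llama 3's rope scaling); nullptr means no rescaling,
// which is what the diff above requests.
static void rope_inplace(std::vector<float> & x, int pos, float freq_base,
                         const float * freq_factors /* may be nullptr */) {
    const int n = (int) x.size();
    for (int i = 0; i + 1 < n; i += 2) {
        float theta = pos * std::pow(freq_base, -(float) i / n);
        if (freq_factors) {
            theta /= freq_factors[i / 2]; // per-pair rescaling when provided
        }
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;
        x[i + 1] = x0 * s + x1 * c;
    }
}

int main() {
    std::vector<float> q = {1.0f, 0.0f, 1.0f, 0.0f};
    rope_inplace(q, /*pos=*/3, /*freq_base=*/10000.0f, /*freq_factors=*/nullptr);
    std::printf("%.4f %.4f %.4f %.4f\n", q[0], q[1], q[2], q[3]);
    return 0;
}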
