Commit b7d0456

lazy inputs for snac codes tensors instead of cpu buffers

1 parent befc55f commit b7d0456

5 files changed, +155 -104 lines changed
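As the message says, the commit swaps hand-managed CPU buffers for lazily-populated graph inputs. A minimal sketch of the before/after pattern, simplified from the diffs below (names are illustrative, not verbatim from the commit):

    // before: allocate a CPU buffer at graph-build time and point the tensor at it
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(
            ggml_backend_cpu_buffer_type(), n * sizeof(int32_t));
    idx->buffer = buf;
    idx->data   = ggml_backend_buffer_get_base(buf);
    ggml_backend_tensor_set(idx, codes, 0, n * sizeof(int32_t));

    // after: only declare the input during graph build ...
    ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n);
    ggml_set_input(idx);
    // ... and copy the codes in set_input(), once the scheduler has allocated the graph
    ggml_backend_tensor_set(idx, codes, 0, ggml_nbytes(idx));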

src/llama-context.cpp

Lines changed: 7 additions & 7 deletions

@@ -1348,13 +1348,13 @@ int llama_context::decode(llama_batch & inp_batch) {
     const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
     if (compute_status != GGML_STATUS_SUCCESS) {
         switch (compute_status) {
-        case GGML_STATUS_ABORTED:
-            return 2;
-        case GGML_STATUS_ALLOC_FAILED:
-            return -2;
-        case GGML_STATUS_FAILED:
-        default:
-            return -3;
+            case GGML_STATUS_ABORTED:
+                return 2;
+            case GGML_STATUS_ALLOC_FAILED:
+                return -2;
+            case GGML_STATUS_FAILED:
+            default:
+                return -3;
         }
     }
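For context, these statuses surface to callers through the public llama_decode() API, where 0 means success, positive values are recoverable conditions, and negative values are fatal. A hedged sketch of caller-side handling, assuming the standard llama.h conventions:

    const int ret = llama_decode(ctx, batch);
    if (ret == 2) {
        // graph compute was aborted (e.g. by an abort callback); the batch may be retried
    } else if (ret < 0) {
        // -2: allocation failed, -3: compute failed; treat as fatal
    }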

src/llama-graph.cpp

Lines changed: 75 additions & 2 deletions

@@ -556,6 +556,81 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+void llm_graph_input_snac::set_input(const llama_ubatch * ubatch) {
+
+    LLAMA_LOG_INFO("Setting SNAC input for layer %d\n", ilayer);
+
+    const int n_tokens = ubatch->n_tokens;
+    if (n_tokens % frame_size != 0) {
+        return; // TODO: handle gracefully
+    }
+    const int n_frames = n_tokens / frame_size;
+
+    int64_t expected_elements = 0;
+    int vocab_offset = 0;
+    int tokens_per_frame = 0;
+
+    switch (ilayer) {
+        case 0: // Layer 1
+            tokens_per_frame = 1;
+            vocab_offset = 128266; // TODO: hparams
+            break;
+        case 1: // Layer 2
+            tokens_per_frame = 2;
+            vocab_offset = 132362;
+            break;
+        case 2: // Layer 3
+            tokens_per_frame = 4;
+            vocab_offset = 136458;
+            break;
+        default:
+            LLAMA_LOG_ERROR("%s: Invalid SNAC layer index %d encountered.\n", __func__, ilayer);
+            GGML_ASSERT(false && "Invalid SNAC layer index"); // Should be caught by constructor assert
+            return;
+    }
+    expected_elements = (int64_t) n_frames * tokens_per_frame;
+
+    std::vector<int32_t> indices;
+    indices.reserve(expected_elements);
+
+    const llama_token * tokens_data = ubatch->token;
+
+    for (int i_frame = 0; i_frame < n_frames; ++i_frame) {
+        const int frame_start_idx = i_frame * frame_size;
+        const llama_token * frame_tokens = tokens_data + frame_start_idx;
+
+        switch (ilayer) {
+            case 0: { // L1: token 0
+                int32_t index = (int32_t)(frame_tokens[0] - vocab_offset);
+
+                indices.push_back(index);
+                break;
+            }
+            case 1: { // L2: tokens 1, 4
+                int32_t index1 = (int32_t)(frame_tokens[1] - vocab_offset);
+                int32_t index4 = (int32_t)(frame_tokens[4] - vocab_offset);
+
+                indices.push_back(index1);
+                indices.push_back(index4);
+                break;
+            }
+            case 2: { // L3: tokens 2, 3, 5, 6
+                int32_t index2 = (int32_t)(frame_tokens[2] - vocab_offset);
+                int32_t index3 = (int32_t)(frame_tokens[3] - vocab_offset);
+                int32_t index5 = (int32_t)(frame_tokens[5] - vocab_offset);
+                int32_t index6 = (int32_t)(frame_tokens[6] - vocab_offset);
+
+                indices.push_back(index2);
+                indices.push_back(index3);
+                indices.push_back(index5);
+                indices.push_back(index6);
+                break;
+            }
+        }
+    }
+    ggml_backend_tensor_set(target, indices.data(), 0, ggml_nbytes(target));
+}
+
 //
 // llm_graph_context
 //
@@ -985,8 +1060,6 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
         }
     } else {
         inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
-        LLAMA_LOG_DEBUG("build_inp_embd: inp->embd shape = [%ld, %ld, %ld, %ld]\n",
-                inp->embd->ne[0], inp->embd->ne[1], inp->embd->ne[2], inp->embd->ne[3]);
         ggml_set_input(inp->embd);
 
         cur = inp->embd;
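The index arithmetic in set_input encodes SNAC's 7-token frame layout: codebook 1 takes token 0, codebook 2 takes tokens 1 and 4, codebook 3 takes tokens 2, 3, 5, and 6, and each stream subtracts its layer's vocabulary offset. Note the three offsets are spaced exactly 4096 apart, one codebook's worth: 132362 = 128266 + 4096 and 136458 = 132362 + 4096. A standalone sketch of the per-frame mapping, assuming the same hard-coded offsets:

    #include <cstdint>
    #include <vector>

    // De-interleave one 7-token SNAC frame into per-codebook row indices.
    void split_snac_frame(const int32_t frame[7],
                          std::vector<int32_t> & l1,
                          std::vector<int32_t> & l2,
                          std::vector<int32_t> & l3) {
        l1.push_back(frame[0] - 128266); // L1: token 0
        l2.push_back(frame[1] - 132362); // L2: tokens 1, 4
        l2.push_back(frame[4] - 132362);
        for (int k : {2, 3, 5, 6}) {     // L3: tokens 2, 3, 5, 6
            l3.push_back(frame[k] - 136458);
        }
    }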
src/llama-graph.h

Lines changed: 14 additions & 0 deletions

@@ -268,6 +268,20 @@ class llm_graph_input_attn_cross : public llm_graph_input_i {
     const llama_cross * cross = nullptr;
 };
 
+class llm_graph_input_snac : public llm_graph_input_i {
+public:
+    llm_graph_input_snac(ggml_tensor * target, int ilayer,
+            const llama_hparams & hparams) : target(target), ilayer(ilayer), hparams(hparams) {}
+    virtual ~llm_graph_input_snac() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * target; // idx tensor 1, 2, or 3
+    const llama_hparams & hparams;
+    const int ilayer;
+    const int frame_size = 7;
+};
+
 //
 // llm_graph_result
 //
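For orientation, llm_graph_input_snac plugs into the existing llm_graph_input_i mechanism. A simplified sketch of the interface being implemented (reconstructed from this header, not copied verbatim):

    // simplified shape of the base class above
    class llm_graph_input_i {
    public:
        virtual ~llm_graph_input_i() = default;
        // called once per ubatch, after the backend scheduler has allocated the graph
        virtual void set_input(const llama_ubatch * ubatch) = 0;
    };

Each subclass pairs a graph tensor with the logic needed to fill it, so the graph builder never touches backend memory directly.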

src/llama-model.cpp

Lines changed: 58 additions & 95 deletions

@@ -1489,8 +1489,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
 
     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
-        std::string tn_str = tn.str();
-        ggml_tensor * t_meta = ml.get_tensor_meta(tn_str.c_str());
+        ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
 
         if (!t_meta) {
             if (flags & TENSOR_NOT_REQUIRED) {
@@ -11743,21 +11742,22 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
     }
 };
 
-// TODO: Placeholder
 struct llm_build_snac_dec : public llm_graph_context {
 
     llm_build_snac_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         ggml_tensor * cur;
         ggml_tensor * emb_layer_1, * emb_layer_2, * emb_layer_3;
-        build_codebook_embd(model, &emb_layer_1, &emb_layer_2, &emb_layer_3);
 
-        if (emb_layer_1 == nullptr || emb_layer_2 == nullptr || emb_layer_3 == nullptr) {
+        bool inputs = build_snac_inputs(model, &emb_layer_1, &emb_layer_2, &emb_layer_3);
+
+        if (!inputs) {
             // graph build is called with garbage ubatch codes during model init
             // in this case, bypass normal graph construction and return a dummy
             LLAMA_LOG_INFO("build_codebook_inputs returned null, using dummy tensor\n");
             cur = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 768, ubatch.n_tokens > 0 ? ubatch.n_tokens : 64, 1, 1);
             ggml_set_input(cur);
         } else {
+            // TODO: Upsampling is wrong
             // Projections
             cur = ggml_mul_mat(ctx0, ggml_reshape_2d(ctx0, model.codebook_proj_w[0], 8, 768), emb_layer_1);
             cur = ggml_reshape_4d(ctx0, cur, 768, emb_layer_1->ne[1], 1, 1);
@@ -11859,113 +11859,76 @@ struct llm_build_snac_dec : public llm_graph_context {
 
         cur = ggml_cpy(ctx0, cur, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]));
 
-        cb(cur, "result_embd", -1);
+        LLAMA_LOG_INFO("Final shape of cur = [%ld, %ld, %ld, %ld]\n",
+                cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
+
+        //cb(cur, "result_embd", -1);
         res->t_embd = cur;
         ggml_build_forward_expand(gf, cur);
     }
 private:
-    // TODO: SNAC expects a multilayered input from 3 different embedding matrices
-    void build_codebook_embd(const llama_model & model,
-            ggml_tensor ** emb_layer_1,
-            ggml_tensor ** emb_layer_2,
-            ggml_tensor ** emb_layer_3) {
-
-        *emb_layer_1 = nullptr;
-        *emb_layer_2 = nullptr;
-        *emb_layer_3 = nullptr;
-
-
-
-        bool is_initialized = (ubatch.token != nullptr && ubatch.n_tokens > 0);
-        if (is_initialized) {
-            for (int i = 0; i < ubatch.n_tokens; ++i) {
-                if (ubatch.token[i] < 0 || ubatch.token[i] >= 4096) {
-                    is_initialized = false;
-                    break;
-                }
-            }
-        }
-
-        if (!is_initialized) {
-            return;
+    // Create 3 input nodes used for lookups into 3 embd matrices
+    bool build_snac_inputs(const llama_model & model,
+            ggml_tensor ** emb_layer_1_out,
+            ggml_tensor ** emb_layer_2_out,
+            ggml_tensor ** emb_layer_3_out) {
+
+        *emb_layer_1_out = nullptr;
+        *emb_layer_2_out = nullptr;
+        *emb_layer_3_out = nullptr;
+
+        if (this->ubatch.n_tokens <= 0 || this->ubatch.n_tokens % 7 != 0) {
+            LLAMA_LOG_WARN("%s: Invalid ubatch size n_tokens=%d provided for SNAC graph definition. Cannot define input nodes.\n",
+                    __func__, this->ubatch.n_tokens);
+            return false;
         }
 
-        int32_t n_tokens = ubatch.n_tokens;
-        int32_t n_frames = n_tokens / 7;
-        if (n_tokens % 7 != 0) {
-            LLAMA_LOG_INFO("build_codebook_embd: n_tokens (%d) not a multiple of 7, truncating\n", n_tokens);
-            n_frames = n_tokens / 7;
-        }
+        const int32_t n_tokens = this->ubatch.n_tokens;
+        const int32_t n_frames = n_tokens / 7;
 
-        // TODO: read from vq_strides
-        int32_t n_layer_1 = n_frames;
-        int32_t n_layer_2 = n_frames * 2;
-        int32_t n_layer_3 = n_frames * 4;
-
-        LLAMA_LOG_INFO("build_codebook_embd: n_frames = %d, n_layer_1 = %d, n_layer_2 = %d, n_layer_3 = %d\n",
-                n_frames, n_layer_1, n_layer_2, n_layer_3);
-
-        std::vector<int32_t> idx_1_data(n_layer_1);
-        std::vector<int32_t> idx_2_data(n_layer_2);
-        std::vector<int32_t> idx_3_data(n_layer_3);
-
-        // map codes to respective codebook
-        for (int32_t i = 0; i < n_frames; ++i) {
-            int32_t base_idx = i * 7;
-            idx_1_data[i] = ubatch.token[base_idx + 0];
-            idx_2_data[i * 2] = ubatch.token[base_idx + 1];
-            idx_2_data[i * 2 + 1] = ubatch.token[base_idx + 4];
-            idx_3_data[i * 4] = ubatch.token[base_idx + 2];
-            idx_3_data[i * 4 + 1] = ubatch.token[base_idx + 3];
-            idx_3_data[i * 4 + 2] = ubatch.token[base_idx + 5];
-            idx_3_data[i * 4 + 3] = ubatch.token[base_idx + 6];
-        }
+        const int32_t n_indices_l1 = n_frames * 1;
+        const int32_t n_indices_l2 = n_frames * 2;
+        const int32_t n_indices_l3 = n_frames * 4;
 
-        // Tensors used for codebook lookups
-        ggml_tensor * idx_layer_1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_layer_1);
-        ggml_tensor * idx_layer_2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_layer_2);
-        ggml_tensor * idx_layer_3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_layer_3);
+        ggml_tensor * idx1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_indices_l1);
+        ggml_tensor * idx2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_indices_l2);
+        ggml_tensor * idx3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_indices_l3);
 
-        if (!idx_layer_1 || !idx_layer_2 || !idx_layer_3) {
-            LLAMA_LOG_INFO("build_codebook_embd: Failed to allocate index tensors\n");
-            return;
+        if (!idx1 || !idx2 || !idx3) {
+            LLAMA_LOG_ERROR("%s: Failed to allocate ggml index tensors.\n", __func__);
+            return false;
         }
 
-        // ggml is lazy, so explicitly create buffers for codes to be placed in idx_layer_N
-        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
-        if (!cpu_buft) {
-            LLAMA_LOG_ERROR("build_codebook_embd: Failed to get CPU buffer type\n");
-            return;
-        }
+        ggml_set_name(idx1, "snac_indices_L1");
+        ggml_set_name(idx2, "snac_indices_L2");
+        ggml_set_name(idx3, "snac_indices_L3");
 
-        ggml_backend_buffer_t buffer_1 = ggml_backend_buft_alloc_buffer(cpu_buft, n_layer_1 * sizeof(int32_t));
-        ggml_backend_buffer_t buffer_2 = ggml_backend_buft_alloc_buffer(cpu_buft, n_layer_2 * sizeof(int32_t));
-        ggml_backend_buffer_t buffer_3 = ggml_backend_buft_alloc_buffer(cpu_buft, n_layer_3 * sizeof(int32_t));
+        // mark as inputs
+        ggml_set_input(idx1);
+        ggml_set_input(idx2);
+        ggml_set_input(idx3);
 
-        if (!buffer_1 || !buffer_2 || !buffer_3) {
-            LLAMA_LOG_ERROR("build_codebook_embd: Failed to allocate backend buffers\n");
-            if (buffer_1) ggml_backend_buffer_free(buffer_1);
-            if (buffer_2) ggml_backend_buffer_free(buffer_2);
-            if (buffer_3) ggml_backend_buffer_free(buffer_3);
-            return;
-        }
+        // add to res for future access via llama_context
+        res->add_input(std::make_unique<llm_graph_input_snac>(idx1, 0, this->hparams));
+        res->add_input(std::make_unique<llm_graph_input_snac>(idx2, 1, this->hparams));
+        res->add_input(std::make_unique<llm_graph_input_snac>(idx3, 2, this->hparams));
 
-        // move codes to idx_layer_N
-        idx_layer_1->buffer = buffer_1;
-        idx_layer_2->buffer = buffer_2;
-        idx_layer_3->buffer = buffer_3;
+        // lookup
+        *emb_layer_1_out = ggml_get_rows(ctx0, model.codebook[0], idx1);
+        *emb_layer_2_out = ggml_get_rows(ctx0, model.codebook[1], idx2);
+        *emb_layer_3_out = ggml_get_rows(ctx0, model.codebook[2], idx3);
 
-        idx_layer_1->data = ggml_backend_buffer_get_base(buffer_1);
-        idx_layer_2->data = ggml_backend_buffer_get_base(buffer_2);
-        idx_layer_3->data = ggml_backend_buffer_get_base(buffer_3);
+        if (!*emb_layer_1_out || !*emb_layer_2_out || !*emb_layer_3_out) {
+            LLAMA_LOG_ERROR("%s: Failed to create ggml_get_rows nodes.\n", __func__);
+            *emb_layer_1_out = *emb_layer_2_out = *emb_layer_3_out = nullptr; // Ensure outputs are null on failure
+            return false;
+        }
 
-        ggml_backend_tensor_set(idx_layer_1, idx_1_data.data(), 0, n_layer_1 * sizeof(int32_t));
-        ggml_backend_tensor_set(idx_layer_2, idx_2_data.data(), 0, n_layer_2 * sizeof(int32_t));
-        ggml_backend_tensor_set(idx_layer_3, idx_3_data.data(), 0, n_layer_3 * sizeof(int32_t));
+        ggml_set_name(*emb_layer_1_out, "snac_embd_L1");
+        ggml_set_name(*emb_layer_2_out, "snac_embd_L2");
+        ggml_set_name(*emb_layer_3_out, "snac_embd_L3");
 
-        *emb_layer_1 = ggml_get_rows(ctx0, model.codebook[0], idx_layer_1);
-        *emb_layer_2 = ggml_get_rows(ctx0, model.codebook[1], idx_layer_2);
-        *emb_layer_3 = ggml_get_rows(ctx0, model.codebook[2], idx_layer_3);
+        return true;
     }
 };
 
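This is where the lazy pattern comes together: the three res->add_input(...) calls register the index tensors with the graph result, and once the backend scheduler has allocated the graph, llama_context walks the registered inputs and invokes set_input() on each, which is when ggml_backend_tensor_set finally copies the code indices into the (possibly non-CPU) backend tensors. A hedged sketch of that driving loop; the member name `inputs` and exact call site are assumptions, as the real plumbing lives in llama_context's decode path:

    // sketch: populate all deferred graph inputs for the current ubatch
    for (auto & inp : res->inputs) { // list built via res->add_input(...)
        inp->set_input(&ubatch);     // e.g. fills snac_indices_L{1,2,3}
    }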
src/llama-model.h

Lines changed: 1 addition & 0 deletions

@@ -138,6 +138,7 @@ struct llama_layer_convnext {
 };
 
 struct llama_layer_snac_dec_block {
+
     struct ggml_tensor * alpha = nullptr;
 
     struct ggml_tensor * up_weight = nullptr;
