@@ -1489,8 +1489,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
14891489 ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
14901490
14911491 auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
1492- std::string tn_str = tn.str();
1493- ggml_tensor * t_meta = ml.get_tensor_meta(tn_str.c_str());
1492+ ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
14941493
14951494 if (!t_meta) {
14961495 if (flags & TENSOR_NOT_REQUIRED) {
@@ -11743,21 +11742,22 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
1174311742 }
1174411743};
1174511744
11746- // TODO: Placeholder
1174711745struct llm_build_snac_dec : public llm_graph_context {
1174811746
1174911747 llm_build_snac_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
1175011748 ggml_tensor * cur;
1175111749 ggml_tensor * emb_layer_1, * emb_layer_2, * emb_layer_3;
11752- build_codebook_embd(model, &emb_layer_1, &emb_layer_2, &emb_layer_3);
1175311750
11754- if (emb_layer_1 == nullptr || emb_layer_2 == nullptr || emb_layer_3 == nullptr) {
11751+ bool inputs = build_snac_inputs(model, &emb_layer_1, &emb_layer_2, &emb_layer_3);
11752+
11753+ if (!inputs) {
1175511754 // graph build is called with garbage ubatch codes during model init
1175611755 // in this case, bypass normal graph construction and return a dummy
1175711756 LLAMA_LOG_INFO("build_codebook_inputs returned null, using dummy tensor\n");
1175811757 cur = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 768, ubatch.n_tokens > 0 ? ubatch.n_tokens : 64, 1, 1);
1175911758 ggml_set_input(cur);
1176011759 } else {
11760+ // TODO: Upsampling is wrong
1176111761 // Projections
1176211762 cur = ggml_mul_mat(ctx0, ggml_reshape_2d(ctx0, model.codebook_proj_w[0], 8, 768), emb_layer_1);
1176311763 cur = ggml_reshape_4d(ctx0, cur, 768, emb_layer_1->ne[1], 1, 1);
@@ -11859,113 +11859,76 @@ struct llm_build_snac_dec : public llm_graph_context {
1185911859
1186011860 cur = ggml_cpy(ctx0, cur, ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]));
1186111861
11862- cb(cur, "result_embd", -1);
11862+ LLAMA_LOG_INFO("Final shape of cur = [%ld, %ld, %ld, %ld]\n",
11863+ cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
11864+
11865+ //cb(cur, "result_embd", -1);
1186311866 res->t_embd = cur;
1186411867 ggml_build_forward_expand(gf, cur);
1186511868 }
1186611869private:
11867- // TODO: SNAC expects a multilayered input from 3 different embedding matrices
11868- void build_codebook_embd(const llama_model & model,
11869- ggml_tensor ** emb_layer_1,
11870- ggml_tensor ** emb_layer_2,
11871- ggml_tensor ** emb_layer_3) {
11872-
11873- *emb_layer_1 = nullptr;
11874- *emb_layer_2 = nullptr;
11875- *emb_layer_3 = nullptr;
11876-
11877-
11878-
11879- bool is_initialized = (ubatch.token != nullptr && ubatch.n_tokens > 0);
11880- if (is_initialized) {
11881- for (int i = 0; i < ubatch.n_tokens; ++i) {
11882- if (ubatch.token[i] < 0 || ubatch.token[i] >= 4096) {
11883- is_initialized = false;
11884- break;
11885- }
11886- }
11887- }
11888-
11889- if (!is_initialized) {
11890- return;
11870+ // Create 3 input nodes used for lookups into 3 embd matrices
11871+ bool build_snac_inputs(const llama_model & model,
11872+ ggml_tensor ** emb_layer_1_out,
11873+ ggml_tensor ** emb_layer_2_out,
11874+ ggml_tensor ** emb_layer_3_out) {
11875+
11876+ *emb_layer_1_out = nullptr;
11877+ *emb_layer_2_out = nullptr;
11878+ *emb_layer_3_out = nullptr;
11879+
11880+ if (this->ubatch.n_tokens <= 0 || this->ubatch.n_tokens % 7 != 0) {
11881+ LLAMA_LOG_WARN("%s: Invalid ubatch size n_tokens=%d provided for SNAC graph definition. Cannot define input nodes.\n",
11882+ __func__, this->ubatch.n_tokens);
11883+ return false;
1189111884 }
1189211885
11893- int32_t n_tokens = ubatch.n_tokens;
11894- int32_t n_frames = n_tokens / 7;
11895- if (n_tokens % 7 != 0) {
11896- LLAMA_LOG_INFO("build_codebook_embd: n_tokens (%d) not a multiple of 7, truncating\n", n_tokens);
11897- n_frames = n_tokens / 7;
11898- }
11886+ const int32_t n_tokens = this->ubatch.n_tokens;
11887+ const int32_t n_frames = n_tokens / 7;
1189911888
11900- // TODO: read from vq_strides
11901- int32_t n_layer_1 = n_frames;
11902- int32_t n_layer_2 = n_frames * 2;
11903- int32_t n_layer_3 = n_frames * 4;
11904-
11905- LLAMA_LOG_INFO("build_codebook_embd: n_frames = %d, n_layer_1 = %d, n_layer_2 = %d, n_layer_3 = %d\n",
11906- n_frames, n_layer_1, n_layer_2, n_layer_3);
11907-
11908- std::vector<int32_t> idx_1_data(n_layer_1);
11909- std::vector<int32_t> idx_2_data(n_layer_2);
11910- std::vector<int32_t> idx_3_data(n_layer_3);
11911-
11912- // map codes to respective codebook
11913- for (int32_t i = 0; i < n_frames; ++i) {
11914- int32_t base_idx = i * 7;
11915- idx_1_data[i] = ubatch.token[base_idx + 0];
11916- idx_2_data[i * 2] = ubatch.token[base_idx + 1];
11917- idx_2_data[i * 2 + 1] = ubatch.token[base_idx + 4];
11918- idx_3_data[i * 4] = ubatch.token[base_idx + 2];
11919- idx_3_data[i * 4 + 1] = ubatch.token[base_idx + 3];
11920- idx_3_data[i * 4 + 2] = ubatch.token[base_idx + 5];
11921- idx_3_data[i * 4 + 3] = ubatch.token[base_idx + 6];
11922- }
11889+ const int32_t n_indices_l1 = n_frames * 1;
11890+ const int32_t n_indices_l2 = n_frames * 2;
11891+ const int32_t n_indices_l3 = n_frames * 4;
1192311892
11924- // Tensors used for codebook lookups
11925- ggml_tensor * idx_layer_1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_layer_1);
11926- ggml_tensor * idx_layer_2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_layer_2);
11927- ggml_tensor * idx_layer_3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_layer_3);
11893+ ggml_tensor * idx1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_indices_l1);
11894+ ggml_tensor * idx2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_indices_l2);
11895+ ggml_tensor * idx3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_indices_l3);
1192811896
11929- if (!idx_layer_1 || !idx_layer_2 || !idx_layer_3 ) {
11930- LLAMA_LOG_INFO("build_codebook_embd : Failed to allocate index tensors\n");
11931- return;
11897+ if (!idx1 || !idx2 || !idx3 ) {
11898+ LLAMA_LOG_ERROR("%s : Failed to allocate ggml index tensors. \n", __func__ );
11899+ return false ;
1193211900 }
1193311901
11934- // ggml is lazy, so explicitly create buffers for codes to be placed in idx_layer_N
11935- ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
11936- if (!cpu_buft) {
11937- LLAMA_LOG_ERROR("build_codebook_embd: Failed to get CPU buffer type\n");
11938- return;
11939- }
11902+ ggml_set_name(idx1, "snac_indices_L1");
11903+ ggml_set_name(idx2, "snac_indices_L2");
11904+ ggml_set_name(idx3, "snac_indices_L3");
1194011905
11941- ggml_backend_buffer_t buffer_1 = ggml_backend_buft_alloc_buffer(cpu_buft, n_layer_1 * sizeof(int32_t));
11942- ggml_backend_buffer_t buffer_2 = ggml_backend_buft_alloc_buffer(cpu_buft, n_layer_2 * sizeof(int32_t));
11943- ggml_backend_buffer_t buffer_3 = ggml_backend_buft_alloc_buffer(cpu_buft, n_layer_3 * sizeof(int32_t));
11906+ // mark as inputs
11907+ ggml_set_input(idx1);
11908+ ggml_set_input(idx2);
11909+ ggml_set_input(idx3);
1194411910
11945- if (!buffer_1 || !buffer_2 || !buffer_3) {
11946- LLAMA_LOG_ERROR("build_codebook_embd: Failed to allocate backend buffers\n");
11947- if (buffer_1) ggml_backend_buffer_free(buffer_1);
11948- if (buffer_2) ggml_backend_buffer_free(buffer_2);
11949- if (buffer_3) ggml_backend_buffer_free(buffer_3);
11950- return;
11951- }
11911+ // add to res for future access via llama_context
11912+ res->add_input(std::make_unique<llm_graph_input_snac>(idx1, 0, this->hparams));
11913+ res->add_input(std::make_unique<llm_graph_input_snac>(idx2, 1, this->hparams));
11914+ res->add_input(std::make_unique<llm_graph_input_snac>(idx3, 2, this->hparams));
1195211915
11953- // move codes to idx_layer_N
11954- idx_layer_1->buffer = buffer_1 ;
11955- idx_layer_2->buffer = buffer_2 ;
11956- idx_layer_3->buffer = buffer_3 ;
11916+ // lookup
11917+ *emb_layer_1_out = ggml_get_rows(ctx0, model.codebook[0], idx1) ;
11918+ *emb_layer_2_out = ggml_get_rows(ctx0, model.codebook[1], idx2) ;
11919+ *emb_layer_3_out = ggml_get_rows(ctx0, model.codebook[2], idx3) ;
1195711920
11958- idx_layer_1->data = ggml_backend_buffer_get_base(buffer_1);
11959- idx_layer_2->data = ggml_backend_buffer_get_base(buffer_2);
11960- idx_layer_3->data = ggml_backend_buffer_get_base(buffer_3);
11921+ if (!*emb_layer_1_out || !*emb_layer_2_out || !*emb_layer_3_out) {
11922+ LLAMA_LOG_ERROR("%s: Failed to create ggml_get_rows nodes.\n", __func__);
11923+ *emb_layer_1_out = *emb_layer_2_out = *emb_layer_3_out = nullptr; // Ensure outputs are null on failure
11924+ return false;
11925+ }
1196111926
11962- ggml_backend_tensor_set(idx_layer_1, idx_1_data.data(), 0, n_layer_1 * sizeof(int32_t) );
11963- ggml_backend_tensor_set(idx_layer_2, idx_2_data.data(), 0, n_layer_2 * sizeof(int32_t) );
11964- ggml_backend_tensor_set(idx_layer_3, idx_3_data.data(), 0, n_layer_3 * sizeof(int32_t) );
11927+ ggml_set_name(*emb_layer_1_out, "snac_embd_L1" );
11928+ ggml_set_name(*emb_layer_2_out, "snac_embd_L2" );
11929+ ggml_set_name(*emb_layer_3_out, "snac_embd_L3" );
1196511930
11966- *emb_layer_1 = ggml_get_rows(ctx0, model.codebook[0], idx_layer_1);
11967- *emb_layer_2 = ggml_get_rows(ctx0, model.codebook[1], idx_layer_2);
11968- *emb_layer_3 = ggml_get_rows(ctx0, model.codebook[2], idx_layer_3);
11931+ return true;
1196911932 }
1197011933};
1197111934
0 commit comments