 // For more details on how this works, see: https://github.com/ggml-org/llama.cpp/pull/12648
 
 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
+    LOG("\nExample usage:\n");
     LOG("\n    By default, the model will be downloaded from https://huggingface.co/ggml-org/sesame-csm-1b-GGUF");
     LOG("\n    %s -p \"[0]I have a dream that one day every valley shall be exalted\" -o output.wav", argv[0]);
     LOG("\n");
@@ -22,6 +22,11 @@ static void print_usage(int, char ** argv) {
     LOG("\n");
     LOG("\n    Note: the model needs 2 files to run, one ends with '-backbone-<quant>.gguf' and the other ends with '-decoder-<quant>.gguf'");
     LOG("\n");
+    LOG("\nPrompt format:");
+    LOG("\n    Each line must start with a speaker ID in square brackets, followed by the text. A full stop is recommended at the end of each turn");
+    LOG("\n    Example: [0]Hello world.");
+    LOG("\n    If you want to enter long text, use -f file.txt to read from file");
+    LOG("\n");
 }
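For illustration, a longer multi-turn prompt passed via -f file.txt would follow the same per-line format; the speaker IDs and text below are made up:

    [0]I have a dream that one day every valley shall be exalted.
    [1]And every hill and mountain shall be made low.
    [0]The rough places will be made plain.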
 
 // greedy sampling with custom n_vocab
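The body of sample_greedy is elided by the diff. A minimal sketch of a greedy argmax over the first n_vocab logits, matching the call site further down (an assumption, not the PR's code):

    // sketch only: pick the highest-scoring token among the first n_vocab logits
    static llama_token sample_greedy(const float * logits, int n_vocab) {
        int best = 0;
        for (int i = 1; i < n_vocab; ++i) {
            if (logits[i] > logits[best]) {
                best = i;
            }
        }
        return best;
    }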
@@ -61,7 +66,7 @@ int main(int argc, char ** argv) {
     params.model          = "sesame-csm-backbone.gguf";
     params.vocoder.model  = "kyutai-mimi.gguf";
     params.out_file       = "output.wav";
-    params.prompt         = "[0]Hello from Sesame.";
+    params.prompt         = "";
     params.n_predict      = 2048; // CSM's max trained seq length
 
     // HF model
@@ -75,6 +80,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
+    if (params.prompt.empty()) {
+        LOG_ERR("prompt is empty\n");
+        return 1;
+    }
+
     std::vector<float> embd;
     params.cb_eval = ggml_callback;
     params.cb_eval_user_data = &embd;
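ggml_callback itself is outside this hunk. A sketch of how such an eval callback could copy embeddings into the vector passed via cb_eval_user_data, assuming the standard ggml_backend_sched_eval_callback signature; the tensor name "output_embd" is hypothetical:

    // requires <vector>, <cstring>, "ggml.h", "ggml-backend.h"
    static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) {
        auto * embd = (std::vector<float> *) user_data;
        if (ask) {
            // first pass: tell the scheduler which tensor we want to observe
            return strcmp(t->name, "output_embd") == 0; // hypothetical name
        }
        // second pass: copy the tensor data out of the backend
        embd->resize(ggml_nelements(t));
        ggml_backend_tensor_get(t, embd->data(), 0, ggml_nbytes(t));
        return true;
    }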
@@ -167,7 +177,7 @@ int main(int argc, char ** argv) {
         // printf("\n");
 
         llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc));
-        printf("%d,", semantic_tok);
+        printf("Sem token %5d: %d,", 1 + (int) generated_codes.size()/32, semantic_tok);
         generated_codes.push_back(semantic_tok);
 
         // for (size_t i = 0; i < 10; ++i) {
@@ -200,7 +210,7 @@ int main(int argc, char ** argv) {
             // then, decode the semantic_tok to generate acoustic tokens
             llama_token tok = semantic_tok;
             int n_codes = 32;
-            int sum_codes = 0; // to check if all codes are 0
+            int sum_codes = semantic_tok; // to check if all codes are 0
             for (int i = 0; i < n_codes; ++i) {
                 common_batch_clear(batch_token);
                 // encoder vocab is further divided into 32 codebooks, each with 2051 entries
@@ -228,9 +238,12 @@ int main(int argc, char ** argv) {
                 }
 
                 // do progressive hsum of embeddings
-                GGML_ASSERT(inp_past_embd.size() == embd.size());
-                for (size_t i = 0; i < inp_past_embd.size(); ++i) {
-                    inp_past_embd[i] += embd[i];
+                // skip first semantic code
+                if (i > 0) {
+                    GGML_ASSERT(inp_past_embd.size() == embd.size());
+                    for (size_t i = 0; i < inp_past_embd.size(); ++i) {
+                        inp_past_embd[i] += embd[i];
+                    }
                 }
             }
             printf("\n");
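The comment about the 32 codebooks implies a flat layout of the vocab. A hypothetical helper (not in the PR) making that indexing explicit:

    // hypothetical: codebook i (0-based) would occupy ids
    // [i*2051, (i+1)*2051) in a flat vocab of 32*2051 entries
    static llama_token codebook_to_global(int i_codebook, llama_token local_code) {
        const int n_entries = 2051; // entries per codebook, per the comment above
        return i_codebook * n_entries + local_code;
    }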
@@ -253,6 +266,8 @@ int main(int argc, char ** argv) {
         // printf("\n");
 
         if (is_stop) {
+            // remove last 32 codes since they will be all zeros
+            generated_codes.resize(generated_codes.size() - 32);
             break;
         }
     }
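Since sum_codes now starts from the semantic token rather than 0, the stop check presumably fires only when the semantic token and all 32 decoder codes of a frame are zero. Under that assumption, the check reduces to:

    // assumed from the diff: an all-zero frame (semantic token plus all
    // 32 decoder codes) marks the end of the generated audio
    bool is_stop = (sum_codes == 0);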