@@ -486,280 +486,6 @@ static void debug_test_mrope_2d() {
     ggml_backend_free(backend);
 }
 
-static void debug_patch_layout() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-    // #ifdef GGML_USE_CUDA
-    // fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    // backend = ggml_backend_cuda_init(0); // init device 0
-    // backend_name = "cuda";
-    // if (!backend) {
-    // fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    // }
-    // #endif
-    // if there is no GPU backend, fall back to the CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 2 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int patches_w = 14;
-    const int patches_h = 10;
-    const int c = 2;
-    const int batch_size = 1;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, patches_w, patches_h, c, batch_size);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(patches_w * patches_h * c * batch_size);
-    for (size_t i = 0; i < patches_h * patches_w * c; i++)
-    {
-        dummy_q[i] = i;
-    }
-
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to the backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-
-    // 6. Create a `ggml_cgraph` for the patch-layout computation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx0 = NULL;
-
-    // create a temporary context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx0 = ggml_init(params0);
-    gf = ggml_new_graph(ctx0);
-    /*
-        Compute graph
-    */
-    struct ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
-
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, patches_h, batch_size);
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-    inp = ggml_reshape_3d(
-        ctx0, inp,
-        c, patches_w * patches_h, batch_size);
-
-    // Add the "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, inp);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, the output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = inp;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in the device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "patch_layout_" + backend_name + ".bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char *>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx0);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-static void debug_test_get_rows() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-    // #ifdef GGML_USE_CUDA
-    // fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    // backend = ggml_backend_cuda_init(0); // init device 0
-    // backend_name = "cuda";
-    // if (!backend) {
-    // fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    // }
-    // #endif
-    // if there is no GPU backend, fall back to the CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 128 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int tokens = 30;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 3, tokens * 2);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-    struct ggml_tensor * pos = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 4, tokens);
-    // struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens * 4);
-    ggml_set_name(pos, "pos");
-    ggml_set_input(pos);
-
-    struct ggml_tensor * ind = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens);
-    ggml_set_name(ind, "ind");
-    ggml_set_input(ind);
-
-    struct ggml_tensor * ind_2d = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 1, tokens);
-    ggml_set_name(ind_2d, "ind_2d");
-    ggml_set_input(ind_2d);
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(128 * 3 * inp_raw->ne[2]);
-    for (int i = 0; i < inp_raw->ne[2]; i++) {
-        for (int j = 0; j < 3; j++) {
-            int offset = i * 128 * 3 + j * 128;
-            std::fill(dummy_q.begin() + offset, dummy_q.begin() + offset + 128, 0.1 * i);
-        }
-    }
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    std::vector<int> pos_id;
-    pos_id.resize(tokens * 4);
-    for (int i = 0; i < tokens; i++) {
-        pos_id[i] = i;
-        pos_id[i + tokens * 1] = i + 10;
-        pos_id[i + tokens * 2] = i + 20;
-        pos_id[i + tokens * 3] = i + 30;
-    }
-
-    std::vector<int> remap_ind;
-    remap_ind.resize(tokens * 4);
-    for (int i = 0; i < tokens; i++) {
-        remap_ind[i] = tokens - i - 1;
-    }
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to the backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
-    ggml_backend_tensor_set(ind, remap_ind.data(), 0, ggml_nbytes(ind));
-    ggml_backend_tensor_set(ind_2d, remap_ind.data(), 0, ggml_nbytes(ind_2d));
-
-    // 6. Create a `ggml_cgraph` for the get_rows computation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx_cgraph = NULL;
-
-    // create a temporary context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx_cgraph = ggml_init(params0);
-    gf = ggml_new_graph(ctx_cgraph);
-
-    // ne = [128, 1, 30, 1]
-    auto x = ggml_reshape_2d(ctx_cgraph, inp_raw, 128 * 3 * 2, tokens);
-    struct ggml_tensor * result0 = ggml_get_rows(
-        ctx_cgraph, x, ind);
-    result0 = ggml_reshape_3d(ctx_cgraph, result0, 128, 3, tokens * 2);
-
-    struct ggml_tensor * result1 = ggml_get_rows(
-        ctx_cgraph, pos, ind);
-
-    // Add the "result" tensors and all of their dependencies to the cgraph
-    ggml_build_forward_expand(gf, result0);
-    ggml_build_forward_expand(gf, result1);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, the output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = result0;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in the device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "getrows_" + backend_name + "_0.bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char *>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx_cgraph);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-
 enum model_output_type {
     conv3d,
     patch_embed,
@@ -955,9 +681,6 @@ int main(int argc, char ** argv) {
     // debug_test_mrope_2d();
     debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
     // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
-    // debug_test_get_rows();
-    // dump_win_attn_mask();
-    // debug_patch_layout();
 
     llama_perf_context_print(ctx_llava->ctx_llama);
     ctx_llava->model = NULL;