 #include "mpt_v3.cpp"
 #include "examples/llava/clip.h"
 #include "examples/llava/llava.h"
-#include "experimental/emphasis.h"
+//#include "experimental/emphasis.h"

 //const
 const int extra_context_handle_fragmentation = 120;
@@ -1684,7 +1684,7 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
                              file_format==FileFormat::RWKV_2);
     if(!approved_format || desiredBlasBatchSize<=0)
     {
-        desiredBlasBatchSize = 16;
+        desiredBlasBatchSize = 32;
     }
     if(file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
     {
@@ -1708,7 +1708,7 @@ static int GetUBatchSize(int desiredBlasUBatchSize,FileFormat in_file_format)
                              file_format==FileFormat::RWKV_2);
     if(!approved_format || desiredBlasUBatchSize<=0)
     {
-        desiredBlasUBatchSize = 16;
+        desiredBlasUBatchSize = 32;
     }
     if(file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
     {
@@ -1759,14 +1759,14 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
         // }
         return rope_freq_base_with_positive_offset;
     }
-    else if(model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
+    /* else if(model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
     {
         float extended_rope_negative_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / (3.14159265358979323846 * 3.14159265358979323846));
         float rope_freq_base_with_negative_offset = gradient_ai_rope_freq_base_value / extended_rope_negative_offset_value;
         printf("Extended RoPE Negative Offset (divisor) for Llama 1 and 2 based models. (value:%.3f).\n", extended_rope_negative_offset_value);
         printf("RoPE base calculated via Gradient AI formula for Llama 1 and 2 based models. (value:%.1f).\n", rope_freq_base_with_negative_offset);
         return rope_freq_base_with_negative_offset;
-    }
+    } */
     else
     {
         return gradient_ai_rope_freq_base_value;
@@ -2566,8 +2566,220 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        printf("\nUnknown Model, cannot load.\n");
-        return ModelLoadResult::FAIL;
+        // printf("\nUnknown Model, cannot load.\n");
+        // return ModelLoadResult::FAIL;
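+        // unknown formats now fall through to the standard llama.cpp loader path below instead of failing outright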
2571+ llama_backend_init ();
2572+
2573+ llama_model_params model_params = llama_model_default_params ();
2574+ llama_context_params llama_ctx_params = llama_context_default_params ();
2575+ llama_ctx_params.n_ctx = clamped_max_context_length;
2576+ if (kcpp_data->use_contextshift )
2577+ {
2578+ llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
2579+ }
2580+
2581+ llama_ctx_params.offload_kqv = !inputs.low_vram ;
2582+ llama_ctx_params.logits_all = false ;
2583+ model_params.use_mmap = inputs.use_mmap ;
2584+ model_params.use_mlock = inputs.use_mlock ;
2585+ model_params.n_gpu_layers = inputs.gpulayers ;
2586+
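+        // backend-specific GPU offload handling: OpenCL cannot offload some architectures at all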
+        #if defined(GGML_USE_CLBLAST)
+        if(file_format==FileFormat::GGUF_GENERIC && model_params.n_gpu_layers>0)
+        {
+            if(file_format_meta.model_architecture == GGUFArch::ARCH_FALCON)
+            {
+                printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
+                model_params.n_gpu_layers = 0;
+            }
+            else if(file_format_meta.n_expert_count>1)
+            {
+                printf("\nOpenCL cannot use regular GPU offloading for this model architecture. A fallback GPU offloader will be used with degraded performance.\n");
+
+            }
+        }
+        #endif
+        #if defined(GGML_USE_CUDA)
+        if(cu_parseinfo_maindevice>0)
+        {
+            printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
+        }
+        ggml_cuda_set_mul_mat_q(inputs.use_mmq);
+        if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 && !kcpp_data->flash_attn)
+        {
+            printf("CUBLAS: Warning, you are running Qwen2 without Flash Attention and may observe incoherent output.\n");
+        }
+        #endif
+        model_params.main_gpu = cu_parseinfo_maindevice;
+
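+        // row-wise tensor splitting is only available on CUDA; other backends always split by layer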
+        #if defined(GGML_USE_CUDA)
+        model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER);
+        #else
+        model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+        #endif
+
+        llama_ctx_params.n_batch = kcpp_data->n_batch;
+        llama_ctx_params.n_ubatch = kcpp_data->n_ubatch;
+        llama_ctx_params.n_threads = kcpp_data->n_threads;
+        llama_ctx_params.n_threads_batch = kcpp_data->n_blasthreads;
+
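+        // apply user-specified multi-GPU tensor split ratios only if at least one value is nonzero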
+        #if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
+        bool ts_all_zero = true;
+        for (int i = 0; i < tensor_split_max; ++i) {
+            if (inputs.tensor_split[i] != 0.0f) {
+                ts_all_zero = false;
+                break;
+            }
+        }
+        if(!ts_all_zero)
+        {
+            printf("\nApplying Tensor Split...");
+            model_params.tensor_split = inputs.tensor_split;
+        }
+        #endif
+
+        //compat for old falcon
+        if(file_format_meta.fileversion==1)
+        {
+            //apply compat fix
+            printf("\nUsing older tokenizer for GGUFv1...");
+            OldBPETokenizerMode = true;
+        }
+
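+        // load the model first, then either keep the caller-supplied RoPE values or derive them automatically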
+        llama_model * llamamodel = llama_load_model_from_file(kcpp_data->model_filename.c_str(), model_params);
+        if(overwriteRope)
+        {
+            llama_ctx_params.rope_freq_base = rope_freq_base;
+            llama_ctx_params.rope_freq_scale = rope_freq_scale;
+        }
+        else
+        {
+            //if the model modifies rope in any way, or uses yarn, use the model values. Otherwise, use our automatic ones
+            //special exception for llama, which uses auto scale
+            if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
+               llamamodel->hparams.rope_freq_scale_train!=1.0f ||
+               llamamodel->hparams.rope_scaling_type_train==2)
+            {
+                printf("Automatic RoPE Scaling: Using model internal value.\n");
+            }
+            else
+            {
+                //Calculate rope_freq_base using the gradientAI formula, solar requires ctx *8 for correct scaling
+                rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_data->n_ctx, file_format_meta.model_architecture);
+                llama_ctx_params.rope_freq_base = rope_freq_base;
+                llama_ctx_params.rope_freq_scale = rope_freq_scale;
+                printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
+            }
+        }
+
+        if(file_format_meta.model_architecture==GGUFArch::ARCH_RWKV)
+        {
+            printf("\nRWKV6 Overriding EOS and BOS IDs to 0\n");
+            llamamodel->vocab.special_bos_id = llamamodel->vocab.special_eos_id = 0;
+        }
+
+        llama_ctx_params.flash_attn = kcpp_data->flash_attn;
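+        // map the numeric quant_k / quant_v selections onto ggml KV-cache tensor types; unrecognized values fall back to F16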
+        llama_ctx_params.type_k =
+            (inputs.quant_k==22?GGML_TYPE_IQ4_NL:
+            (inputs.quant_k==21?GGML_TYPE_Q4_0:
+            (inputs.quant_k==20?GGML_TYPE_Q4_1:
+            (inputs.quant_k==19?GGML_TYPE_Q5_0:
+            (inputs.quant_k==18?GGML_TYPE_Q5_1:
+            (inputs.quant_k==17?GGML_TYPE_Q6_0:
+            (inputs.quant_k==16?GGML_TYPE_Q8_0:
+            (inputs.quant_k==15?GGML_TYPE_F16:
+            (inputs.quant_k==14?GGML_TYPE_IQ4_NL:
+            (inputs.quant_k==13?GGML_TYPE_Q5_0:
+            (inputs.quant_k==12?GGML_TYPE_Q5_1:
+            (inputs.quant_k==11?GGML_TYPE_Q5_1:
+            (inputs.quant_k==10?GGML_TYPE_Q6_0:
+            (inputs.quant_k==9?GGML_TYPE_Q6_0:
+            (inputs.quant_k==8?GGML_TYPE_Q6_0:
+            (inputs.quant_k==7?GGML_TYPE_Q8_0:
+            (inputs.quant_k==6?GGML_TYPE_Q8_0:
+            (inputs.quant_k==5?GGML_TYPE_Q8_0:
+            (inputs.quant_k==4?GGML_TYPE_F16:
+            (inputs.quant_k==3?GGML_TYPE_F16:
+            (inputs.quant_k==2?GGML_TYPE_Q4_0:
+            (inputs.quant_k==1?GGML_TYPE_Q8_0:
+            GGML_TYPE_F16))))))))))))))))))))));
+        llama_ctx_params.type_v =
+            (inputs.quant_v==22?GGML_TYPE_F16:
+            (inputs.quant_v==21?GGML_TYPE_F16:
+            (inputs.quant_v==20?GGML_TYPE_F16:
+            (inputs.quant_v==19?GGML_TYPE_F16:
+            (inputs.quant_v==18?GGML_TYPE_F16:
+            (inputs.quant_v==17?GGML_TYPE_F16:
+            (inputs.quant_v==16?GGML_TYPE_F16:
+            (inputs.quant_v==15?GGML_TYPE_F16:
+            (inputs.quant_v==14?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==13?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==12?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==11?GGML_TYPE_Q5_0:
+            (inputs.quant_v==10?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==9?GGML_TYPE_Q5_0:
+            (inputs.quant_v==8?GGML_TYPE_Q6_0:
+            (inputs.quant_v==7?GGML_TYPE_IQ4_NL:
+            (inputs.quant_v==6?GGML_TYPE_Q5_0:
+            (inputs.quant_v==5?GGML_TYPE_Q6_0:
+            (inputs.quant_v==4?GGML_TYPE_Q6_0:
+            (inputs.quant_v==3?GGML_TYPE_Q8_0:
+            (inputs.quant_v==2?GGML_TYPE_Q4_0:
+            (inputs.quant_v==1?GGML_TYPE_Q8_0:
+            GGML_TYPE_F16))))))))))))))))))))));
+        llama_ctx_v4 = llama_new_context_with_model(llamamodel, llama_ctx_params);
+
+        if(llama_ctx_v4 == NULL)
+        {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, kcpp_data->model_filename.c_str());
+            return ModelLoadResult::FAIL;
+        }
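+        // optionally apply a LoRA adapter on top of the freshly loaded model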
+        if (lora_filename != "")
+        {
+            printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
+
+            const char * lora_base_arg = NULL;
+            if (lora_base != "") {
+                printf("Using LORA base model: %s\n", lora_base.c_str());
+                lora_base_arg = lora_base.c_str();
+            }
+
+            auto adapter = llama_lora_adapter_init(llamamodel, lora_filename.c_str());
+            if (adapter == nullptr) {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                return ModelLoadResult::FAIL;
+            }
+            llama_lora_adapter_set(llama_ctx_v4, adapter, 1.0f);
+        }
+
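+        // optional multimodal projector (LLaVA-style); its embedding width must match the base model's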
+        if(mmproj_filename != "" && file_format==FileFormat::GGUF_GENERIC)
+        {
+            printf("\nAttempting to apply Multimodal Projector: %s\n", mmproj_filename.c_str());
+            clp_ctx = clip_model_load(mmproj_filename.c_str(), /*verbosity=*/ 1);
+            if(clp_ctx == nullptr) {
+                fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
+                return ModelLoadResult::FAIL;
+            }
+            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
+            const int n_embd_llm = llama_n_embd(llamamodel);
+            if (n_embd_clip != n_embd_llm) {
+                fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__, n_embd_clip, n_embd_llm);
+                return ModelLoadResult::FAIL;
+            }
+            clp_img_data = clip_image_u8_init();
+        }
+
+        n_vocab = llama_n_vocab(llamamodel);
+
+        //determine mem per token
+        std::vector<int> tmp = {1, 2, 3, 4};
+        llama_kv_cache_clear(llama_ctx_v4);
+        auto er = llama_decode(llama_ctx_v4, llama_batch_get_one(tmp.data(), tmp.size()));
+        if(er!=0)
+        {
+            printf("\nLLAMA EVAL returned nonzero: %d\n",er);
+        }
+        return ModelLoadResult::SUCCESS;
     }

 }
@@ -3150,7 +3362,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         }
     }

-    bool blasmode = (embd_inp.size() >= 1 && kcpp_cpu_has_blas() && kcpp_data->n_batch>=1);
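+    // require a prompt of at least 32 tokens (and a batch size of at least 32) before enabling the BLAS batched path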
+    bool blasmode = (embd_inp.size() >= 32 && kcpp_cpu_has_blas() && kcpp_data->n_batch>=32);

     current_context_tokens.resize(n_past);

@@ -3240,9 +3452,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         printf("%s\n\n", RemoveBell(outstr).c_str());
     }

-    if (llama_ctx_v4) {
-        empcats_init(llama_ctx_v4, embd_inp, grammarstr);
-    }
+    // if (llama_ctx_v4) {
+    //     empcats_init(llama_ctx_v4, embd_inp, grammarstr);
+    // }

     while (remaining_tokens > 0)
     {
@@ -3432,16 +3644,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             std::vector<int>& bans = antislop_banned_token_ids[n_past];

             // unwanted print, but.. ^^
-            print_tok_vec_str(bans);
+            // print_tok_vec_str(bans);
+
             for(int t=0;t<bans.size();++t)
             {
                 logitsPtr[bans[t]]=lowestLogit;
             }
         }

-        if (llama_ctx_v4) {
-            empcats_step_pre(llama_ctx_v4, logitsPtr);
-        }
+        // if (llama_ctx_v4) {
+        //     empcats_step_pre(llama_ctx_v4, logitsPtr);
+        // }

         id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, kcpp_data->rep_pen_slope, presence_penalty,
                           top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
@@ -3450,9 +3663,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                           kcpp_data->dry_allowed_length, kcpp_data->dry_penalty_last_n, kcpp_data->xtc_threshold, kcpp_data->xtc_probability,
                           sampler_order, grammar, dynatemp_range, dynatemp_exponent, smoothing_factor);

-        if (llama_ctx_v4) {
-            empcats_step_post(llama_ctx_v4, id );
-        }
+        // if (llama_ctx_v4) {
+        //     empcats_step_post(llama_ctx_v4, id );
+        // }

         if (grammar != nullptr) {
             grammar_accept_token(file_format, n_vocab, grammar, id);