Commit 05329e7

Merge qwen3_next into testing-branch10 (took qwen3_next version for src/llama-model.cpp)
1 parent d89b381 commit 05329e7

202 files changed (+16196, -13249 lines)


common/arg.cpp

Lines changed: 7 additions & 0 deletions

```diff
@@ -1951,6 +1951,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.kv_unified = true;
         }
     ).set_env("LLAMA_ARG_KV_SPLIT"));
+    add_opt(common_arg(
+        {"--dump-cache"},
+        "dump cache statistics after each token generation",
+        [](common_params & params) {
+            params.dump_cache = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
```

common/common.h

Lines changed: 2 additions & 0 deletions

```diff
@@ -398,6 +398,8 @@ struct common_params {

     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
+    bool dump_cache = false; // dump cache statistics after each token

     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

```

convert_hf_to_gguf.py

Lines changed: 30 additions & 0 deletions

```diff
@@ -3826,6 +3826,36 @@ def set_vocab(self):
         super().set_vocab()


+@ModelBase.register("Qwen3NextForCausalLM")
+class Qwen3NextModel(Qwen3MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3NEXT
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["linear_conv_kernel_dim"]))
+        self.gguf_writer.add_ssm_state_size(self.find_hparam(["linear_key_head_dim"]))
+        self.gguf_writer.add_ssm_group_count(self.find_hparam(["linear_num_key_heads"]))
+        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["linear_num_value_heads"]))
+        self.gguf_writer.add_ssm_inner_size(self.find_hparam(['linear_value_head_dim']) * self.find_hparam(['linear_num_value_heads']))
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("mtp"):
+            return []  # ignore MTP layers for now
+        if name.endswith(".A_log"):
+            data_torch = -torch.exp(data_torch)
+        elif name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+        elif "conv1d" in name:
+            data_torch = data_torch.squeeze()
+        elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
+            data_torch = data_torch + 1
+
+        yield from Qwen2MoeModel.modify_tensors(self, data_torch, name, bid)
+
+
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2
```
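The new converter class is selected through its `@ModelBase.register("Qwen3NextForCausalLM")` registration when the checkpoint's `architectures` field matches. A sketch of how it would be exercised, assuming a local Hugging Face checkpoint; the directory and output paths are placeholders:

```sh
# Placeholder paths; converts a local Qwen3-Next HF checkpoint to GGUF.
# Qwen3NextModel handles the SSM hparams and tensor rewrites shown above.
python convert_hf_to_gguf.py ./Qwen3-Next-80B-A3B-Instruct \
    --outfile ./qwen3-next.gguf --outtype f16
```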

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -154,7 +154,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {

     if (!ggml_is_quantized(t->type)) {
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 8);
     }

     return true;
```
examples/model-conversion/scripts/causal/run-converted-model.sh

Lines changed: 7 additions & 1 deletion

```diff
@@ -4,6 +4,11 @@ set -e

 # First try command line argument, then environment variable, then file
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
+
+if [ -z "$MODEL_TESTING_PROMPT" ]; then
+    MODEL_TESTING_PROMPT="Hello, my name is"
+fi

 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
@@ -14,7 +19,8 @@ if [ -z "$CONVERTED_MODEL" ]; then
 fi

 echo $CONVERTED_MODEL
+echo $MODEL_TESTING_PROMPT

 cmake --build ../../build --target llama-logits -j8

-../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"
```
