Thireus
diff --git a/‎common/arg.cpp‎
Lines changed: 2 additions & 2 deletions b/‎common/arg.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎common/common.cpp‎
Lines changed: 5 additions & 0 deletions b/‎common/common.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎common/common.h‎
Lines changed: 3 additions & 0 deletions b/‎common/common.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎convert_hf_to_gguf.py‎
Lines changed: 77 additions & 2 deletions b/‎convert_hf_to_gguf.py‎
Lines changed: 77 additions & 2 deletions
diff --git a/‎examples/model-conversion/Makefile‎
Lines changed: 14 additions & 0 deletions b/‎examples/model-conversion/Makefile‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎examples/model-conversion/scripts/causal/convert-model.sh‎
Lines changed: 29 additions & 5 deletions b/‎examples/model-conversion/scripts/causal/convert-model.sh‎
Lines changed: 29 additions & 5 deletions
diff --git a/‎ggml/src/ggml-cann/common.h‎
Lines changed: 0 additions & 9 deletions b/‎ggml/src/ggml-cann/common.h‎
Lines changed: 0 additions & 9 deletions
diff --git a/‎ggml/src/ggml-cann/ggml-cann.cpp‎
Lines changed: 0 additions & 5 deletions b/‎ggml/src/ggml-cann/ggml-cann.cpp‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎ggml/src/ggml-cpu/ggml-cpu-impl.h‎
Lines changed: 1 addition & 1 deletion b/‎ggml/src/ggml-cpu/ggml-cpu-impl.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ggml/src/ggml-cpu/ops.cpp‎
Lines changed: 11 additions & 10 deletions b/‎ggml/src/ggml-cpu/ops.cpp‎
Lines changed: 11 additions & 10 deletions
@@ -2555,15 +2555,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+            params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
 
@@ -988,7 +988,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
 
@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;
 
+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };
 
 
@@ -72,6 +72,7 @@ class ModelBase:
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
+    dry_run: bool
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -111,6 +112,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager or (remote_hf_model_id is not None)
+        self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
         if remote_hf_model_id is not None:
             self.is_safetensors = True
@@ -4871,11 +4873,35 @@ def modify_tensors(self, data_torch, name, bid):
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
+    _lora_files = {}
+    _lora_names = []
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            hparams = ModelBase.load_hparams(dir_model, False)
+
+        if lora_names := hparams.get("lora_adaptations"):
+            self._lora_names = lora_names
+            self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
+
+        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
         self._xlmroberta_tokenizer_init()
 
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if self._lora_names:
+            for name in self._lora_names:
+                fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-")
+                self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run)
+
+        return super().generate_extra_tensors()
+
+    def set_type(self):
+        for lora_writer in self._lora_files.values():
+            lora_writer.add_type(gguf.GGUFType.ADAPTER)
+            lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
+        super().set_type()
+
     def set_vocab(self):
         self._xlmroberta_set_vocab()
 
@@ -4885,13 +4911,62 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.startswith("roberta."):
             name = name[8:]
 
+        # jina-embeddings-v3
+        if ".parametrizations." in name:
+            name = name.replace(".parametrizations.", ".")
+            if name.endswith(".original"):
+                name = name[:-9]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
                 data_torch = data_torch[self._position_offset:,:]
 
+        if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"):
+            if name.startswith("pooler.dense"):
+                return []
+
+            num_loras = data_torch.size(0)
+            assert num_loras == len(self._lora_names)
+
+            # Split out each LoRA in their own GGUF
+            for i, lora_writer in enumerate(self._lora_files.values()):
+                new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower()
+                data = data_torch[i, :, :]
+                # Transpose/flip token_embd/types into correct shape
+                if new_name == "token_embd.weight.lora_b":
+                    data = data.T
+                elif new_name.startswith("token_types.weight."):
+                    new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b")
+                lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32)
+
+            return []
+
         return super().modify_tensors(data_torch, name, bid)
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # jina-embeddings-v3
+        if rotary_emb_base := self.hparams.get("rotary_emb_base"):
+            self.gguf_writer.add_rope_freq_base(rotary_emb_base)
+        lora_alpha = self.hparams.get("lora_alpha")
+        if lora_prompt_prefixes := self.hparams.get("task_instructions"):
+            assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
+        for lora_name, lora_writer in self._lora_files.items():
+            lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0)
+            lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name)
+            if lora_prompt_prefixes:
+                lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name])
+
+    def write(self):
+        super().write()
+        for lora_writer in self._lora_files.values():
+            lora_writer.write_header_to_file()
+            lora_writer.write_kv_data_to_file()
+            lora_writer.write_tensors_to_file(progress=True)
+            lora_writer.close()
+
 
 @ModelBase.register("GemmaForCausalLM")
 class GemmaModel(TextModel):
 
@@ -37,6 +37,20 @@ causal-convert-model:
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/causal/convert-model.sh
 
+causal-convert-mm-model-bf16: OUTTYPE=bf16
+causal-convert-mm-model-bf16: MM_OUTTYPE=f16
+causal-convert-mm-model-bf16: causal-convert-mm-model
+
+causal-convert-mm-model:
+	$(call validate_model_path,causal-convert-mm-model)
+	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
+	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
+	./scripts/causal/convert-model.sh
+
+	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(MM_OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
+	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
+	./scripts/causal/convert-model.sh --mmproj
+
 causal-run-original-model:
 	$(call validate_model_path,causal-run-original-model)
 	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py
 
@@ -1,5 +1,21 @@
 #!/bin/bash
 
+set -e
+
+# Parse command line arguments
+MMPROJ=""
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --mmproj)
+            MMPROJ="--mmproj"
+            shift
+            ;;
+        *)
+            shift
+            ;;
+    esac
+done
+
 MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
 OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
 TYPE="${OUTTYPE:-f16}"
@@ -11,12 +27,20 @@ echo "Model name: ${MODEL_NAME}"
 echo "Data  type: ${TYPE}"
 echo "Converted model path:: ${CONVERTED_MODEL}"
 echo "Metadata override: ${METADATA_OVERRIDE}"
-python ../../convert_hf_to_gguf.py --verbose \
-    ${MODEL_PATH} \
-    --outfile ${CONVERTED_MODEL} \
-    --outtype ${TYPE} \
-    --metadata "${METADATA_OVERRIDE}"
+
+CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
+CMD_ARGS+=("${MODEL_PATH}")
+CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
+CMD_ARGS+=("--outtype" "${TYPE}")
+[[ -n "$METADATA_OVERRIDE" ]] && CMD_ARGS+=("--metadata" "${METADATA_OVERRIDE}")
+[[ -n "$MMPROJ" ]] && CMD_ARGS+=("${MMPROJ}")
+
+"${CMD_ARGS[@]}"
 
 echo ""
 echo "The environment variable CONVERTED_MODEL can be set to this path using:"
 echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
+if [[ -n "$MMPROJ" ]]; then
+    mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
+    echo "The mmproj model was created in $(realpath "$mmproj_file")"
+fi
@@ -374,7 +374,6 @@ struct ggml_backend_cann_context {
 #endif
     cann_task_queue task_queue;
     bool async_mode;
-    bool support_set_rows;
     // Rope Cache
     void* rope_init_ptr = nullptr;
     void* rope_sin_ptr = nullptr;
@@ -400,14 +399,6 @@ struct ggml_backend_cann_context {
         async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
             device, async_mode ? "ON" : "OFF");
-
-        support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
-        GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");
-
-        if (!support_set_rows) {
-            GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
-                    "Falling back to eager mode.\n", __func__);
-        }
     }
 
     /**
 
@@ -2251,11 +2251,6 @@ static enum ggml_status ggml_backend_cann_graph_compute(
     bool use_cann_graph = true;
     bool cann_graph_update_required = false;
 
-    // check environment LLAMA_SET_ROWS
-    if (!cann_ctx->support_set_rows) {
-        use_cann_graph = false;
-    }
-
     if (use_cann_graph) {
         if (cann_ctx->cann_graph == nullptr) {
             cann_ctx->cann_graph.reset(new ggml_cann_graph());
 
@@ -489,7 +489,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
 /**
  * @see https://github.com/ggml-org/llama.cpp/pull/14037
  */
-inline float vec_hsum(float32x4_t v) {
+inline static float vec_hsum(float32x4_t v) {
     float32x4_t v_temp = v + vec_reve(v);
     return v_temp[0] + v_temp[1];
 }
 
@@ -9003,8 +9003,7 @@ static void ggml_compute_forward_ssm_scan_f32(
     GGML_ASSERT(src4->nb[0] == sizeof(float));
     GGML_ASSERT(src5->nb[0] == sizeof(float));
     GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
-    // allows optimizing the modulo since n_group should be a power of 2
-    GGML_ASSERT((ng & -ng) == ng);
+    GGML_ASSERT(nh % ng == 0);
 
     // heads per thread
     const int dh = (nh + nth - 1)/nth;
@@ -9035,6 +9034,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                     // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
                     const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
                     const float dA = expf(dt_soft_plus * A[h]);
+                    const int g = h / (nh / ng); // repeat_interleave
 
                     // dim
                     for (int i1 = 0; i1 < nr; ++i1) {
@@ -9057,8 +9057,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                             // TODO: maybe unroll more?
                             for (int j = 0; j < 1; j++) {
                                 GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
-                                GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
-                                GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
+                                GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc);
+                                GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc);
 
                                 t0 = GGML_F32_VEC_MUL(t0, adA);
                                 t1 = GGML_F32_VEC_MUL(t1, axdt);
@@ -9090,8 +9090,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                         for (int i = 0; i < np; i += GGML_F32_STEP) {
                             for (int j = 0; j < GGML_F32_ARR; j++) {
                                 ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
-                                ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
-                                az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
+                                ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + g*nc);
+                                az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + g*nc);
 
                                 ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
                                 ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
@@ -9113,7 +9113,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                         // d_state
                         for (int i0 = np; i0 < nc; ++i0) {
                             const int i = i0 + ii*nc;
-                            const int ig = i0 + (h & (ng - 1))*nc;
+                            const int ig = i0 + g*nc;
                             // state = prev_state * dA + dB * x
                             const float state = (s0[i] * dA) + (B[ig] * x_dt);
                             // y = rowwise_dotprod(state, C)
@@ -9130,6 +9130,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 for (int h = ih0; h < ih1; ++h) {
                     // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
                     const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+                    const int g = h / (nh / ng); // repeat_interleave
 
                     // dim
                     for (int i1 = 0; i1 < nr; ++i1) {
@@ -9144,8 +9145,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                         // TODO: what happens when (d_state % svcntw()) != 0?
                         for (int64_t k = 0; k < nc; k += svcntw()) {
                             svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
-                            svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]);
-                            svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]);
+                            svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + g*nc]);
+                            svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + g*nc]);
                             svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);
 
                             svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
@@ -9165,7 +9166,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                         // d_state
                         for (int i0 = 0; i0 < nc; ++i0) {
                             const int i = i0 + ii*nc;
-                            const int ig = i0 + (h & (ng - 1))*nc;
+                            const int ig = i0 + g*nc;
                             // state = prev_state * dA + dB * x
                             const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
                             // y = rowwise_dotprod(state, C)
Original file line number	Diff line number	Diff line change
`@@ -988,7 +988,12 @@ struct common_init_result common_init_from_params(common_params & params) {`
`988`	`988`	`return iparams;`
`989`	`989`	`}`
`990`	`990`
	`991`	`+ char buf[1024];`
`991`	`992`	`la.ptr = lora.get();`
	`993`	`+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));`
	`994`	`+ la.task_name = buf;`
	`995`	`+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));`
	`996`	`+ la.prompt_prefix = buf;`
`992`	`997`	`iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters`
`993`	`998`	`}`
`994`	`999`
Original file line number	Diff line number	Diff line change
`@@ -489,7 +489,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {`
`489`	`489`	`/**`
`490`	`490`	`* @see https://github.com/ggml-org/llama.cpp/pull/14037`
`491`	`491`	`*/`
`492`		`-inline float vec_hsum(float32x4_t v) {`
	`492`	`+inline static float vec_hsum(float32x4_t v) {`
`493`	`493`	`float32x4_t v_temp = v + vec_reve(v);`
`494`	`494`	`return v_temp[0] + v_temp[1];`
`495`	`495`	`}`