Skip to content

Commit 0e68971

Browse files
Merge pull request #339 from janhq/update-dev-from-master-2025-11-25-00-34
Sync master with upstream release b7150
2 parents edbf778 + 3d07caa commit 0e68971

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+4499
-3031
lines changed

convert_hf_to_gguf.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -565,7 +565,7 @@ def prepare_tensors(self):
565565
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
566566
)
567567
)
568-
or not new_name.endswith(".weight")
568+
or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
569569
):
570570
data_qtype = gguf.GGMLQuantizationType.F32
571571

@@ -4183,6 +4183,21 @@ def set_vocab(self):
41834183
super().set_vocab()
41844184

41854185

4186+
@ModelBase.register("RND1")
4187+
class RND1Model(Qwen2MoeModel):
4188+
model_arch = gguf.MODEL_ARCH.RND1
4189+
4190+
def set_gguf_parameters(self):
4191+
super().set_gguf_parameters()
4192+
4193+
# RND1 specific parameters
4194+
# RND1 uses bidirectional attention
4195+
self.gguf_writer.add_causal_attention(False)
4196+
4197+
if (mask_token_id := self.hparams.get("mask_token_id")) is not None:
4198+
self.gguf_writer.add_mask_token_id(mask_token_id)
4199+
4200+
41864201
@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
41874202
class Qwen3VLVisionModel(MmprojModel):
41884203
def __init__(self, *args, **kwargs):

convert_lora_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
242242
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
243243
)
244244
parser.add_argument(
245-
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
245+
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
246246
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
247247
)
248248
parser.add_argument(

examples/batched/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
The example demonstrates batched generation from a given prompt
44

55
```bash
6-
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
6+
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified
77

88
...
99

examples/diffusion/README.md

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,54 @@ More Info:
66
- https://github.com/ggml-org/llama.cpp/pull/14644
77
- https://github.com/ggml-org/llama.cpp/pull/14771
88

9+
## Parameters
10+
The diffusion CLI supports various parameters to control the generation process:
911

10-
Example of using the Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
12+
### Core Diffusion Parameters
13+
- `--diffusion-steps`: Number of diffusion steps (default: 256)
14+
- `--diffusion-algorithm`: Algorithm for token selection
15+
- `0`: ORIGIN - Tokens are generated in a purely random order (see https://arxiv.org/abs/2107.03006).
16+
- `1`: ENTROPY_BASED - Entropy-based selection
17+
- `2`: MARGIN_BASED - Margin-based selection
18+
- `3`: RANDOM - Random selection
19+
- `4`: CONFIDENCE_BASED - Confidence-based selection (default)
20+
- More documentation is available at https://github.com/DreamLM/Dream
21+
- `--diffusion-visual`: Enable live visualization during generation
1122

12-
Example of using the LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
23+
### Scheduling Parameters
24+
Choose one of the following scheduling methods:
1325

26+
**Timestep-based scheduling:**
27+
- `--diffusion-eps`: Epsilon value for timestep scheduling (e.g., 0.001)
28+
29+
**Block-based scheduling:**
30+
- `--diffusion-block-length`: Block size for block-based scheduling (e.g., 32)
31+
32+
### Sampling Parameters
33+
- `--temp`: Temperature for sampling (0.0 = greedy/deterministic, higher = more random)
34+
- `--top-k`: Top-k filtering for sampling
35+
- `--top-p`: Top-p (nucleus) filtering for sampling
36+
- `--seed`: Random seed for reproducibility
37+
38+
### Model Parameters
39+
- `-m`: Path to the GGUF model file
40+
- `-p`: Input prompt text
41+
- `-ub`: Maximum sequence length (ubatch size)
42+
- `-c`: Context size
43+
- `-b`: Batch size
44+
45+
### Examples
46+
#### Dream architecture:
47+
```
48+
llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
49+
```
50+
51+
#### LLaDA architecture:
52+
```
53+
llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
54+
```
55+
56+
#### RND1 architecture:
57+
```
58+
llama-diffusion-cli -m RND1-Base-0910.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-algorithm 1 --diffusion-steps 256 --diffusion-visual --temp 0.5 --diffusion-eps 0.001
59+
```

ggml/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,17 @@ if(GIT_EXE)
2525
)
2626
endif()
2727

28-
# Build the version string with optional dirty flag
2928
set(GGML_VERSION "${GGML_VERSION_BASE}")
30-
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
31-
set(GGML_VERSION "${GGML_VERSION}-dirty")
32-
endif()
3329

3430
if(NOT GGML_BUILD_COMMIT)
3531
set(GGML_BUILD_COMMIT "unknown")
3632
endif()
3733

34+
# Build the commit string with optional dirty flag
35+
if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
36+
set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
37+
endif()
38+
3839
include(CheckIncludeFileCXX)
3940

4041
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

ggml/src/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,14 @@ function(ggml_add_cpu_backend_variant tag_name)
328328
set(GGML_INTERNAL_${feat} OFF)
329329
endforeach()
330330

331+
foreach (feat ${ARGN})
332+
set(GGML_INTERNAL_${feat} ON)
333+
endforeach()
334+
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
335+
foreach (feat RVV)
336+
set(GGML_INTERNAL_${feat} OFF)
337+
endforeach()
338+
331339
foreach (feat ${ARGN})
332340
set(GGML_INTERNAL_${feat} ON)
333341
endforeach()
@@ -402,6 +410,13 @@ if (GGML_CPU_ALL_VARIANTS)
402410
else()
403411
message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
404412
endif()
413+
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
414+
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
415+
ggml_add_cpu_backend_variant(riscv64_0)
416+
ggml_add_cpu_backend_variant(riscv64_v RVV)
417+
else()
418+
message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
419+
endif()
405420
else()
406421
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
407422
endif()

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2303,9 +2303,9 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
23032303
// calculate rope cache for fist layer in current device.
23042304
cann_ctx->rope_cache.cached = false;
23052305

2306+
bool cann_graph_update_required = false;
23062307
#ifdef USE_ACL_GRAPH
23072308
bool use_cann_graph = true;
2308-
bool cann_graph_update_required = false;
23092309

23102310
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
23112311
if (!prefill_use_graph) {
@@ -2336,7 +2336,6 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
23362336
}
23372337
#else
23382338
bool use_cann_graph = false;
2339-
bool cann_graph_update_required = false;
23402339
#endif // USE_ACL_GRAPH
23412340
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);
23422341

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -452,22 +452,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
452452
ggml-cpu/spacemit/ime_kernels.h
453453
)
454454
endif()
455-
set(MARCH_STR "rv64gc")
456-
if (GGML_RV_ZFH)
457-
string(APPEND MARCH_STR "_zfh")
458-
endif()
459-
if (GGML_XTHEADVECTOR)
460-
string(APPEND MARCH_STR "_xtheadvector")
461-
elseif (GGML_RVV)
462-
string(APPEND MARCH_STR "_v")
463-
if (GGML_RV_ZVFH)
464-
string(APPEND MARCH_STR "_zvfh")
455+
if(NOT GGML_CPU_ALL_VARIANTS)
456+
set(MARCH_STR "rv64gc")
457+
if (GGML_RV_ZFH)
458+
string(APPEND MARCH_STR "_zfh")
465459
endif()
460+
if (GGML_XTHEADVECTOR)
461+
string(APPEND MARCH_STR "_xtheadvector")
462+
elseif (GGML_RVV)
463+
string(APPEND MARCH_STR "_v")
464+
if (GGML_RV_ZVFH)
465+
string(APPEND MARCH_STR "_zvfh")
466+
endif()
467+
endif()
468+
if (GGML_RV_ZICBOP)
469+
string(APPEND MARCH_STR "_zicbop")
470+
endif()
471+
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
472+
else()
473+
# Begin with the lowest baseline
474+
set(ARCH_DEFINITIONS "")
475+
476+
if (GGML_INTERNAL_RVV)
477+
message(STATUS "RVV enabled")
478+
list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
479+
list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
480+
endif()
481+
482+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
466483
endif()
467-
if (GGML_RV_ZICBOP)
468-
string(APPEND MARCH_STR "_zicbop")
469-
endif()
470-
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
471484
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
472485
message(STATUS "s390x detected")
473486
list(APPEND GGML_CPU_SOURCES

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,8 @@
5151
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
5252
// repack.cpp
5353
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
54-
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
5554
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
5655
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
57-
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
5856
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
5957
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
6058
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)

0 commit comments

Comments
 (0)