Commit 3e9f565

Merge branch 'master' into quantize
2 parents: 2fd0b41 + f423981


84 files changed (+3774, -3547 lines)

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -803,7 +803,7 @@ jobs:
     env:
       OPENBLAS_VERSION: 0.3.23
       SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.304.1
+      VULKAN_VERSION: 1.4.309.0

     strategy:
       matrix:

README.md

Lines changed: 2 additions & 0 deletions
@@ -112,6 +112,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
+- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
+- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)

 #### Multimodal

ci/README.md

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ docker run --privileged -it \
 Inside the container, execute the following commands:

 ```bash
-apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
 git config --global --add safe.directory /ws
 GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
 ```

ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ fi
 if [ ! -z ${GG_BUILD_MUSA} ]; then
     # Use qy1 by default (MTT S80)
     MUSA_ARCH=${MUSA_ARCH:-21}
-    CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi

 ## helpers

common/minja/minja.hpp

Lines changed: 2 additions & 2 deletions
@@ -240,7 +240,7 @@ class Value : public std::enable_shared_from_this<Value> {
         auto index = key.get<int>();
         return array_->at(index < 0 ? array_->size() + index : index);
     } else if (object_) {
-        if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+        if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
         auto it = object_->find(key.primitive_);
         if (it == object_->end()) return Value();
         return it->second;
@@ -249,7 +249,7 @@ class Value : public std::enable_shared_from_this<Value> {
     }
     void set(const Value& key, const Value& value) {
         if (!object_) throw std::runtime_error("Value is not an object: " + dump());
-        if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+        if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
         (*object_)[key.primitive_] = value;
     }
     Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {

convert_hf_to_gguf.py

Lines changed: 107 additions & 2 deletions
@@ -708,6 +708,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
             # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
             res = "superbpe"
+        if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
+            # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
+            res = "trillion"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"

         if res is None:
             logger.warning("\n")
@@ -3551,8 +3557,8 @@ def set_gguf_parameters(self):
         head_size = hidden_size // num_attention_heads
         rms_norm_eps = self.hparams["rms_norm_eps"]
         intermediate_size = self.hparams["intermediate_size"]
-        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
-        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+        time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
+        time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)

         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -5130,6 +5136,105 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)


+@Model.register("BailingMoeForCausalLM")
+class BailingMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
+            ]
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("ChameleonForConditionalGeneration")
 @Model.register("ChameleonForCausalLM") # obsolete
 class ChameleonModel(Model):
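
As an aside (not part of the commit), the reshape/swapaxes trick in `BailingMoeModel.permute` above is dense. The following standalone Python sketch, using a hypothetical toy 2-head weight matrix, shows the row reordering it performs: within each head, rows from the first and second halves of the head dimension are interleaved.

```python
# Illustration only (toy shapes, not repository code): BailingMoeModel.permute
# interleaves the two halves of each head's rows, i.e. row order 0, d/2, 1, d/2+1, ...
import torch

def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None) -> torch.Tensor:
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

n_head, head_dim, n_embd = 2, 4, 8
# row i of the toy weight matrix is filled with the value i, so the output
# column directly shows where each input row ended up
w = torch.arange(n_head * head_dim).float().unsqueeze(-1).expand(-1, n_embd)
out = permute(w, n_head, n_head)
print(out[:, 0].tolist())  # [0.0, 2.0, 1.0, 3.0, 4.0, 6.0, 5.0, 7.0]
```

In `modify_tensors` above, the same helper is applied to the Q and K slices split out of `query_key_value.weight`.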

convert_hf_to_gguf_update.py

Lines changed: 2 additions & 0 deletions
@@ -111,6 +111,8 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
     {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
     {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
+    {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
+    {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
 ]

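
For context (not part of the commit), the two new entries above pair with the two new `chkhsh` branches added to `get_vocab_base_pre` in `convert_hf_to_gguf.py`. A rough sketch of the detection pattern follows, assuming the fingerprint is a SHA-256 over the tokenizer's encoding of a fixed probe string; the actual probe text and plumbing live in the two scripts.

```python
# Hypothetical sketch of the chkhsh fingerprinting idea; PROBE_TEXT is a stand-in
# for the fixed test string used by the real scripts.
from hashlib import sha256
from transformers import AutoTokenizer

PROBE_TEXT = "..."  # placeholder, not the real probe string

def tokenizer_fingerprint(repo: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(repo)
    token_ids = tokenizer.encode(PROBE_TEXT)
    return sha256(str(token_ids).encode()).hexdigest()

# convert_hf_to_gguf_update.py records one fingerprint per repo in the list above;
# get_vocab_base_pre later matches the fingerprint of the model being converted, e.g.
#   if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
#       res = "trillion"
```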

docs/backend/SYCL.md

Lines changed: 7 additions & 32 deletions
@@ -20,7 +20,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
-- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

@@ -227,16 +227,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.

-
-**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-cmake --build buildWithCublas --config Release
-```
-
 **oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:

 ```sh
@@ -250,16 +240,6 @@ cmake --build build-nvidia --config Release

 **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.

-**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-# Find your HIPTARGET with rocminfo, under the key 'Name:'
-cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
-cmake --build buildWithrocBLAS --config Release
-```
-
 3. **Verify installation and environment**

 In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -324,13 +304,10 @@ cmake --build build --config Release -j -v

 #### Nvidia GPU

-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.

+```sh
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 # Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
 GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
@@ -347,12 +324,10 @@ cmake --build build --config Release -j -v

 #### AMD GPU

-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.

+```sh
 # Build LLAMA with rocBLAS acceleration through SYCL

 ## AMD

examples/llava/clip.cpp

Lines changed: 5 additions & 3 deletions
@@ -1396,14 +1396,16 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
     const int n_kv = gguf_get_n_kv(ctx);
     const int ftype = get_u32(ctx, KEY_FTYPE);
     const std::string ftype_str = get_ftype(ftype);
-    const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
-    const std::string description = gguf_get_val_str(ctx, idx_desc);
     const int idx_name = gguf_find_key(ctx, KEY_NAME);
     if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
         const std::string name = gguf_get_val_str(ctx, idx_name);
         LOG_INF("%s: model name: %s\n", __func__, name.c_str());
     }
-    LOG_INF("%s: description: %s\n", __func__, description.c_str());
+    const int idx_desc = gguf_find_key(ctx, KEY_DESCRIPTION);
+    if (idx_desc != -1) { // ditto
+        const std::string description = gguf_get_val_str(ctx, idx_desc);
+        LOG_INF("%s: description: %s\n", __func__, description.c_str());
+    }
     LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
     LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
     LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);

examples/tts/tts.cpp

Lines changed: 5 additions & 3 deletions
@@ -699,11 +699,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     const std::string voice_data = audio_data;

     auto tmp = common_tokenize(vocab, voice_data, false, true);
-    printf("\n\n");
+
+    std::ostringstream tokens_oss;
     for (size_t i = 0; i < tmp.size(); ++i) {
-        printf("%d, ", tmp[i]);
+        tokens_oss << tmp[i] << ", ";
     }
-    printf("\n\n");
+    LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
+
     prompt_add(prompt_inp, tmp);
 #else
     prompt_add(prompt_inp, llama_tokens {
