7 changes: 7 additions & 0 deletions .clang-format
@@ -22,6 +22,13 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
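The AttributeMacros entries above tell clang-format to treat the CUDA qualifiers as attributes rather than ordinary identifiers, so it keeps them attached to the declaration instead of wrapping inside the qualifier run. A rough illustration of the kind of declaration this affects; the function names below are hypothetical:

// Hypothetical CUDA declarations, for illustration only. With __host__, __device__,
// __forceinline__ and __launch_bounds__ listed as attribute macros, clang-format
// keeps the qualifier run together with the rest of the signature.
static __host__ __device__ __forceinline__ float my_clamp(float x, float lo, float hi) {
    return x < lo ? lo : (x > hi ? hi : x);
}

__global__ void __launch_bounds__(256) my_scale_kernel(float * dst, const float * src, float v, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i] * v;
    }
}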
29 changes: 20 additions & 9 deletions .github/workflows/build.yml
@@ -56,7 +56,7 @@ env:

jobs:
macOS-latest-cmake-arm64:
runs-on: macos-14
runs-on: macos-latest

steps:
- name: Clone
@@ -97,7 +97,7 @@ jobs:
ctest -L 'main|curl' --verbose --timeout 900

macOS-latest-cmake-x64:
runs-on: macos-13
runs-on: macos-latest

steps:
- name: Clone
@@ -138,7 +138,7 @@ jobs:
ctest -L main --verbose --timeout 900

macOS-latest-cmake-arm64-webgpu:
runs-on: macos-14
runs-on: macos-latest

steps:
- name: Clone
@@ -711,6 +711,7 @@ jobs:

macOS-latest-swift:
runs-on: macos-latest
needs: ios-xcode-build

strategy:
matrix:
@@ -727,6 +728,12 @@
key: macOS-latest-swift
evict-old-files: 1d

- name: Download xcframework artifact
uses: actions/download-artifact@v4
with:
name: llama-xcframework
path: build-apple/llama.xcframework/

- name: Dependencies
id: depends
continue-on-error: true
@@ -748,11 +755,6 @@
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: xcodebuild for swift package
id: xcodebuild
run: |
./build-xcframework.sh

windows-msys2:
runs-on: windows-2025

@@ -1170,8 +1172,17 @@ jobs:
run: |
./build-xcframework.sh

- name: Upload xcframework artifact
uses: actions/upload-artifact@v4
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
retention-days: 1

- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
run: |
xcodebuild -downloadPlatform iOS
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build

android-build:
runs-on: ubuntu-latest
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -58,6 +58,12 @@ if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()

if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
else()
set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
endif()

#
# option list
#
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})

# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
10 changes: 5 additions & 5 deletions common/arg.cpp
@@ -1704,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2548,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
@@ -2561,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
for (int i = 0; i < value; ++i) {
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
@@ -2570,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--cpu-moe-draft", "-cmoed"},
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
@@ -2582,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
for (int i = 0; i < value; ++i) {
static std::list<std::string> buft_overrides_draft;
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
14 changes: 14 additions & 0 deletions common/common.h
@@ -734,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}

//
// MoE utils
//

const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";

static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}

static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}

//
// training utils
//
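These helpers replace the regex literals that were previously duplicated across the --cpu-moe and --n-cpu-moe handlers in common/arg.cpp (see that hunk above). A condensed sketch of how they are used there; n_cpu_moe stands in for the value parsed from --n-cpu-moe:

// llm_ffn_exps_block_regex(3)  evaluates to  "blk\.3\.ffn_(up|down|gate)_exps"
// llm_ffn_exps_cpu_override()  evaluates to  { "\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type() }

// --cpu-moe: keep every expert FFN tensor in host memory
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());

// --n-cpu-moe N: same, but only for blocks 0..N-1
static std::list<std::string> buft_overrides;  // keeps the regex strings alive for their c_str() pointers
for (int i = 0; i < n_cpu_moe; ++i) {
    buft_overrides.push_back(llm_ffn_exps_block_regex(i));
    params.tensor_buft_overrides.push_back({ buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type() });
}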
73 changes: 73 additions & 0 deletions convert_hf_to_gguf.py
@@ -888,6 +888,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
res = "mellum"
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
res = "llada-moe"

if res is None:
logger.warning("\n")
@@ -8239,6 +8242,76 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
class LLaDAMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLADA_MOE

def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)

if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)

# number of experts used per token (top-k)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)

self.gguf_writer.add_mask_token_id(156895)
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_diffusion_shift_logits(False)

_experts: list[dict[str, Tensor]] | None = None

# Copied from: Qwen2MoeModel
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None

if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]

self._experts[bid][name] = data_torch

if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []

# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []

for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]

data_torch = torch.stack(datas, dim=0)

merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"

new_name = self.map_tensor_name(merged_name)

tensors.append((new_name, data_torch))
return tensors
else:
return []

return [(self.map_tensor_name(name), data_torch)]

# Copied from: Qwen2MoeModel
def prepare_tensors(self):
super().prepare_tensors()

if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("HunYuanDenseV1ForCausalLM")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
@@ -139,6 +139,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
]

# some models are known to be broken upstream, so we will skip them as exceptions
24 changes: 17 additions & 7 deletions examples/diffusion/diffusion-cli.cpp
@@ -510,19 +510,27 @@ static void diffusion_generate(llama_context * ctx,
n_generated = params.max_length;
}

static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
}

auto chat_templates = common_chat_templates_init(model, "");

common_chat_templates_inputs inputs;
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.add_generation_prompt = true;
common_chat_msg system_msg;

if (!system_prompt.empty()) {
system_msg.role = "system";
system_msg.content = system_prompt;
inputs.messages.push_back(system_msg);
}

common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;

inputs.messages.push_back(user_msg);
inputs.add_generation_prompt = true;

auto result = common_chat_templates_apply(chat_templates.get(), inputs);

@@ -579,7 +587,8 @@ int main(int argc, char ** argv) {
llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);

const llama_vocab * vocab = llama_model_get_vocab(model);
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);

std::string formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model);

std::vector<llama_token> input_tokens = common_tokenize(vocab,
formatted_prompt,
Expand All @@ -596,6 +605,7 @@ int main(int argc, char ** argv) {
}

llama_token mask_token_id = llama_vocab_mask(vocab);

GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);

bool visual_mode = params.diffusion.visual_mode;
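With the added parameter, main() forwards params.system_prompt as shown in the hunk above. A minimal usage sketch with placeholder values; an empty system prompt preserves the previous behaviour, where only the user message is sent to the chat template:

// Placeholder values for illustration; `model` is the llama_model loaded in main().
std::string formatted = format_input_text(
    "Write a haiku about diffusion models",  // params.prompt
    "You are a concise poet.",               // params.system_prompt (may be empty)
    /*use_chat_template=*/true,
    model);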
1 change: 0 additions & 1 deletion ggml/src/ggml-cpu/ops.cpp
@@ -8599,7 +8599,6 @@ static void ggml_compute_forward_timestep_embedding_f32(
}
if (dim % 2 != 0 && ith == 0) {
embed_data[2 * half] = 0.f;
embed_data[dim] = 0.f;
}
}
}
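The same indexing fix is applied to the CUDA, Metal, OpenCL and SYCL kernels below: when dim is odd, the single zero-padded element of a row lives at index 2 * half (which equals dim - 1), while the old code wrote to embed_data[dim], one slot past the row's dim values. A minimal CPU sketch of the fill pattern, for illustration only (not the ggml kernel itself):

#include <cmath>

// For dim = 5: half = 2, cos terms fill indices 0..1, sin terms fill 2..3,
// and the leftover slot 2 * half = 4 (== dim - 1) is zeroed; index 5 would
// fall outside this row's dim elements.
static void timestep_embed_row(float * embed_data, int dim, float timestep, float max_period = 10000.f) {
    const int half = dim / 2;
    for (int j = 0; j < half; ++j) {
        const float freq = std::exp(-std::log(max_period) * (float) j / (float) half);
        const float arg  = timestep * freq;
        embed_data[j]        = std::cos(arg);
        embed_data[j + half] = std::sin(arg);
    }
    if (dim % 2 != 0) {
        embed_data[2 * half] = 0.f;  // last element of the row, not embed_data[dim]
    }
}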
6 changes: 3 additions & 3 deletions ggml/src/ggml-cuda/tsembd.cu
@@ -7,11 +7,11 @@ static __global__ void timestep_embedding_f32(const float * timesteps, float * d
int j = threadIdx.x + blockIdx.x * blockDim.x;
float * embed_data = (float *)((char *)dst + i*nb1);

if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
int half = dim / 2;
if (dim % 2 != 0 && j == half) {
embed_data[2 * half] = 0.f;
}

int half = dim / 2;
if (j >= half) {
return;
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-metal/ggml-metal.metal
@@ -4167,7 +4167,7 @@ kernel void kernel_timestep_embedding_f32(
}

if (args.dim % 2 != 0 && tpitg.x == 0) {
embed_data[args.dim] = 0.f;
embed_data[2 * half_] = 0.f;
}
}

4 changes: 2 additions & 2 deletions ggml/src/ggml-opencl/kernels/tsembd.cl
@@ -26,8 +26,8 @@ kernel void kernel_timestep_embedding(
local_half_dim = logical_dim / 2;
local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);

if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
local_embed_data_ptr[logical_dim] = 0.0f;
if (logical_dim % 2 != 0 && local_j == local_half_dim) {
local_embed_data_ptr[2 * local_half_dim] = 0.0f;
}

if (local_j >= local_half_dim) {
7 changes: 4 additions & 3 deletions ggml/src/ggml-sycl/tsembd.cpp
@@ -21,11 +21,12 @@ static void timestep_embedding_f32(
int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
float * embed_data = (float *)((char *)dst + i*nb1);

if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
int half = dim / 2;

if (dim % 2 != 0 && j == half) {
embed_data[2 * half] = 0.f;
}

int half = dim / 2;
if (j >= half) {
return;
}