Commit 011a01d

Merge branch 'ggml-org:master' into tr/qwen3-vl

2 parents: c212db6 + d2ee056

22 files changed: +1090 / -365 lines

.github/workflows/build.yml

Lines changed: 26 additions & 8 deletions
@@ -444,8 +444,8 @@ jobs:
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 4200

-  ubuntu-22-cmake-webgpu:
-    runs-on: ubuntu-22.04
+  ubuntu-24-cmake-webgpu:
+    runs-on: ubuntu-24.04

     steps:
       - name: Clone
@@ -455,16 +455,34 @@
       - name: ccache
         uses: ggml-org/[email protected]
         with:
-          key: ubuntu-22-cmake-webgpu
+          key: ubuntu-24-cmake-webgpu
           evict-old-files: 1d

-      - name: Vulkan SDK Dependencies
-        id: vulkan-depends
+      - name: Dependencies
+        id: depends
         run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
           sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+
+      - name: Get latest Vulkan SDK version
+        id: vulkan_sdk_version
+        run: |
+          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
+
+      - name: Use Vulkan SDK Cache
+        uses: actions/cache@v4
+        id: cache-sdk
+        with:
+          path: ./vulkan_sdk
+          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+
+      - name: Setup Vulkan SDK
+        if: steps.cache-sdk.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-vulkan
+        with:
+          path: ./vulkan_sdk
+          version: ${{ env.VULKAN_SDK_VERSION }}

       - name: Dawn Dependency
         id: dawn-depends
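
Aside (not part of the commit): the new "Get latest Vulkan SDK version" and
"Use Vulkan SDK Cache" steps boil down to an HTTP GET against LunarG's
plain-text endpoint plus a cache key that embeds the returned version, so a
new SDK release misses the cache and re-runs the setup action. A minimal
Python sketch of the same logic; the URL is the one the workflow queries,
everything else is illustrative:

    import urllib.request

    def latest_vulkan_sdk_version() -> str:
        # LunarG serves the latest Linux SDK version as plain text
        with urllib.request.urlopen("https://vulkan.lunarg.com/sdk/latest/linux.txt") as resp:
            return resp.read().decode().strip()

    def cache_key(version: str, runner_os: str = "Linux") -> str:
        # mirrors the actions/cache key: vulkan-sdk-<version>-<os>
        return f"vulkan-sdk-{version}-{runner_os}"

    if __name__ == "__main__":
        version = latest_vulkan_sdk_version()
        print(version, cache_key(version))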

CODEOWNERS

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@
 /ggml/src/ggml-rpc/ @rgerganov
 /ggml/src/ggml-threading.* @ggerganov @slaren
 /ggml/src/ggml-vulkan/ @0cc4m
+/ggml/src/ggml-webgpu/ @reeselevine
 /ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
 /ggml/src/ggml.c @ggerganov @slaren
 /ggml/src/ggml.cpp @ggerganov @slaren

convert_hf_to_gguf.py

Lines changed: 69 additions & 0 deletions
@@ -9015,6 +9015,75 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Lfm2MoeForCausalLM")
+class LFM2MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2MOE
+
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+
+    # cache for experts weights for merging
+    _experts_cache: dict[int, dict[str, Tensor]] = {}
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        # merge expert weights
+        if 'experts' in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            expert_cache = self._experts_cache.setdefault(bid, {})
+            expert_cache[name] = data_torch
+            expert_weights = ["w1", "w2", "w3"]
+
+            # not enough expert weights to merge
+            if len(expert_cache) < n_experts * len(expert_weights):
+                return []
+
+            tensors: list[tuple[str, Tensor]] = []
+            for w_name in expert_weights:
+                datas: list[Tensor] = []
+
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
+                    datas.append(expert_cache[ename])
+                    del expert_cache[ename]
+
+                data_torch = torch.stack(datas, dim=0)
+                merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
+                new_name = self.map_tensor_name(merged_name)
+                tensors.append((new_name, data_torch))
+
+            del self._experts_cache[bid]
+            return tensors
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        assert not self._experts_cache
+
+
 @ModelBase.register("Lfm2VlForConditionalGeneration")
 class LFM2VLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
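
Aside (not part of the commit): the expert-merge path in modify_tensors above
buffers per-expert 2D tensors in _experts_cache and emits nothing until all
n_experts * 3 weights for a layer have arrived, then stacks them into one 3D
tensor per projection (w1/w2/w3). A self-contained toy version of that
buffering pattern, with made-up shapes:

    import torch

    n_experts = 4
    cache: dict[str, torch.Tensor] = {}

    # stand-ins for the per-expert weights streamed in one at a time
    for xid in range(n_experts):
        for w_name in ("w1", "w2", "w3"):
            cache[f"model.layers.0.feed_forward.experts.{xid}.{w_name}.weight"] = torch.randn(8, 16)

    # once complete, merge each projection across experts into a single tensor
    for w_name in ("w1", "w2", "w3"):
        datas = [cache.pop(f"model.layers.0.feed_forward.experts.{xid}.{w_name}.weight")
                 for xid in range(n_experts)]
        merged = torch.stack(datas, dim=0)
        print(w_name, tuple(merged.shape))  # -> (4, 8, 16)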

ggml/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -222,6 +222,9 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS   "ggml: run Vulkan tests" OFF)
 option(GGML_WEBGPU             "ggml: use WebGPU" OFF)
 option(GGML_WEBGPU_DEBUG       "ggml: enable WebGPU debug output" OFF)
+option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)
+option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
+
 option(GGML_ZDNN               "ggml: use zDNN" OFF)
 option(GGML_METAL              "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG       "ggml: disable Metal debugging" OFF)
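
Usage note (not shown in the diff, but the standard CMake option pattern):
the new profiling switches would be enabled at configure time, e.g.
cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_GPU_PROFILE=ON.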

ggml/src/ggml-metal/ggml-metal-device.cpp

Lines changed: 47 additions & 1 deletion
@@ -959,7 +959,53 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
     //ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT_PAD + 21);
     //ggml_metal_cv_set_int32(cv, nsg, FC_FLASH_ATTN_EXT_PAD + 22);
     //ggml_metal_cv_set_int32(cv, nwg, FC_FLASH_ATTN_EXT_PAD + 23);
-    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_PAD + 24);
+    //ggml_metal_cv_set_int32(cv, nqptg, FC_FLASH_ATTN_EXT_PAD + 24);
+    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_PAD + 25);
+
+    res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+
+    ggml_metal_cv_free(cv);
+
+    return res;
+}
+
+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
+        ggml_metal_library_t lib,
+        const struct ggml_tensor * op,
+        int32_t nqptg,
+        int32_t ncpsg) {
+    assert(op->op == GGML_OP_FLASH_ATTN_EXT);
+    GGML_UNUSED(op);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_%s",
+            "flash_attn_ext_blk");
+
+    snprintf(name, 256, "%s_nqptg=%d_ncpsg=%d",
+            base,
+            nqptg,
+            ncpsg);
+
+    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
+    if (res) {
+        return res;
+    }
+
+    ggml_metal_cv_t cv = ggml_metal_cv_init();
+
+    //ggml_metal_cv_set_bool(cv, has_mask, FC_FLASH_ATTN_EXT_BLK + 0);
+    //ggml_metal_cv_set_bool(cv, has_sinks, FC_FLASH_ATTN_EXT_BLK + 1);
+    //ggml_metal_cv_set_bool(cv, has_bias, FC_FLASH_ATTN_EXT_BLK + 2);
+    //ggml_metal_cv_set_bool(cv, has_scap, FC_FLASH_ATTN_EXT_BLK + 3);
+
+    //ggml_metal_cv_set_int32(cv, ns10, FC_FLASH_ATTN_EXT_BLK + 20);
+    //ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT_BLK + 21);
+    //ggml_metal_cv_set_int32(cv, nsg, FC_FLASH_ATTN_EXT_BLK + 22);
+    //ggml_metal_cv_set_int32(cv, nwg, FC_FLASH_ATTN_EXT_BLK + 23);
+    ggml_metal_cv_set_int32(cv, nqptg, FC_FLASH_ATTN_EXT_BLK + 24);
+    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_BLK + 25);

     res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
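
Aside (not part of the commit): the new _blk getter follows the same
lookup-or-compile pattern as the other pipeline getters in this file: the
pipeline is cached under a name that encodes its specialization parameters,
so each (nqptg, ncpsg) pair is compiled exactly once. A Python sketch of that
pattern (illustrative only, not the Metal API):

    pipelines: dict[str, object] = {}

    def get_pipeline_flash_attn_ext_blk(nqptg: int, ncpsg: int) -> object:
        name = f"kernel_flash_attn_ext_blk_nqptg={nqptg}_ncpsg={ncpsg}"
        if name in pipelines:       # cache hit: reuse the compiled pipeline
            return pipelines[name]
        pipeline = object()         # stand-in for compiling the Metal pipeline
        pipelines[name] = pipeline
        return pipeline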

ggml/src/ggml-metal/ggml-metal-device.h

Lines changed: 6 additions & 0 deletions
@@ -141,6 +141,12 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
         bool has_mask,
         int32_t ncpsg);

+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
+        ggml_metal_library_t lib,
+        const struct ggml_tensor * op,
+        int32_t nqptg,
+        int32_t ncpsg);
+
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
         ggml_metal_library_t lib,
         const struct ggml_tensor * op,

ggml/src/ggml-metal/ggml-metal-impl.h

Lines changed: 24 additions & 5 deletions
@@ -70,11 +70,19 @@

 // function constants offsets
 #define FC_FLASH_ATTN_EXT_PAD        100
-#define FC_FLASH_ATTN_EXT            200
-#define FC_FLASH_ATTN_EXT_VEC        300
-#define FC_FLASH_ATTN_EXT_VEC_REDUCE 400
-#define FC_MUL_MV                    500
-#define FC_MUL_MM                    600
+#define FC_FLASH_ATTN_EXT_BLK        200
+#define FC_FLASH_ATTN_EXT            300
+#define FC_FLASH_ATTN_EXT_VEC        400
+#define FC_FLASH_ATTN_EXT_VEC_REDUCE 500
+#define FC_MUL_MV                    600
+#define FC_MUL_MM                    700
+
+// op-specific constants
+#define OP_FLASH_ATTN_EXT_NQPTG 8
+#define OP_FLASH_ATTN_EXT_NCPSG 64
+
+#define OP_FLASH_ATTN_EXT_VEC_NQPTG 1
+#define OP_FLASH_ATTN_EXT_VEC_NCPSG 32

 // kernel argument structs
 //
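
Reading of the renumbering (inferred, not stated in the diff): each FC_*
family owns a 100-wide band of function-constant indices, so inserting
FC_FLASH_ATTN_EXT_BLK at 200 shifts every later family up by one band. The
new OP_* constants appear to pin the default tile parameters (nqptg, ncpsg)
for the regular and vec flash-attention kernels, matching the parameters
encoded in the specialized pipeline names above.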
@@ -263,6 +271,17 @@ typedef struct {
     uint64_t nb33;
 } ggml_metal_kargs_flash_attn_ext_pad;

+typedef struct {
+    int32_t  ne01;
+    int32_t  ne30;
+    int32_t  ne31;
+    int32_t  ne32;
+    int32_t  ne33;
+    uint64_t nb31;
+    uint64_t nb32;
+    uint64_t nb33;
+} ggml_metal_kargs_flash_attn_ext_blk;
+
 typedef struct {
     int32_t ne01;
     int32_t ne02;
