Commit 19a458f

Merge branch 'master' into add_qwen3vl

2 parents 950c764 + bacddc0

File tree: 16 files changed, +529 −34 lines

convert_hf_to_gguf.py

Lines changed: 32 additions & 1 deletion
@@ -1528,7 +1528,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
         self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
         self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
 
         # preprocessor config
         image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -9710,6 +9710,37 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 
         return []  # skip other tensors
 
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMVisionModel(MmprojModel):
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if not name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.COGVLM
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # block vision tensors
+        if name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######
 
 
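
Both CogVLM classes register for the same HF architecture but split one checkpoint into two GGUF outputs by filtering on the tensor-name prefix: the MmprojModel subclass keeps only model.vision.* tensors, while the text-model subclass skips them. A minimal standalone sketch of that split (the tensor names below are hypothetical illustrations, not taken from a real checkpoint):

def split_cogvlm_tensors(names: list[str]) -> tuple[list[str], list[str]]:
    # Mirrors the prefix predicate used by CogVLMVisionModel / CogVLMModel.
    vision = [n for n in names if n.startswith("model.vision.")]
    text = [n for n in names if not n.startswith("model.vision.")]
    return vision, text

vision, text = split_cogvlm_tensors([
    "model.vision.patch_embedding.proj.weight",  # -> mmproj GGUF
    "model.layers.0.self_attn.qkv.weight",       # -> text-model GGUF
])
assert len(vision) == 1 and len(text) == 1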

ggml/src/ggml-cuda/argsort.cu

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ template<ggml_sort_order order>
 static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
     // bitonic sort
     int col = threadIdx.x;
-    int row = blockIdx.y;
+    int row = blockIdx.x;
 
     if (col >= ncols_pad) {
         return;
@@ -151,7 +151,7 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
     const int ncols_pad = next_power_of_2(ncols);
 
     const dim3 block_dims(ncols_pad, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
+    const dim3 block_nums(nrows, 1, 1);
     const size_t shared_mem = ncols_pad * sizeof(int);
 
     // FIXME: this limit could be raised by ~2-4x on Ampere or newer
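
This swaps the row index from blockIdx.y to blockIdx.x: CUDA allows up to 2^31 - 1 blocks along x but only 65535 along y, so mapping rows to x lets argsort handle tensors with more than 65535 rows. The column count is still padded to a power of two because a bitonic network requires one. A small Python sketch of that padding helper, assuming the usual definition (not copied from the repo):

def next_power_of_2(n: int) -> int:
    # Smallest power of two >= n: a bitonic sorting network only works
    # on power-of-two sizes, so the column count is padded up to one.
    return 1 if n <= 1 else 1 << (n - 1).bit_length()

assert next_power_of_2(1000) == 1024  # 1000 columns -> 1024 threads per block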

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 18 additions & 2 deletions
@@ -190,12 +190,28 @@ static __global__ void mul_mat_vec_q(
 
     const uint32_t channel_bias = ids ? channel_x : channel_dst;
 
+    float x_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
+    float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
     if constexpr (has_fusion) {
         if (use_bias) {
             x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
+            // 1. Hide latency by prefetching bias and gate here
+            // 2. load only on threads that won't die after partial sum calculation
+            if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
+                (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
+                for (int j = 0; j < ncols_dst; ++j) {
+                    x_biases[j][threadIdx.x] = x_bias[j * stride_col_dst + threadIdx.x];
+                }
+            }
         }
         if (use_gate_bias) {
             gate_bias = gate_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
+            if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
+                (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
+                for (int j = 0; j < ncols_dst; ++j) {
+                    gate_biases[j][threadIdx.x] = gate_bias[j * stride_col_dst + threadIdx.x];
+                }
+            }
         }
     }
 
@@ -283,12 +299,12 @@ static __global__ void mul_mat_vec_q(
             float result = tmp[j][threadIdx.x];
             if constexpr (has_fusion) {
                 if (use_bias) {
-                    result += x_bias[j*stride_col_dst + threadIdx.x];
+                    result += x_biases[j][threadIdx.x];
                 }
                 if (use_gate) {
                     float gate_value = tmp_gate[j][threadIdx.x];
                     if (use_gate_bias) {
-                        gate_value += gate_bias[j*stride_col_dst + threadIdx.x];
+                        gate_value += gate_biases[j][threadIdx.x];
                     }
                     switch (active_glu) {
                         case GGML_GLU_OP_SWIGLU:
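
The kernel now prefetches the biases into registers up front, before the partial-sum reduction in which some threads exit, instead of re-reading global memory in the epilogue; the prefetched values then feed the fused GLU at the end. As a rough Python sketch of the per-element epilogue for the SWIGLU case, assuming the standard SwiGLU definition (silu of the gate scaling the up projection; this is not the kernel's literal code):

import math

def silu(x: float) -> float:
    return x / (1.0 + math.exp(-x))

def swiglu_epilogue(up: float, gate: float, up_bias: float, gate_bias: float) -> float:
    # Biases are added to both projections first, then silu(gate) scales "up".
    return silu(gate + gate_bias) * (up + up_bias)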

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 4 additions & 0 deletions
@@ -1083,6 +1083,7 @@ struct vk_op_soft_max_push_constants {
 
 struct vk_op_argsort_push_constants {
     uint32_t ncols;
+    uint32_t nrows;
     int32_t order;
 };
 
@@ -8709,6 +8710,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         break;
     case GGML_OP_ARGSORT:
         elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
+        elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
         break;
     case GGML_OP_IM2COL:
         {
@@ -9957,9 +9959,11 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
     int32_t * op_params = (int32_t *)dst->op_params;
 
     uint32_t ncols = src0->ne[0];
+    uint32_t nrows = ggml_nrows(src0);
 
     ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
         ncols,
+        nrows,
        op_params[0],
    }, dryrun);
 }

ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp

Lines changed: 12 additions & 4 deletions
@@ -14,6 +14,7 @@ layout (binding = 1) buffer D {int data_d[];};
 
 layout (push_constant) uniform parameter {
     uint ncols;
+    uint nrows;
     uint order;
 } p;
 
@@ -26,10 +27,9 @@ void swap(uint idx0, uint idx1) {
     dst_row[idx1] = tmp;
 }
 
-void argsort(bool needs_bounds_check) {
+void argsort(bool needs_bounds_check, const uint row) {
     // bitonic sort
     const int col = int(gl_LocalInvocationID.x);
-    const uint row = gl_WorkGroupID.y;
 
     const uint row_offset = row * p.ncols;
 
@@ -72,8 +72,16 @@ void argsort(bool needs_bounds_check) {
 
 void main() {
     if (p.ncols == BLOCK_SIZE) {
-        argsort(false);
+        uint row = gl_WorkGroupID.y;
+        while (row < p.nrows) {
+            argsort(false, row);
+            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+        }
     } else {
-        argsort(true);
+        uint row = gl_WorkGroupID.y;
+        while (row < p.nrows) {
+            argsort(true, row);
+            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+        }
    }
 }
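
The host side clamps the dispatch's y dimension to maxComputeWorkGroupCount[1] and passes the true row count in the push constants; the shader then strides over rows, so workgroup w handles rows w, w + N, w + 2N, ... where N is the number of dispatched workgroups. A Python sketch of that row assignment, assuming a y workgroup size of 1 as in the dispatch above:

def rows_for_workgroup(wg_id: int, num_workgroups: int, nrows: int) -> list[int]:
    # Mirrors `row += gl_WorkGroupSize.y * gl_NumWorkGroups.y` with a
    # y workgroup size of 1: stride by the dispatched workgroup count.
    return list(range(wg_id, nrows, num_workgroups))

# With the dispatch clamped to 65535 workgroups, workgroup 0 of a
# 200000-row sort also covers rows 65535 and 131070:
assert rows_for_workgroup(0, 65535, 200000)[:3] == [0, 65535, 131070]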

gguf-py/gguf/constants.py

Lines changed: 49 additions & 0 deletions
@@ -424,6 +424,7 @@ class MODEL_ARCH(IntEnum):
     SEED_OSS = auto()
     GROVEMOE = auto()
     APERTUS = auto()
+    COGVLM = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -435,6 +436,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
     MERGER = auto()
     GEMMA3 = auto()
     QWEN3VL = auto()
+    COGVLM = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -605,6 +607,11 @@ class MODEL_TENSOR(IntEnum):
     SHORTCONV_CONV = auto()
     SHORTCONV_INPROJ = auto()
     SHORTCONV_OUTPROJ = auto()
+    VISEXP_ATTN_QKV = auto()
+    VISEXP_ATTN_OUT = auto()
+    VISEXP_GATE = auto()
+    VISEXP_DOWN = auto()
+    VISEXP_UP = auto()
     # vision
     V_MMPROJ = auto()
     V_MMPROJ_FC = auto()
@@ -649,6 +656,12 @@ class MODEL_TENSOR(IntEnum):
     V_DS_NORM = auto()  # qwen3vl
     V_DS_FC1 = auto()  # qwen3vl
     V_DS_FC2 = auto()  # qwen3vl
+    V_MM_POST_FC_NORM = auto()  # cogvlm
+    V_MM_UP = auto()  # cogvlm
+    V_MM_DOWN = auto()  # cogvlm
+    V_MM_GATE = auto()  # cogvlm
+    V_TOK_BOI = auto()  # cogvlm
+    V_TOK_EOI = auto()  # cogvlm
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
     A_ENC_CONV1D = auto()
@@ -777,6 +790,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.SEED_OSS: "seed_oss",
     MODEL_ARCH.GROVEMOE: "grovemoe",
     MODEL_ARCH.APERTUS: "apertus",
+    MODEL_ARCH.COGVLM: "cogvlm",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -957,6 +971,11 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
     MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
     MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
+    MODEL_TENSOR.VISEXP_ATTN_QKV: "blk.{bid}.vis_attn_qkv",
+    MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output",
+    MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate",
+    MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down",
+    MODEL_TENSOR.VISEXP_UP: "blk.{bid}.vis_up",
     # vision
     MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
     MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
@@ -1001,6 +1020,12 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_DS_NORM: "v.deepstack.{bid}.norm",
     MODEL_TENSOR.V_DS_FC1: "v.deepstack.{bid}.fc1",
     MODEL_TENSOR.V_DS_FC2: "v.deepstack.{bid}.fc2",
+    MODEL_TENSOR.V_MM_POST_FC_NORM: "mm.post_fc_norm",  # cogvlm
+    MODEL_TENSOR.V_MM_UP: "mm.up",
+    MODEL_TENSOR.V_MM_DOWN: "mm.down",
+    MODEL_TENSOR.V_MM_GATE: "mm.gate",
+    MODEL_TENSOR.V_TOK_BOI: "v.boi",
+    MODEL_TENSOR.V_TOK_EOI: "v.eoi",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -1073,6 +1098,12 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_DS_NORM,
     MODEL_TENSOR.V_DS_FC1,
     MODEL_TENSOR.V_DS_FC2,
+    MODEL_TENSOR.V_MM_POST_FC_NORM,
+    MODEL_TENSOR.V_MM_UP,
+    MODEL_TENSOR.V_MM_DOWN,
+    MODEL_TENSOR.V_MM_GATE,
+    MODEL_TENSOR.V_TOK_BOI,
+    MODEL_TENSOR.V_TOK_EOI,
     # audio
     MODEL_TENSOR.A_ENC_EMBD_POS,
     MODEL_TENSOR.A_ENC_CONV1D,
@@ -2890,6 +2921,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_CHEXP,
         MODEL_TENSOR.FFN_UP_CHEXP,
     ],
+    MODEL_ARCH.COGVLM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.VISEXP_ATTN_QKV,
+        MODEL_TENSOR.VISEXP_ATTN_OUT,
+        MODEL_TENSOR.VISEXP_GATE,
+        MODEL_TENSOR.VISEXP_UP,
+        MODEL_TENSOR.VISEXP_DOWN,
+    ],
     # TODO
 }
 
@@ -3117,6 +3165,7 @@ class VisionProjectorType:
     LFM2 = "lfm2"
     KIMIVL = "kimivl"
     LIGHTONOCR = "lightonocr"
+    COGVLM = "cogvlm"
 
 
 # Items here are (block size, type size)
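
The per-block entries above are name templates that the converter fills with a block index. A quick illustration using the new CogVLM vision-expert names from this diff (standalone sketch, not the gguf-py API):

VISEXP_TENSOR_NAMES = {
    "VISEXP_ATTN_QKV": "blk.{bid}.vis_attn_qkv",
    "VISEXP_GATE": "blk.{bid}.vis_gate",
}

# Formatting a template with the block index yields the final GGUF tensor name:
assert VISEXP_TENSOR_NAMES["VISEXP_ATTN_QKV"].format(bid=3) == "blk.3.vis_attn_qkv"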
