Skip to content

Commit 4eae369

Browse files
authored
Merge pull request #36 from JJJYmmm/add_qwen3vl
Bulk fixes + catchup with master
2 parents d192d9f + 7d9c149 commit 4eae369

File tree

19 files changed

+586
-64
lines changed

19 files changed

+586
-64
lines changed

convert_hf_to_gguf.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1528,7 +1528,7 @@ def set_gguf_parameters(self):
15281528
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
15291529
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
15301530
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
1531-
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
1531+
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
15321532

15331533
# preprocessor config
15341534
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -9710,6 +9710,37 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
97109710

97119711
return [] # skip other tensors
97129712

9713+
9714+
@ModelBase.register("CogVLMForCausalLM")
9715+
class CogVLMVisionModel(MmprojModel):
9716+
9717+
def set_gguf_parameters(self):
9718+
super().set_gguf_parameters()
9719+
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
9720+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
9721+
9722+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9723+
del bid # unused
9724+
9725+
if not name.startswith("model.vision."):
9726+
return []
9727+
9728+
return [(self.map_tensor_name(name), data_torch)]
9729+
9730+
9731+
@ModelBase.register("CogVLMForCausalLM")
9732+
class CogVLMModel(LlamaModel):
9733+
model_arch = gguf.MODEL_ARCH.COGVLM
9734+
9735+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9736+
del bid # unused
9737+
9738+
# block vision tensors
9739+
if name.startswith("model.vision."):
9740+
return []
9741+
9742+
return [(self.map_tensor_name(name), data_torch)]
9743+
97139744
###### CONVERSION LOGIC ######
97149745

97159746

ggml/src/ggml-cuda/argsort.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ template<ggml_sort_order order>
8787
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
8888
// bitonic sort
8989
int col = threadIdx.x;
90-
int row = blockIdx.y;
90+
int row = blockIdx.x;
9191

9292
if (col >= ncols_pad) {
9393
return;
@@ -151,7 +151,7 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
151151
const int ncols_pad = next_power_of_2(ncols);
152152

153153
const dim3 block_dims(ncols_pad, 1, 1);
154-
const dim3 block_nums(1, nrows, 1);
154+
const dim3 block_nums(nrows, 1, 1);
155155
const size_t shared_mem = ncols_pad * sizeof(int);
156156

157157
// FIXME: this limit could be raised by ~2-4x on Ampere or newer

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,12 +190,28 @@ static __global__ void mul_mat_vec_q(
190190

191191
const uint32_t channel_bias = ids ? channel_x : channel_dst;
192192

193+
float x_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
194+
float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
193195
if constexpr (has_fusion) {
194196
if (use_bias) {
195197
x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
198+
// 1. Hide latency by prefetching bias and gate here
199+
// 2. load only on threads that won't die after partial sum calculation
200+
if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
201+
(rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
202+
for (int j = 0; j < ncols_dst; ++j) {
203+
x_biases[j][threadIdx.x] = x_bias[j * stride_col_dst + threadIdx.x];
204+
}
205+
}
196206
}
197207
if (use_gate_bias) {
198208
gate_bias = gate_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
209+
if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
210+
(rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
211+
for (int j = 0; j < ncols_dst; ++j) {
212+
gate_biases[j][threadIdx.x] = gate_bias[j * stride_col_dst + threadIdx.x];
213+
}
214+
}
199215
}
200216
}
201217

@@ -283,12 +299,12 @@ static __global__ void mul_mat_vec_q(
283299
float result = tmp[j][threadIdx.x];
284300
if constexpr (has_fusion) {
285301
if (use_bias) {
286-
result += x_bias[j*stride_col_dst + threadIdx.x];
302+
result += x_biases[j][threadIdx.x];
287303
}
288304
if (use_gate) {
289305
float gate_value = tmp_gate[j][threadIdx.x];
290306
if (use_gate_bias) {
291-
gate_value += gate_bias[j*stride_col_dst + threadIdx.x];
307+
gate_value += gate_biases[j][threadIdx.x];
292308
}
293309
switch (active_glu) {
294310
case GGML_GLU_OP_SWIGLU:

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1056,6 +1056,7 @@ struct vk_op_rope_push_constants {
10561056
uint32_t s1;
10571057
uint32_t s2;
10581058
int32_t sections[4];
1059+
uint32_t is_imrope;
10591060
uint32_t is_back;
10601061
uint32_t set_rows_stride;
10611062
};
@@ -1082,6 +1083,7 @@ struct vk_op_soft_max_push_constants {
10821083

10831084
struct vk_op_argsort_push_constants {
10841085
uint32_t ncols;
1086+
uint32_t nrows;
10851087
int32_t order;
10861088
};
10871089

@@ -8708,6 +8710,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
87088710
break;
87098711
case GGML_OP_ARGSORT:
87108712
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
8713+
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
87118714
break;
87128715
case GGML_OP_IM2COL:
87138716
{
@@ -9925,6 +9928,8 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
99259928
memcpy(sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
99269929
}
99279930

9931+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
9932+
99289933
float corr_dims[2];
99299934
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
99309935

@@ -9946,17 +9951,19 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
99469951
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
99479952
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
99489953
src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
9949-
{ sections[0], sections[1], sections[2], sections[3] }, backprop, set_rows_stride,
9954+
{ sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride,
99509955
}, dryrun);
99519956
}
99529957

99539958
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
99549959
int32_t * op_params = (int32_t *)dst->op_params;
99559960

99569961
uint32_t ncols = src0->ne[0];
9962+
uint32_t nrows = ggml_nrows(src0);
99579963

99589964
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
99599965
ncols,
9966+
nrows,
99609967
op_params[0],
99619968
}, dryrun);
99629969
}

ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ layout (binding = 1) buffer D {int data_d[];};
1414

1515
layout (push_constant) uniform parameter {
1616
uint ncols;
17+
uint nrows;
1718
uint order;
1819
} p;
1920

@@ -26,10 +27,9 @@ void swap(uint idx0, uint idx1) {
2627
dst_row[idx1] = tmp;
2728
}
2829

29-
void argsort(bool needs_bounds_check) {
30+
void argsort(bool needs_bounds_check, const uint row) {
3031
// bitonic sort
3132
const int col = int(gl_LocalInvocationID.x);
32-
const uint row = gl_WorkGroupID.y;
3333

3434
const uint row_offset = row * p.ncols;
3535

@@ -72,8 +72,16 @@ void argsort(bool needs_bounds_check) {
7272

7373
void main() {
7474
if (p.ncols == BLOCK_SIZE) {
75-
argsort(false);
75+
uint row = gl_WorkGroupID.y;
76+
while (row < p.nrows) {
77+
argsort(false, row);
78+
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
79+
}
7680
} else {
77-
argsort(true);
81+
uint row = gl_WorkGroupID.y;
82+
while (row < p.nrows) {
83+
argsort(true, row);
84+
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
85+
}
7886
}
7987
}

ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ layout (push_constant) uniform parameter {
2727
uint s1;
2828
uint s2;
2929
int sections[4];
30+
uint is_imrope;
3031
uint is_back;
3132
uint set_rows_stride;
3233
} p;

ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,29 @@ void main() {
3232
const uint sector = (i0 / 2) % sect_dims;
3333

3434
float theta_base = 0.0;
35-
if (sector < p.sections[0]) {
36-
theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
37-
}
38-
else if (sector >= p.sections[0] && sector < sec_w) {
39-
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
40-
}
41-
else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
42-
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
43-
}
44-
else if (sector >= sec_w + p.sections[2]) {
45-
theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
35+
if (p.is_imrope != 0) {
36+
if (sector % 3 == 1 && sector < 3 * p.sections[1]) {
37+
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
38+
} else if (sector % 3 == 2 && sector < 3 * p.sections[2]) {
39+
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
40+
} else if (sector % 3 == 0 && sector < 3 * p.sections[0]) {
41+
theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
42+
} else {
43+
theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
44+
}
45+
} else {
46+
if (sector < p.sections[0]) {
47+
theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
48+
}
49+
else if (sector >= p.sections[0] && sector < sec_w) {
50+
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
51+
}
52+
else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
53+
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
54+
}
55+
else if (sector >= sec_w + p.sections[2]) {
56+
theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
57+
}
4658
}
4759

4860
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;

ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
221221

222222
let is_neox = bool(params.mode & 2);
223223
let is_mrope = bool(params.mode & 8);
224+
let is_imrope = params.mode == 40;
224225
let is_vision = params.mode == 24;
225226

226227
var i = gid.x * 2; // start index for this thread
@@ -248,24 +249,36 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
248249
let sec_w = params.sections1 + params.sections0;
249250
let sec_e = params.sections2 + sec_w;
250251
let sector = (i0 / 2) % sect_dims;
251-
if (sector >= params.sections0 && sector < sec_w) {
252-
theta_base_mult = 1;
253-
if (is_vision) {
254-
theta_scale_pwr = sector - params.sections0;
255-
}
256-
} else if (sector >= sec_w && sector < sec_e) {
257-
theta_base_mult = 2;
258-
if (is_vision) {
259-
theta_scale_pwr = sector - sec_w;
260-
}
261-
} else if (sector >= sec_e) {
262-
if (is_vision) {
263-
theta_scale_pwr = sector - sec_e;
264-
theta_scale_pwr = (i0 / 2) % sec_e;
265-
}
266-
theta_base_mult = 3;
267-
} else if (is_vision) {
268-
theta_scale_pwr = sector;
252+
if (is_imrope) {
253+
if (sector % 3 == 1 && sector < 3 * params.sections1) {
254+
theta_base_mult = 1;
255+
} else if (sector % 3 == 2 && sector < 3 * params.sections2) {
256+
theta_base_mult = 2;
257+
} else if (sector % 3 == 0 && sector < 3 * params.sections0) {
258+
theta_base_mult = 0;
259+
} else {
260+
theta_base_mult = 3;
261+
}
262+
} else {
263+
if (sector >= params.sections0 && sector < sec_w) {
264+
theta_base_mult = 1;
265+
if (is_vision) {
266+
theta_scale_pwr = sector - params.sections0;
267+
}
268+
} else if (sector >= sec_w && sector < sec_e) {
269+
theta_base_mult = 2;
270+
if (is_vision) {
271+
theta_scale_pwr = sector - sec_w;
272+
}
273+
} else if (sector >= sec_e) {
274+
if (is_vision) {
275+
theta_scale_pwr = sector - sec_e;
276+
theta_scale_pwr = (i0 / 2) % sec_e;
277+
}
278+
theta_base_mult = 3;
279+
} else if (is_vision) {
280+
theta_scale_pwr = sector;
281+
}
269282
}
270283
}
271284
let theta_base = f32(src1[params.offset_src1 + i2 + params.ne2 * theta_base_mult]) * pow(params.theta_scale, f32(theta_scale_pwr));

0 commit comments

Comments
 (0)