
Commit c424e75

Merge branch 'ggml-org:master' into master

2 parents: cccff4a + dcca0d3

File tree: 17 files changed, +631 −53 lines


convert_hf_to_gguf.py

Lines changed: 32 additions & 1 deletion
@@ -1528,7 +1528,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
         self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
         self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
 
         # preprocessor config
         image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -9493,6 +9493,37 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [] # skip other tensors
 
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMVisionModel(MmprojModel):
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if not name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.COGVLM
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # block vision tensors
+        if name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######
 
 
ggml/src/ggml-cpu/ops.cpp

Lines changed: 90 additions & 16 deletions
@@ -7909,10 +7909,10 @@ void ggml_compute_forward_argsort(
 
 // ggml_compute_forward_flash_attn_ext
 
-static void ggml_compute_forward_flash_attn_ext_f16(
+static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
         const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
+        ggml_tensor * dst,
+        int ir0, int ir1) {
     const ggml_tensor * q = dst->src[0];
     const ggml_tensor * k = dst->src[1];
     const ggml_tensor * v = dst->src[2];
@@ -7928,9 +7928,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
-    const int ith = params->ith;
-    const int nth = params->nth;
-
     const int64_t DK = nek0;
     const int64_t DV = nev0;
     const int64_t N = neq1;
@@ -7964,16 +7961,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
 
     // parallelize by q rows using ggml_vec_dot_f32
 
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
     float scale = 1.0f;
     float max_bias = 0.0f;
     float logit_softcap = 0.0f;
@@ -8000,6 +7987,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_ASSERT((                            q_to_vec_dot) && "fattn: unsupported K-type");
     GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float  ) && "fattn: unsupported V-type");
 
+    int ith = params->ith;
+
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
@@ -8147,6 +8136,91 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     }
 }
 
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * q = dst->src[0];
+    const ggml_tensor * k = dst->src[1];
+    const ggml_tensor * v = dst->src[2];
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
+    GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
+    GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
+    GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
+    const int64_t DK = nek0;
+    const int64_t DV = nev0;
+    const int64_t N = neq1;
+
+    GGML_ASSERT(ne0 == DV);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == DK);
+    GGML_ASSERT(nek0 == DK);
+    GGML_ASSERT(nev0 == DV);
+
+    GGML_ASSERT(neq1 == N);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int64_t nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // disable for NUMA
+    const bool disable_chunking = ggml_is_numa();
+
+    // 4x chunks per thread
+    int nth_scaled = nth * 4;
+    int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+    int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+
+    if (nth == 1 || nchunk < nth || disable_chunking) {
+        nchunk = nth;
+    }
+
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
+        ggml_threadpool_chunk_set(params->threadpool, nth);
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // The number of elements in each chunk
+    const int64_t dr = (nr + nchunk - 1) / nchunk;
+
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;
+
+    while (current_chunk < nchunk) {
+        const int64_t ir0 = dr * current_chunk;
+        const int64_t ir1 = MIN(ir0 + dr, nr);
+
+        ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
+
+        current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+    }
+}
+
 void ggml_compute_forward_flash_attn_ext(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
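
Note on the change above: the old function assigned each thread one fixed row range; it is now split into a `_one_chunk` worker plus a dispatcher that over-decomposes the q rows into roughly 4 chunks per thread and lets threads claim leftover chunks from a shared counter, so faster threads absorb the tail instead of idling. A minimal sketch of the claiming loop, assuming a `process_rows` callback and a `std::atomic` standing in for the ggml threadpool counter:

#include <algorithm>
#include <atomic>
#include <cstdint>

// Shared counter standing in for ggml_threadpool_chunk_set/_add in the diff.
static std::atomic<int> next_chunk;

// Before the workers run, thread 0 seeds next_chunk with nth: chunks
// 0..nth-1 are pre-assigned one per thread, so the first free chunk is nth.
void worker(int ith, int64_t nr, int64_t nchunk,
            void (*process_rows)(int64_t ir0, int64_t ir1)) {
    const int64_t dr = (nr + nchunk - 1) / nchunk; // rows per chunk
    int current = ith;                             // start on our own chunk
    while (current < nchunk) {
        const int64_t ir0 = dr * current;
        const int64_t ir1 = std::min(ir0 + dr, nr);
        process_rows(ir0, ir1);                    // per-chunk attention work
        current = next_chunk.fetch_add(1);         // claim the next free chunk
    }
}

Chunking is disabled on NUMA systems (`ggml_is_numa()`), where keeping each thread on a fixed contiguous range preserves memory locality.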

ggml/src/ggml-cuda/argsort.cu

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ template<ggml_sort_order order>
 static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
     // bitonic sort
     int col = threadIdx.x;
-    int row = blockIdx.y;
+    int row = blockIdx.x;
 
     if (col >= ncols_pad) {
         return;
@@ -151,7 +151,7 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
     const int ncols_pad = next_power_of_2(ncols);
 
     const dim3 block_dims(ncols_pad, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
+    const dim3 block_nums(nrows, 1, 1);
     const size_t shared_mem = ncols_pad * sizeof(int);
 
     // FIXME: this limit could be raised by ~2-4x on Ampere or newer
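
Moving the row index from `blockIdx.y` to `blockIdx.x` matters because of CUDA grid limits: `gridDim.y` and `gridDim.z` are capped at 65535 blocks, while `gridDim.x` allows up to 2^31 - 1, so indexing rows on x lets argsort handle tensors with more than 65535 rows. An illustrative check of the bound (the helper is hypothetical; `cudaDeviceProp` and `maxGridSize` are standard CUDA):

#include <cassert>
#include <cuda_runtime.h>

// Hypothetical helper: rows on the x dimension clear the 65535 cap that
// applies to y and z; the assert shows the bound the old (1, nrows, 1)
// layout would have hit.
dim3 grid_for_rows(int nrows) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, /*device=*/0);
    assert(nrows <= prop.maxGridSize[0]); // x: 2^31 - 1 on modern GPUs
    return dim3(nrows, 1, 1);             // was dim3(1, nrows, 1): y caps at 65535
}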

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 18 additions & 2 deletions
@@ -190,12 +190,28 @@ static __global__ void mul_mat_vec_q(
 
     const uint32_t channel_bias = ids ? channel_x : channel_dst;
 
+    float x_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
+    float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
     if constexpr (has_fusion) {
         if (use_bias) {
             x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
+            // 1. Hide latency by prefetching bias and gate here
+            // 2. load only on threads that won't die after partial sum calculation
+            if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
+                (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
+                for (int j = 0; j < ncols_dst; ++j) {
+                    x_biases[j][threadIdx.x] = x_bias[j * stride_col_dst + threadIdx.x];
+                }
+            }
         }
         if (use_gate_bias) {
             gate_bias = gate_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
+            if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
+                (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
+                for (int j = 0; j < ncols_dst; ++j) {
+                    gate_biases[j][threadIdx.x] = gate_bias[j * stride_col_dst + threadIdx.x];
+                }
+            }
         }
     }
 
@@ -283,12 +299,12 @@ static __global__ void mul_mat_vec_q(
         float result = tmp[j][threadIdx.x];
         if constexpr (has_fusion) {
             if (use_bias) {
-                result += x_bias[j*stride_col_dst + threadIdx.x];
+                result += x_biases[j][threadIdx.x];
             }
             if (use_gate) {
                 float gate_value = tmp_gate[j][threadIdx.x];
                 if (use_gate_bias) {
-                    gate_value += gate_bias[j*stride_col_dst + threadIdx.x];
+                    gate_value += gate_biases[j][threadIdx.x];
                }
                switch (active_glu) {
                    case GGML_GLU_OP_SWIGLU:
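
This change loads the fused bias and gate-bias values into per-thread register arrays at the top of the kernel, before the quantized dot-product loop, instead of reading them in the writeback epilogue; only the threads that survive to do the writeback issue the loads, and their latency overlaps with the arithmetic. A condensed sketch of that load-early/use-late shape (kernel and names are hypothetical, not the actual mmvq code):

// Condensed sketch of prefetch-then-use: the global load is issued before a
// long compute chain, so its latency overlaps arithmetic instead of stalling
// the epilogue. All names here are made up for illustration.
__global__ void fused_epilogue(const float * __restrict__ bias,
                               const float * __restrict__ x,
                               float * __restrict__ dst, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= n) {
        return;
    }

    const float b  = bias[i]; // prefetch: issue the load long before its use
    const float xv = x[i];

    float acc = 0.0f;         // long dependency chain the loads can hide behind
    for (int k = 0; k < 1024; ++k) {
        acc = fmaf(acc, 0.999f, xv);
    }

    dst[i] = acc + b;         // use: the bias arrived while we were computing
}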

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 4 additions & 0 deletions
@@ -1082,6 +1082,7 @@ struct vk_op_soft_max_push_constants {
 
 struct vk_op_argsort_push_constants {
     uint32_t ncols;
+    uint32_t nrows;
    int32_t order;
 };
 
@@ -8708,6 +8709,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        break;
    case GGML_OP_ARGSORT:
        elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
+        elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
        break;
    case GGML_OP_IM2COL:
        {
@@ -9954,9 +9956,11 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
    int32_t * op_params = (int32_t *)dst->op_params;
 
    uint32_t ncols = src0->ne[0];
+    uint32_t nrows = ggml_nrows(src0);
 
    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
        ncols,
+        nrows,
        op_params[0],
    }, dryrun);
 }

ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp

Lines changed: 12 additions & 4 deletions
@@ -14,6 +14,7 @@ layout (binding = 1) buffer D {int data_d[];};
 
 layout (push_constant) uniform parameter {
    uint ncols;
+    uint nrows;
    uint order;
 } p;
 
@@ -26,10 +27,9 @@ void swap(uint idx0, uint idx1) {
    dst_row[idx1] = tmp;
 }
 
-void argsort(bool needs_bounds_check) {
+void argsort(bool needs_bounds_check, const uint row) {
    // bitonic sort
    const int col = int(gl_LocalInvocationID.x);
-    const uint row = gl_WorkGroupID.y;
 
    const uint row_offset = row * p.ncols;
 
@@ -72,8 +72,16 @@ void argsort(bool needs_bounds_check) {
 
 void main() {
    if (p.ncols == BLOCK_SIZE) {
-        argsort(false);
+        uint row = gl_WorkGroupID.y;
+        while (row < p.nrows) {
+            argsort(false, row);
+            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+        }
    } else {
-        argsort(true);
+        uint row = gl_WorkGroupID.y;
+        while (row < p.nrows) {
+            argsort(true, row);
+            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+        }
    }
 }
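
The Vulkan host and shader changes work as a pair: `ggml_vk_op_f32` clamps the dispatch's y-dimension to the device's `maxComputeWorkGroupCount[1]` and passes `nrows` via push constants, and the shader strides each workgroup forward by the total workgroup count so a clamped dispatch still covers every row. The equivalent pattern in CUDA terms is the grid-stride loop (a sketch with a made-up per-row body):

// Grid-stride loop over rows, the CUDA analogue of the shader's
// `row += gl_WorkGroupSize.y * gl_NumWorkGroups.y` walk: launch at most the
// device-limited number of blocks and let each block stride through the rest.
__global__ void scale_rows(float * data, int nrows, int ncols, float s) {
    for (int row = blockIdx.x; row < nrows; row += gridDim.x) {
        for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
            data[row*ncols + col] *= s; // stand-in for the per-row sort body
        }
    }
}

A host launch can then cap the block count, e.g. `scale_rows<<<std::min(nrows, 65535), 256>>>(data, nrows, ncols, s);`, without leaving rows unprocessed.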
