Skip to content

Commit 9cb010c

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 6ea32dc + 74f52f7 commit 9cb010c

File tree

4 files changed

+75
-2
lines changed

4 files changed

+75
-2
lines changed

ggml/src/ggml-cuda/vecdotq.cuh

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,58 @@ static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32
2828
return ((const int *) x)[i32]; // assume at least 4 byte alignment
2929
}
3030

31+
// q4 contains 8 indices with 4 bit each.
32+
// This function selects those bytes from table that are at those indices and returns them as int2.
33+
// The first int contains the bytes with even indices in q4, the second int contains the bytes with odd indices in q4.
3134
static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, const int8_t * table) {
35+
#if defined(GGML_USE_HIP)
36+
// Load the 16-byte table into four 32-bit unsigned integers.
37+
const uint32_t *values = (const uint32_t *)table;
38+
39+
const uint32_t q_even = q4;
40+
const uint32_t q_odd = (q4 >> 4);
41+
42+
// Perform lookups in the lower half of the table (indices 0-7).
43+
uint32_t v_even_low = __builtin_amdgcn_perm(values[1], values[0], q_even & 0x07070707);
44+
uint32_t v_odd_low = __builtin_amdgcn_perm(values[1], values[0], q_odd & 0x07070707);
45+
46+
// Perform lookups in the upper half of the table (indices 8-15).
47+
uint32_t v_even_high = __builtin_amdgcn_perm(values[3], values[2], q_even & 0x07070707);
48+
uint32_t v_odd_high = __builtin_amdgcn_perm(values[3], values[2], q_odd & 0x07070707);
49+
50+
// Select between the low and high results based on the MSB of each index nibble.
51+
uint32_t mask_even = 0x03020100 | ((q_even & 0x08080808) >> 1);
52+
uint32_t res_x = __builtin_amdgcn_perm(v_even_high, v_even_low, mask_even);
53+
uint32_t mask_odd = 0x03020100 | ((q_odd & 0x08080808) >> 1);
54+
uint32_t res_y = __builtin_amdgcn_perm(v_odd_high, v_odd_low, mask_odd);
55+
56+
return make_int2(res_x, res_y);
57+
#elif !defined(GGML_USE_MUSA)
58+
// CUDA does not have an instruction for selecting bytes with 4 bit indices.
59+
// However, __byte_perm is an instruction that selects bytes with 3 bit indices that can be used instead.
60+
const uint32_t * table32 = (const uint32_t *) table;
61+
62+
// __byte_perm selects bytes based on the lower 16 bits in its third argument.
63+
// Therefore, do 2 iterations over the 32 bits in q4 with 0 and 16 shift.
64+
// To handle the fourth bit, first call __byte_perm both for the low and the high 64 bit of table, using the low 3 bits.
65+
// Then, call __byte_perm again to select from the low and high bytes based on the fourth bit.
66+
uint32_t tmp[2];
67+
const uint32_t low_high_selection_indices = (0x32103210 | ((q4 & 0x88888888) >> 1));
68+
#pragma unroll
69+
for (uint32_t i = 0; i < 2; ++i) {
70+
const uint32_t shift = 16 * i;
71+
72+
const uint32_t low = __byte_perm(table32[0], table32[1], q4 >> shift);
73+
const uint32_t high = __byte_perm(table32[2], table32[3], q4 >> shift);
74+
tmp[i] = __byte_perm(low, high, low_high_selection_indices >> shift);
75+
}
76+
77+
// tmp contains the bytes from table in the same order as the 4 bit indices in q4.
78+
// However, for the result we need ints with all even/odd 4 bit indices in q4.
79+
// Therefore, 2 more calls to __byte_perm to put the bytes in the correct order.
80+
return make_int2(__byte_perm(tmp[0], tmp[1], 0x6420), __byte_perm(tmp[0], tmp[1], 0x7531));
81+
#else
82+
// Generic implementation.
3283
const int q0_32 = (q4 >> 0) & 0x0F0F0F0F;
3384
const int8_t * q0_8 = (const int8_t *) &q0_32;
3485
const char4 val0_8 = make_char4(
@@ -40,6 +91,7 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
4091
table[q1_8[0]], table[q1_8[1]], table[q1_8[2]], table[q1_8[3]]);
4192

4293
return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
94+
#endif
4395
}
4496

4597
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2647,8 +2647,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
26472647
return op->src[0]->type == GGML_TYPE_F32;
26482648
case GGML_OP_SOFT_MAX:
26492649
case GGML_OP_NORM:
2650-
case GGML_OP_RMS_NORM:
26512650
return true;
2651+
case GGML_OP_RMS_NORM:
2652+
return op->ne[0] % 4 == 0 && ggml_is_contiguous_rows(op->src[0]);
26522653
case GGML_OP_REPEAT:
26532654
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
26542655
case GGML_OP_PAD:

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2183,7 +2183,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
21832183
const uint32_t mul_mat_subgroup_size_32 = std::max(mul_mat_subgroup_size, 32u);
21842184

21852185
const bool subgroup_min_size_16 = (!device->subgroup_size_control && device->subgroup_size >= 16) ||
2186-
(device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16);
2186+
(device->subgroup_size_control && device->subgroup_max_size >= 16);
21872187

21882188
// mulmat
21892189
std::vector<uint32_t> l_warptile, m_warptile, s_warptile,

tests/test-backend-ops.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2209,6 +2209,26 @@ struct test_count_equal : public test_case {
22092209
double max_nmse_err() override {
22102210
return 0.0;
22112211
}
2212+
2213+
void initialize_tensors(ggml_context * ctx) override {
2214+
std::random_device rd;
2215+
std::default_random_engine rng(rd());
2216+
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
2217+
if (t->type == GGML_TYPE_F32) {
2218+
// initialize with unique values to avoid ties
2219+
for (int64_t r = 0; r < ggml_nrows(t); r++) {
2220+
std::vector<float> data(t->ne[0]);
2221+
for (int i = 0; i < t->ne[0]; i++) {
2222+
data[i] = i;
2223+
}
2224+
std::shuffle(data.begin(), data.end(), rng);
2225+
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
2226+
}
2227+
} else {
2228+
init_tensor_uniform(t);
2229+
}
2230+
}
2231+
}
22122232
};
22132233

22142234
// GGML_OP_REPEAT

0 commit comments

Comments
 (0)