Skip to content

Commit f256169

Browse files
committed
Merge branch 'master' into huydt/bert-ja-support
2 parents 94184ae + 663445b commit f256169

File tree

7 files changed

+149
-37
lines changed

7 files changed

+149
-37
lines changed

convert_hf_to_gguf.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3889,6 +3889,12 @@ def _xlmroberta_set_vocab(self) -> None:
38893889
SentencePieceTokenTypes.UNKNOWN,
38903890
] + toktypes[3:-1]
38913891

3892+
if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
3893+
# Add mask token missing from sentencepiece.bpe.model
3894+
tokens[250001] = b'<mask>'
3895+
scores[250001] = 0.0
3896+
toktypes[250001] = SentencePieceTokenTypes.CONTROL
3897+
38923898
self.gguf_writer.add_tokenizer_model("t5")
38933899
self.gguf_writer.add_tokenizer_pre("default")
38943900
self.gguf_writer.add_token_list(tokens)

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 79 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1434,6 +1434,59 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
14341434
reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
14351435
}
14361436

1437+
template <int ElementsPerWI>
1438+
static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
1439+
const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
1440+
/*
1441+
Quantizes and reorders the resultant q8 tensor in a per row fashion
1442+
Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
1443+
*/
1444+
1445+
auto subgroup_id = it.get_group(0);
1446+
auto wi_id = it.get_local_id(0);
1447+
1448+
const int num_blocks_per_row = kx / QK8_1;
1449+
auto row = subgroup_id / num_blocks_per_row;
1450+
auto col = subgroup_id % num_blocks_per_row;
1451+
1452+
auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
1453+
auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
1454+
1455+
auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
1456+
auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
1457+
1458+
sycl::vec<float, ElementsPerWI> wi_f32_vals;
1459+
sycl::vec<int8_t, ElementsPerWI> quantized_values;
1460+
1461+
auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
1462+
wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
1463+
1464+
float sum = 0.0f;
1465+
float amax = 0.0f;
1466+
1467+
#pragma unroll(ElementsPerWI)
1468+
for (int i = 0; i < ElementsPerWI; i++) {
1469+
sum += wi_f32_vals[i];
1470+
amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
1471+
quantized_values[i] = 0;
1472+
}
1473+
sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus<float>());
1474+
amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum<float>());
1475+
float d = amax == 0 ? 1 : amax / 127;
1476+
1477+
#pragma unroll(ElementsPerWI)
1478+
for (int i = 0; i < ElementsPerWI; i++) {
1479+
quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
1480+
}
1481+
1482+
d = amax == 0 ? 0 : d;
1483+
1484+
*reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
1485+
if (wi_id == 0) {
1486+
*ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
1487+
}
1488+
}
1489+
14371490
static void mul_mat_p021_f16_f32(
14381491
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
14391492
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
@@ -1718,23 +1771,30 @@ static void pool2d_nchw_kernel(
17181771
o_ptr[cur_oh * ow + cur_ow] = res;
17191772
}
17201773

1721-
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
1722-
const int ky, const int kx_padded,
1723-
queue_ptr stream) {
1724-
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
1725-
const sycl::range<3> num_blocks(1, ky, block_num_x);
1726-
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
1727-
static_assert(QK8_1 % WARP_SIZE == 0);
1728-
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
1729-
{
1730-
dpct::has_capability_or_fail(stream->get_device(),
1731-
{sycl::aspect::fp16});
1774+
static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
1775+
bool reorder_q8_tensor, queue_ptr stream) {
1776+
if (reorder_q8_tensor) {
1777+
auto local_range = std::size_t(WARP_SIZE);
1778+
auto num_quant_blocks = ky * (kx / QK8_1);
1779+
auto global_range = num_quant_blocks * local_range;
1780+
stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
1781+
[=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1782+
quantize_and_reorder_q8_1<QK8_1 / WARP_SIZE>(x, vy, kx, kx_padded, it);
1783+
});
1784+
} else {
1785+
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
1786+
const sycl::range<3> num_blocks(1, ky, block_num_x);
1787+
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
1788+
static_assert(QK8_1 % WARP_SIZE == 0);
1789+
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
1790+
{
1791+
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
17321792

1733-
stream->parallel_for(
1734-
sycl::nd_range<3>(num_blocks * block_size, block_size),
1735-
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1736-
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
1737-
});
1793+
stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size),
1794+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1795+
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
1796+
});
1797+
}
17381798
}
17391799
}
17401800

@@ -2446,9 +2506,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
24462506
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
24472507

24482508
if (src1_on_device && src1_is_contiguous) {
2509+
bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder;
24492510
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
24502511
/*num_src=*/2, " : converting src1 to Q8_1");
2451-
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
2512+
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream);
24522513
/*
24532514
DPCT1010:90: SYCL uses exceptions to report errors and does not
24542515
use the error codes. The call was replaced with 0. You need to
@@ -2554,7 +2615,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
25542615
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
25552616
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
25562617
/*num_src=*/2, " : converting src1 to Q8_1");
2557-
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
2618+
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream);
25582619
/*
25592620
DPCT1010:92: SYCL uses exceptions to report errors and does
25602621
not use the error codes. The call was replaced with 0. You

ggml/src/ggml-sycl/mmvq.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
2929
static_assert(blocks_per_subgroup > 0);
3030
static_assert(block_elements_per_subgroup > 0);
3131

32-
const block_q8_1 * y = (const block_q8_1 *) vy;
33-
3432
float partial_sum = 0.0f;
3533
for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
3634
const int ibx = row * blocks_per_row + i; // x block index
@@ -40,13 +38,15 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
4038

4139
// Y block index that aligns with ibx
4240
const int iby = i * block_type::block_to_q8_1_ratio();
41+
const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1;
42+
const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2));
4343

4444
#pragma unroll
4545
for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
4646
// x block quant index when casting the quants to int
4747
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
4848

49-
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
49+
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs, nblocks);
5050
}
5151
}
5252

ggml/src/ggml-sycl/vecdotq.hpp

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -285,21 +285,21 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
285285
}
286286

287287
__dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
288-
const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
288+
const int8_t* q8_1_quant_ptr, const sycl::half2* q8_1_ds, const int & iqs, int /* nblocks */) {
289289
const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
290290
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
291291
int v[q4_0_traits::vdr_mmvq];
292292
int u[2 * q4_0_traits::vdr_mmvq];
293293

294-
#pragma unroll
295294

295+
#pragma unroll
296296
for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
297297
v[i] = get_int_from_uint8(bq4_0, iqs + i);
298-
u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
299-
u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
298+
u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
299+
u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
300300
}
301301

302-
return vec_dot_q4_0_q8_1_impl(v, u, d, bq8_1->ds);
302+
return vec_dot_q4_0_q8_1_impl(v, u, d, *q8_1_ds);
303303
};
304304
};
305305

@@ -347,7 +347,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
347347
using q4_k_traits = typename q4_k_block::traits;
348348

349349
float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
350-
const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
350+
const int8_t* q8_1_quant_ptr, const sycl::half2* q8_1_ds, const int & iqs, int nblocks) {
351351
const int ib = ibx_offset / (QK_K / 2);
352352

353353
const uint8_t * base = static_cast<const uint8_t *>(vbq);
@@ -360,7 +360,38 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
360360
const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
361361
const uint16_t * scales = (const uint16_t *) scs;
362362

363-
return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
363+
int v[2];
364+
int u[2 * QR4_K];
365+
float d8[QR4_K];
366+
367+
v[0] = q4[0];
368+
v[1] = q4[4];
369+
370+
uint16_t aux[2];
371+
const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
372+
if (j < 2) {
373+
aux[0] = scales[j + 0] & 0x3f3f;
374+
aux[1] = scales[j + 2] & 0x3f3f;
375+
} else {
376+
aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
377+
aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
378+
}
379+
380+
const uint8_t * sc = (const uint8_t *) aux;
381+
const uint8_t * m = sc + 2;
382+
383+
for (int i = 0; i < QR4_K; ++i) {
384+
const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
385+
sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
386+
387+
d8[i] = ds_values[0];
388+
389+
const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
390+
u[2 * i + 0] = q8[0];
391+
u[2 * i + 1] = q8[4];
392+
}
393+
394+
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8);
364395
}
365396
};
366397

ggml/src/gguf.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -347,25 +347,28 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
347347
int64_t n_tensors = 0;
348348

349349
if (ok && gr.read(ctx->version)) {
350+
if (ok && ctx->version == 0) {
351+
GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
352+
ok = false;
353+
}
354+
350355
/*
351356
* bit layout is different when reading non-native endian models.
352357
* assuming that the GGUF version is 3, the non-native endian model
353358
* would read it as 0x30000000. we can use the AND operation against
354359
* the last 4 hexadecimal digits to check if the model is the same
355360
* endianness as the host system.
356361
*/
357-
if ((ctx->version & 0x0000FFFF) == 0x00000000) {
362+
if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
358363
GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
359-
gguf_free(ctx);
360-
return nullptr;
364+
ok = false;
361365
}
362366

363-
GGML_ASSERT(ctx->version > 0 && ctx->version <= 65535);
364-
if (ctx->version == 1) {
367+
if (ok && ctx->version == 1) {
365368
GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
366369
ok = false;
367370
}
368-
if (ctx->version > GGUF_VERSION) {
371+
if (ok && ctx->version > GGUF_VERSION) {
369372
GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
370373
__func__, ctx->version, GGUF_VERSION);
371374
ok = false;

src/llama-vocab.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2080,9 +2080,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
20802080

20812081
std::string model_name;
20822082
std::string tokenizer_pre;
2083+
std::string general_arch;
20832084

20842085
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
20852086
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2087+
ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
20862088

20872089
// model name to lowercase
20882090
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2091,8 +2093,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
20912093
}
20922094
);
20932095

2094-
// set attributes by model/tokenizer name
2095-
if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
2096+
// set attributes by model/tokenizer/architecture name
2097+
if (false
2098+
|| _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2099+
|| _contains_any(general_arch, {"nomic-bert-moe"})
2100+
) {
20962101
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
20972102
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
20982103
for (auto id : cache_special_tokens) {

tests/test-gguf.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ constexpr int offset_has_data = 3000;
1616

1717
enum handcrafted_file_type {
1818
HANDCRAFTED_HEADER_BAD_MAGIC = 10,
19+
HANDCRAFTED_HEADER_BAD_VERSION_0 = 15,
1920
HANDCRAFTED_HEADER_BAD_VERSION_1 = 20,
2021
HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30,
2122
HANDCRAFTED_HEADER_BAD_N_TENSORS = 40,
@@ -51,6 +52,7 @@ enum handcrafted_file_type {
5152
static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
5253
switch (hft) {
5354
case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC";
55+
case HANDCRAFTED_HEADER_BAD_VERSION_0: return "HEADER_BAD_VERSION_0";
5456
case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1";
5557
case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE";
5658
case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV";
@@ -171,7 +173,10 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
171173
helper_write(file, GGUF_MAGIC, 4);
172174
}
173175

174-
if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
176+
if (hft == HANDCRAFTED_HEADER_BAD_VERSION_0) {
177+
const uint32_t version = 0;
178+
helper_write(file, version);
179+
} else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
175180
const uint32_t version = 1;
176181
helper_write(file, version);
177182
} else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) {
@@ -660,6 +665,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
660665

661666
const std::vector<handcrafted_file_type> hfts = {
662667
HANDCRAFTED_HEADER_BAD_MAGIC,
668+
HANDCRAFTED_HEADER_BAD_VERSION_0,
663669
HANDCRAFTED_HEADER_BAD_VERSION_1,
664670
HANDCRAFTED_HEADER_BAD_VERSION_FUTURE,
665671
HANDCRAFTED_HEADER_BAD_N_KV,

0 commit comments

Comments
 (0)