
Commit 0ce5d32

Merge branch 'ggml-org:master' into master

2 parents: d087b13 + 7ab3643

12 files changed: +900 −674 lines

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -1238,7 +1238,7 @@ jobs:
       cmake -G "Unix Makefiles" -B build -S . `
         -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
         -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-        -DCMAKE_CXX_FLAGS="-Irocwmma/library/include/" `
+        -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
         -DCMAKE_BUILD_TYPE=Release `
         -DGGML_HIP=ON `
         -DGGML_HIP_ROCWMMA_FATTN=ON `
@@ -1294,7 +1294,7 @@ jobs:
       cmake -G "Unix Makefiles" -B build -S . `
         -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
         -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-        -DCMAKE_CXX_FLAGS="-Irocwmma/library/include/" `
+        -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
         -DCMAKE_BUILD_TYPE=Release `
         -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
         -DGGML_HIP_ROCWMMA_FATTN=ON `
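Why this works: the old flag passed a relative include path, which only resolves while the compiler's working directory happens to be the checkout root. `$($PWD.Path.Replace('\', '/'))` inlines the runner's current directory as an absolute path with forward slashes, which HIP's clang accepts on Windows. Assuming, for illustration only, a checkout at D:\a\llama.cpp\llama.cpp with rocwmma cloned at its root, the flag would expand to roughly:

    -DCMAKE_CXX_FLAGS="-ID:/a/llama.cpp/llama.cpp/rocwmma/library/include/" `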

ci/run.sh

Lines changed: 4 additions & 4 deletions
@@ -352,10 +352,10 @@ function gg_run_open_llama_7b_v2 {

     (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

 function check_ppl {
     qnt="$1"

common/minja/minja.hpp

Lines changed: 37 additions & 5 deletions
@@ -1378,13 +1378,27 @@ struct ArgumentsExpression {
     }
 };

-static std::string strip(const std::string & s) {
-    auto start = s.find_first_not_of(" \t\n\r");
+static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
+    auto charset = chars.empty() ? " \t\n\r" : chars;
+    auto start = left ? s.find_first_not_of(charset) : 0;
     if (start == std::string::npos) return "";
-    auto end = s.find_last_not_of(" \t\n\r");
+    auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
     return s.substr(start, end - start + 1);
 }

+static std::vector<std::string> split(const std::string & s, const std::string & sep) {
+    std::vector<std::string> result;
+    size_t start = 0;
+    size_t end = s.find(sep);
+    while (end != std::string::npos) {
+        result.push_back(s.substr(start, end - start));
+        start = end + sep.length();
+        end = s.find(sep, start);
+    }
+    result.push_back(s.substr(start));
+    return result;
+}
+
 static std::string capitalize(const std::string & s) {
     if (s.empty()) return s;
     auto result = s;
@@ -1467,8 +1481,26 @@ class MethodCallExpr : public Expression {
         } else if (obj.is_string()) {
             auto str = obj.get<std::string>();
             if (method->get_name() == "strip") {
-                vargs.expectArgs("strip method", {0, 0}, {0, 0});
-                return Value(strip(str));
+                vargs.expectArgs("strip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars));
+            } else if (method->get_name() == "lstrip") {
+                vargs.expectArgs("lstrip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars, /* left= */ true, /* right= */ false));
+            } else if (method->get_name() == "rstrip") {
+                vargs.expectArgs("rstrip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars, /* left= */ false, /* right= */ true));
+            } else if (method->get_name() == "split") {
+                vargs.expectArgs("split method", {1, 1}, {0, 0});
+                auto sep = vargs.args[0].get<std::string>();
+                auto parts = split(str, sep);
+                Value result = Value::array();
+                for (const auto& part : parts) {
+                    result.push_back(Value(part));
+                }
+                return result;
             } else if (method->get_name() == "capitalize") {
                 vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
                 return Value(capitalize(str));
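The new string methods mirror Python's str.strip/lstrip/rstrip (optional character set) and str.split (required separator, empty fields preserved). As a sanity check, here is a standalone C++ sketch: the two helpers are copied verbatim from the diff above, while the assertions are mine and only illustrate the expected behavior, with the equivalent Jinja expressions in comments:

    #include <cassert>
    #include <string>
    #include <vector>

    static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
        auto charset = chars.empty() ? " \t\n\r" : chars;
        auto start = left ? s.find_first_not_of(charset) : 0;
        if (start == std::string::npos) return "";
        auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
        return s.substr(start, end - start + 1);
    }

    static std::vector<std::string> split(const std::string & s, const std::string & sep) {
        std::vector<std::string> result;
        size_t start = 0;
        size_t end = s.find(sep);
        while (end != std::string::npos) {
            result.push_back(s.substr(start, end - start));
            start = end + sep.length();
            end = s.find(sep, start);
        }
        result.push_back(s.substr(start));
        return result;
    }

    int main() {
        assert(strip("  hi \n") == "hi");                                              // {{ "  hi \n".strip() }}
        assert(strip("xxhixx", "x", /* left= */ true,  /* right= */ false) == "hixx"); // {{ "xxhixx".lstrip("x") }}
        assert(strip("xxhixx", "x", /* left= */ false, /* right= */ true ) == "xxhi"); // {{ "xxhixx".rstrip("x") }}
        assert((split("a,b,,c", ",") == std::vector<std::string>{"a", "b", "", "c"})); // {{ "a,b,,c".split(",") }}
    }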

examples/server/server.cpp

Lines changed: 10 additions & 9 deletions
@@ -1312,7 +1312,7 @@ struct server_slot {
         return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
     }

-    bool can_batch_with(server_slot & other_slot) {
+    bool can_batch_with(server_slot & other_slot) const {
         return is_non_causal() == other_slot.is_non_causal()
             && are_lora_equal(lora, other_slot.lora);
     }
@@ -1900,6 +1900,7 @@ struct server_context {
         try {
             common_chat_format_example(chat_templates.get(), params.use_jinja);
         } catch (const std::exception & e) {
+            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
             SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             chat_templates = common_chat_templates_init(model, "chatml");
         }
@@ -2156,14 +2157,6 @@ struct server_context {
         }

         if (slot.has_new_line) {
-            // if we have already seen a new line, we stop after a certain time limit
-            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
-                slot.stop = STOP_TYPE_LIMIT;
-                slot.has_next_token = false;
-
-                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
-            }
-
             // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
             if (slot.params.n_indent > 0) {
                 // check the current indentation
@@ -2202,6 +2195,14 @@ struct server_context {
             // check if there is a new line in the generated text
             if (result.text_to_send.find('\n') != std::string::npos) {
                 slot.has_new_line = true;
+
+                // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+                if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                    slot.stop = STOP_TYPE_LIMIT;
+                    slot.has_next_token = false;
+
+                    SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+                }
             }

             // if context shift is disabled, we stop when it reaches the context limit
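The subtle part of the last two hunks is not that the time-limit check moved, but when it now fires: previously it ran on every token once any newline had been seen; now it runs only while processing a token that itself contains a newline, so an over-budget generation stops at a line boundary instead of mid-line. A hedged standalone sketch of the new predicate (names mirror the slot fields but are hypothetical, not llama.cpp API):

    #include <cstdint>

    // Stop on the time limit, but only when the current token starts a new
    // line, so the response is cut at a line boundary rather than mid-line.
    bool stop_on_time_limit(int64_t now_us, int64_t t_start_generation_us,
                            int64_t t_max_predict_ms, bool token_has_newline) {
        if (!token_has_newline || t_max_predict_ms <= 0) {
            return false; // the limit is only consulted when a fresh newline arrives
        }
        return (now_us - t_start_generation_us) > 1000 * t_max_predict_ms;
    }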

ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 17 additions & 5 deletions
@@ -11718,9 +11718,12 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const

 #elif defined __AVX2__

-    const __m256i mask = _mm256_set1_epi16(2 * 0x7);
+    const __m256i mask = _mm256_set1_epi16(0x7);
     const __m256i mone = _mm256_set1_epi16(1);
     const __m256i mone8 = _mm256_set1_epi8(1);
+    const __m256i mtwo8 = _mm256_set1_epi8(2);
+    // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
+    const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);

     __m256 accum1 = _mm256_setzero_ps();
     __m256 accum2 = _mm256_setzero_ps();
@@ -11732,6 +11735,14 @@
         const uint16_t * sc = (const uint16_t *)x[i].scales;

         scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+        // Extract 3-bit scales (16 values)
+        __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
+        scales = _mm256_srlv_epi64(scales, scales_shift);
+        scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
+
+        // Indices to repeat each scale 8 times.
+        __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
+        __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));

         __m256i sumi1 = _mm256_setzero_si256();
         __m256i sumi2 = _mm256_setzero_si256();
@@ -11777,11 +11788,12 @@
             const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
             const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));

-            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 2), _mm_set1_epi16(sc[ib/2] << 1));
-            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 8), _mm_set1_epi16(sc[ib/2] >> 5));
+            __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
+            __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
+
+            scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
+            scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);

-            scale1 = _mm256_add_epi16(_mm256_and_si256(scale1, mask), mone);
-            scale2 = _mm256_add_epi16(_mm256_and_si256(scale2, mask), mone);
             const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
             const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
             const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
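What the new code precomputes: instead of re-deriving two broadcast scales from sc[ib/2] on every sub-block iteration, all 16 sub-block scales are unpacked once per block with a variable 64-bit shift (_mm256_srlv_epi64) and then selected per iteration with a single VPSHUFB each. If I read the bit layout right, the scalar equivalent of the unpacking is (hypothetical standalone form, not the shipped code):

    #include <stdint.h>

    // Sub-block n (0..15) takes a 3-bit field from the four 16-bit scale words
    // and widens it to the odd value 2*s + 1, exactly what the old
    // _mm256_and_si256/_mm256_add_epi16 pair produced per iteration.
    static inline int16_t iq1_m_sub_scale(const uint16_t sc[4], int n) {
        return (int16_t) (2 * ((sc[n / 4] >> (3 * (n % 4))) & 0x7) + 1);
    }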

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 141 additions & 2 deletions
@@ -6648,14 +6648,143 @@ static void ggml_compute_forward_repeat_back(

 // ggml_compute_forward_concat

+static void ggml_compute_forward_concat_any(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    const size_t len = ggml_type_size(src0->type);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const char * x;
+
+    // TODO: smarter multi-threading
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03;
+                    } else {
+                        x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
+                    }
+
+                    char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
+
+                    memcpy(y, x, len);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat_i8(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const int8_t * x;
+
+    // TODO: smarter multi-threading
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const int8_t *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+                    }
+
+                    int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const ggml_fp16_t * x;
+
+    // TODO: smarter multi-threading
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const ggml_fp16_t *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+                    }
+
+                    ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
 static void ggml_compute_forward_concat_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {

     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float));

     const int ith = params->ith;
     const int nth = params->nth;
@@ -6698,14 +6827,24 @@ static void ggml_compute_forward_concat(
     const struct ggml_tensor * src0 = dst->src[0];

     switch (src0->type) {
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_I16:
+            {
+                ggml_compute_forward_concat_f16(params, dst);
+            } break;
+        case GGML_TYPE_I8:
+            {
+                ggml_compute_forward_concat_i8(params, dst);
+            } break;
         case GGML_TYPE_F32:
         case GGML_TYPE_I32:
             {
                 ggml_compute_forward_concat_f32(params, dst);
             } break;
         default:
             {
-                GGML_ABORT("fatal error");
+                ggml_compute_forward_concat_any(params, dst);
             }
     }
 }
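Taken together, GGML_OP_CONCAT on the CPU backend no longer aborts for non-F32 types: F16/BF16/I16 share a 2-byte copy loop, I8 gets a 1-byte loop, F32/I32 keep the existing path, and any other type falls back to a generic per-element memcpy of ggml_type_size(src0->type) bytes. A hedged usage sketch under the assumption that the public ggml API of this checkout matches the usual one (ggml_graph_compute_with_ctx may live in ggml.h or ggml-cpu.h depending on the revision):

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main(void) {
        struct ggml_init_params ip = { /* mem_size   */ 16*1024*1024,
                                       /* mem_buffer */ NULL,
                                       /* no_alloc   */ false };
        struct ggml_context * ctx = ggml_init(ip);

        // two F16 tensors that differ only along dim 2 -- before this commit,
        // concatenating them on the CPU backend hit GGML_ABORT("fatal error")
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 8, 4, 2);
        struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 8, 4, 3);
        struct ggml_tensor * c = ggml_concat(ctx, a, b, /* dim = */ 2); // ne = [8, 4, 5]

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);
        ggml_graph_compute_with_ctx(ctx, gf, /* n_threads = */ 4);

        ggml_free(ctx);
        return 0;
    }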

ggml/src/ggml-metal/CMakeLists.txt

Lines changed: 2 additions & 3 deletions
@@ -88,9 +88,8 @@ else()

     add_custom_command(
         OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-        COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+        COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - |
+                xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
         COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
         COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
         DEPENDS ggml-metal.metal ${METALLIB_COMMON}
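The change fuses the two-step build (metal compiler to an .air file, then metallib) into one pipeline by writing the AIR to stdout and reading it from stdin, so no intermediate ggml-metal.air is created or needs cleaning up. Run outside CMake, the equivalent command is roughly:

    xcrun -sdk macosx metal ${XC_FLAGS} -c ggml-metal.metal -o - | xcrun -sdk macosx metallib - -o default.metallib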
