Merge branch 'upstream' into concedo_experimental

LostRuins · LostRuins · commit 4abea4b5c9d7 · 2025-07-21T23:37:42.000+08:00
# Conflicts:
#	README.md
#	docs/build.md
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/kleidiai/kernels.cpp
#	ggml/src/ggml-cpu/kleidiai/kernels.h
#	ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
#	tests/test-backend-ops.cpp
#	tools/server/README.md
diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu
@@ -10,7 +10,7 @@ static  __global__ void im2col_kernel(
         return;
     }
 
-    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t  ksize = OW * KH;
     const int64_t  kx = i / ksize;
     const int64_t  kd = kx * ksize;
     const int64_t  ky = (i - kd) / OW;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
@@ -40,12 +40,10 @@ void main() {
     const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
     const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
     const int oh_s1 = int(oh) * p.s1;
-    const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
+    const uint ksize = p.OW * p.KH;
 
     const uint base_linear_idx = gidx * NUM_ITER;
 
-    const uint max_ky = ksize / p.OW;
-
     uint current_kx = base_linear_idx / ksize;
     const uint rem = base_linear_idx - (current_kx * ksize);
     uint current_ky = rem / p.OW;
@@ -76,7 +74,7 @@ void main() {
 
         if (++current_ix == p.OW) {
             current_ix = 0;
-            if (++current_ky == max_ky) {
+            if (++current_ky == p.KH) {
                 current_ky = 0;
                 current_kx++;
             }
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
@@ -786,14 +786,17 @@ int main(int argc, char ** argv) {
                 }
 
                 // check for reverse prompt using special tokens
-                llama_token last_token = common_sampler_last(smpl);
-                for (auto token : antiprompt_token) {
-                    if (token == last_token) {
-                        if (params.interactive) {
-                            is_interacting = true;
+                // avoid calling common_sampler_last() if last_output is empty
+                if (!last_output.empty()) {
+                    llama_token last_token = common_sampler_last(smpl);
+                    for (auto token : antiprompt_token) {
+                        if (token == last_token) {
+                            if (params.interactive) {
+                                is_interacting = true;
+                            }
+                            is_antiprompt = true;
+                            break;
                         }
-                        is_antiprompt = true;
-                        break;
                     }
                 }
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -4516,9 +4516,10 @@ int main(int argc, char ** argv) {
         json tokens_response = json::array();
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
+            const bool parse_special = json_value(body, "parse_special", true);
             const bool with_pieces = json_value(body, "with_pieces", false);
 
-            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special);
 
             if (with_pieces) {
                 for (const auto& token : tokens) {

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ static __global__ void im2col_kernel(`
`10`	`10`	`return;`
`11`	`11`	`}`
`12`	`12`
`13`		`- const int64_t ksize = OW * (KH > 1 ? KW : 1);`
	`13`	`+ const int64_t ksize = OW * KH;`
`14`	`14`	`const int64_t kx = i / ksize;`
`15`	`15`	`const int64_t kd = kx * ksize;`
`16`	`16`	`const int64_t ky = (i - kd) / OW;`
Original file line number	Diff line number	Diff line change
`@@ -786,14 +786,17 @@ int main(int argc, char ** argv) {`
`786`	`786`	`}`
`787`	`787`
`788`	`788`	`// check for reverse prompt using special tokens`
`789`		`- llama_token last_token = common_sampler_last(smpl);`
`790`		`- for (auto token : antiprompt_token) {`
`791`		`- if (token == last_token) {`
`792`		`- if (params.interactive) {`
`793`		`- is_interacting = true;`
	`789`	`+ // avoid calling common_sampler_last() if last_output is empty`
	`790`	`+ if (!last_output.empty()) {`
	`791`	`+ llama_token last_token = common_sampler_last(smpl);`
	`792`	`+ for (auto token : antiprompt_token) {`
	`793`	`+ if (token == last_token) {`
	`794`	`+ if (params.interactive) {`
	`795`	`+ is_interacting = true;`
	`796`	`+ }`
	`797`	`+ is_antiprompt = true;`
	`798`	`+ break;`
`794`	`799`	`}`
`795`		`- is_antiprompt = true;`
`796`		`- break;`
`797`	`800`	`}`
`798`	`801`	`}`
`799`	`802`