Commit ec047e1

Merge remote-tracking branch 'upstream/master' into backend-sampling
2 parents 9e5e09d + 583cb83 commit ec047e1

21 files changed: +656, -109 lines

CODEOWNERS

Lines changed: 8 additions & 23 deletions
@@ -2,10 +2,8 @@
 # multiplie collaborators per item can be specified

 /.devops/*.Dockerfile @ngxson
-/.github/actions/ @slaren @CISC
+/.github/actions/ @CISC
 /.github/workflows/ @CISC
-/.github/workflows/release.yml @slaren
-/.github/workflows/winget.yml @slaren
 /ci/ @ggerganov
 /cmake/ @ggerganov
 /common/CMakeLists.txt @ggerganov
@@ -40,41 +38,34 @@
 /examples/passkey/ @ggerganov
 /examples/retrieval/ @ggerganov
 /examples/save-load-state/ @ggerganov
-/examples/simple-chat/ @slaren
-/examples/simple/ @slaren
 /examples/speculative-simple/ @ggerganov
 /examples/speculative/ @ggerganov
 /ggml/cmake/ @ggerganov
-/ggml/include/ @ggerganov @slaren
-/ggml/src/ggml-alloc.c @slaren
-/ggml/src/ggml-backend* @slaren
-/ggml/src/ggml-blas/ @slaren
-/ggml/src/ggml-common.h @ggerganov @slaren
-/ggml/src/ggml-cpu/ @ggerganov @slaren
+/ggml/include/ @ggerganov
+/ggml/src/ggml-common.h @ggerganov
+/ggml/src/ggml-cpu/ @ggerganov
 /ggml/src/ggml-cpu/spacemit/ @alex-spacemit
-/ggml/src/ggml-cuda/common.cuh @slaren
 /ggml/src/ggml-cuda/fattn* @JohannesGaessler
-/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
 /ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
 /ggml/src/ggml-cuda/mmq.* @JohannesGaessler
 /ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
 /ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
 /ggml/src/ggml-cuda/fattn-wmma* @IMbackK
 /ggml/src/ggml-hip/ @IMbackK
 /ggml/src/ggml-cuda/vendors/hip.h @IMbackK
-/ggml/src/ggml-impl.h @ggerganov @slaren
+/ggml/src/ggml-impl.h @ggerganov
 /ggml/src/ggml-metal/ @ggerganov
 /ggml/src/ggml-opencl/ @lhez @max-krasnyansky
 /ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
 /ggml/src/ggml-opt.cpp @JohannesGaessler
 /ggml/src/ggml-quants.* @ggerganov
 /ggml/src/ggml-rpc/ @rgerganov
-/ggml/src/ggml-threading.* @ggerganov @slaren
+/ggml/src/ggml-threading.* @ggerganov
 /ggml/src/ggml-vulkan/ @0cc4m
 /ggml/src/ggml-webgpu/ @reeselevine
 /ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml.c @ggerganov @slaren
-/ggml/src/ggml.cpp @ggerganov @slaren
+/ggml/src/ggml.c @ggerganov
+/ggml/src/ggml.cpp @ggerganov
 /ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
 /gguf-py/ @CISC
 /media/ @ggerganov
@@ -86,15 +77,11 @@
 /src/llama-arch.* @CISC
 /src/llama-chat.* @ngxson
 /src/llama-graph.* @CISC
-/src/llama-model-loader.* @slaren
 /src/llama-model.* @CISC
 /src/llama-vocab.* @CISC
 /src/models/ @CISC
 /tests/ @ggerganov
-/tests/test-backend-ops.cpp @slaren
-/tests/test-thread-safety.cpp @slaren
 /tools/batched-bench/ @ggerganov
-/tools/llama-bench/ @slaren
 /tools/main/ @ggerganov
 /tools/mtmd/ @ngxson
 /tools/perplexity/ @ggerganov
@@ -106,8 +93,6 @@
 /tools/tokenize/ @ggerganov
 /tools/tts/ @ggerganov
 /vendor/ @ggerganov
-/.clang-format @slaren
-/.clang-tidy @slaren
 /AUTHORS @ggerganov
 /CMakeLists.txt @ggerganov
 /CONTRIBUTING.md @ggerganov

convert_hf_to_gguf.py

Lines changed: 33 additions & 2 deletions
@@ -10061,6 +10061,25 @@ class LazyTorchTensor(gguf.LazyBase):
         torch.uint8: np.uint8,
     }

+    # only used when byteswapping data. Only correct size is needed
+    _dtype_byteswap_map: dict[torch.dtype, type] = {
+        torch.float64: np.float64,
+        torch.float32: np.float32,
+        torch.bfloat16: np.float16,
+        torch.float16: np.float16,
+        torch.int64: np.int64,
+        torch.uint64: np.uint64,
+        torch.int32: np.int32,
+        torch.uint32: np.uint32,
+        torch.int16: np.int16,
+        torch.uint16: np.uint16,
+        torch.int8: np.int8,
+        torch.uint8: np.uint8,
+        torch.bool: np.uint8,
+        torch.float8_e4m3fn: np.uint8,
+        torch.float8_e5m2: np.uint8,
+    }
+
     # used for safetensors slices
     # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
     # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
@@ -10104,19 +10123,31 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
     @classmethod
     def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
         def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
+            def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
+                if sys.byteorder == 'big':
+                    # switch data back to big endian
+                    tensor = tensor.view(dtype).byteswap(inplace=False)
+                return tensor
             dtype = cls._dtype_str_map[tensor.dtype]
-            return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
+            numpy_dtype = cls._dtype_byteswap_map[dtype]
+            return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape)
         dtype = cls._dtype_str_map[t.dtype]
         shape = t.shape
         lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
         return cast(torch.Tensor, lazy)

     @classmethod
     def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
+        def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
+            if sys.byteorder == 'big':
+                # switch data back to big endian
+                tensor = tensor.view(dtype).byteswap(inplace=False)
+            return tensor
         dtype = cls._dtype_str_map[remote_tensor.dtype]
+        numpy_dtype = cls._dtype_byteswap_map[dtype]
         shape = remote_tensor.shape
         meta = cls.meta_with_dtype_and_shape(dtype, shape)
-        lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape))
+        lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape))
         return cast(torch.Tensor, lazy)

     @classmethod
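Note on the byteswap helpers above: the tensor payloads are stored little-endian, so on a big-endian host the raw bytes have to be reinterpreted at the right element width and byteswapped before torch sees them; per the commit's own comment, _dtype_byteswap_map only needs a numpy type of the correct size, which is why e.g. torch.bfloat16 maps to np.float16. A minimal standalone sketch of the same pattern, using a hypothetical little-endian buffer (illustration only, not code from this commit):

    import sys

    import numpy as np

    # hypothetical little-endian float32 payload, as it would sit in a mmap'd file
    raw = np.array([1.0, 2.0, 3.0], dtype='<f4').tobytes()

    vals = np.frombuffer(raw, dtype=np.uint8).view(np.float32)  # reinterpret at element width

    if sys.byteorder == 'big':
        # native order is big endian: swap the bytes so the values read correctly
        # (inplace=False returns a swapped copy and leaves the source buffer untouched)
        vals = vals.byteswap(inplace=False)

    print(vals)  # [1. 2. 3.] on either host endianness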

ggml/include/ggml.h

Lines changed: 12 additions & 4 deletions
@@ -530,6 +530,7 @@ extern "C" {
         GGML_OP_ARANGE,
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
+        GGML_OP_TOP_K,
         GGML_OP_LEAKY_RELU,
         GGML_OP_TRI,
         GGML_OP_FILL,
@@ -2258,18 +2259,25 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_sort_order  order);

-    GGML_API struct ggml_tensor * ggml_arange(
+    // similar to ggml_top_k but implemented as `argsort` + `view`
+    GGML_API struct ggml_tensor * ggml_argsort_top_k(
             struct ggml_context * ctx,
-            float                 start,
-            float                 stop,
-            float                 step);
+            struct ggml_tensor  * a,
+            int                   k);

     // top k elements per row
+    // note: the resulting top k indices are in no particular order
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   k);

+    GGML_API struct ggml_tensor * ggml_arange(
+            struct ggml_context * ctx,
+            float                 start,
+            float                 stop,
+            float                 step);
+
 #define GGML_KQ_MASK_PAD 64

     // q: [n_embd_k, n_batch, n_head, ne3 ]
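The note added to ggml_top_k ("the resulting top k indices are in no particular order") is what separates it from the new ggml_argsort_top_k, which goes through a full argsort plus a view and therefore yields indices ordered by value. A rough numpy analogy of the two contracts, with a made-up row (illustration only, not the ggml implementation):

    import numpy as np

    x = np.array([0.1, 2.5, -1.0, 3.0, 0.7], dtype=np.float32)
    k = 2

    # ggml_top_k-style contract: the k largest are selected, but argpartition
    # gives no ordering guarantee among the returned indices
    top_unordered = np.argpartition(x, -k)[-k:]

    # ggml_argsort_top_k-style contract: full argsort, then a k-wide view,
    # so the indices come out ordered by value
    top_sorted = np.argsort(x)[::-1][:k]

    print(top_unordered)  # {1, 3} in some order
    print(top_sorted)     # [3 1], descending by value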

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 62 additions & 0 deletions
@@ -42,6 +42,7 @@
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
+#include <aclnnop/aclnn_ger.h>
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_grouped_matmul_v3.h>
 #include <aclnnop/aclnn_gt_scalar.h>
@@ -3236,3 +3237,64 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
         GGML_ABORT("Function is not implemented.");
     }
 }
+
+static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];  // weight
+    ggml_tensor * src1 = dst->src[1];  // input
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
+
+    const int64_t dps2 = ne2 / ne02;
+    const int64_t dps3 = ne3 / ne03;
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            const int64_t i02 = i2 / dps2;
+            const int64_t i03 = i3 / dps3;
+
+            const int64_t i12 = i2;
+            const int64_t i13 = i3;
+            acl_tensor_ptr accumulator =
+                ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
+                                        ggml_type_size(dst->type), dst->ne, dst->nb, 2);
+
+            // The outer product needs to be accumulated in this dimension.
+            for (int64_t i1 = 0; i1 < ne11; i1++) {
+                acl_tensor_ptr acl_input = ggml_cann_create_tensor(
+                    (char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
+                    ggml_type_size(src0->type), src1->ne, src1->nb, 1);
+
+                acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
+                    (char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
+                    ggml_type_size(src0->type), src0->ne, src0->nb, 1);
+
+                ggml_cann_pool_alloc output_allocator(ctx.pool());
+                void * output_buffer = output_allocator.alloc(ggml_nbytes(dst));
+                acl_tensor_ptr acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
+                                                                 ggml_type_size(dst->type), dst->ne, dst->nb, 2);
+
+                GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
+                float alpha_value = 1.0f;
+                aclScalar * alpha = aclCreateScalar(&alpha_value, ACL_FLOAT);
+                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha);
+            }
+        }
+    }
+}
+
+void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];
+
+    const enum ggml_type type = src0->type;
+
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_out_prod_fp(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupport type for GGML_OP_OUT_PROD");
+            break;
+    }
+}

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 20 additions & 0 deletions
@@ -1125,3 +1125,23 @@ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, ac
     } while (0)

 #endif  // CANN_ACLNN_OPS
+
+/**
+ * @brief Performs outer product operation on two ggml tensors using the CANN backend.
+ *
+ * @details This function computes the outer product of two input tensors (src0 and src1)
+ * and stores the result in the destination tensor. The outer product operation is defined as:
+ *          dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
+ *
+ * The function supports multiple data types including F32, F16. For floating-point
+ * types, it uses batch matrix multiplication for efficient computation.
+ *
+ * The implementation handles 4D tensor broadcasting and batch processing automatically.
+ *
+ * @param ctx The CANN backend context for operation execution and memory management.
+ * @param dst The destination ggml_tensor where the outer product result will be stored.
+ *            The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
+ *
+ * @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
+ */
+void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);
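The formula in the @details block, dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l]), is what the Ger + InplaceAdd loop in aclnn_ops.cpp accumulates: for each (k, l) batch slice, the result is a sum of rank-1 outer products over the shared dimension. A small numpy reference for a single 2D slice, with made-up sizes, just to pin down the indexing (not the CANN code path):

    import numpy as np

    rng = np.random.default_rng(0)
    n_i, n_j, n_m = 4, 3, 5  # made-up sizes for one (k, l) slice
    src0 = rng.standard_normal((n_i, n_m)).astype(np.float32)  # "weight"
    src1 = rng.standard_normal((n_j, n_m)).astype(np.float32)  # "input"

    # direct evaluation of dst[i, j] = sum_m src0[i, m] * src1[j, m]
    dst_direct = np.einsum('im,jm->ij', src0, src1)

    # the same result as an accumulation of outer products, one per m,
    # mirroring the per-column Ger + InplaceAdd accumulation on the CANN side
    dst_accum = np.zeros((n_i, n_j), dtype=np.float32)
    for m in range(n_m):
        dst_accum += np.outer(src0[:, m], src1[:, m])

    assert np.allclose(dst_direct, dst_accum, atol=1e-5)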

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 13 additions & 0 deletions
@@ -1886,6 +1886,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
         case GGML_OP_FLASH_ATTN_EXT:
             ggml_cann_flash_attn_ext(ctx, dst);
             break;
+        case GGML_OP_OUT_PROD:
+            ggml_cann_out_prod(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2563,6 +2566,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_COUNT_EQUAL:
             return true;
+        case GGML_OP_OUT_PROD:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                        return true;
+                    default:
+                        return false;
+                }
+            }
         case GGML_OP_CONV_TRANSPOSE_1D:
             // TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
             return (op->src[0]->ne[0] - 1) <= 255;

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 9 additions & 0 deletions
@@ -1927,6 +1927,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_argsort(params, tensor);
             } break;
+        case GGML_OP_TOP_K:
+            {
+                ggml_compute_forward_top_k(params, tensor);
+            } break;
         case GGML_OP_LEAKY_RELU:
             {
                 ggml_compute_forward_leaky_relu(params, tensor);
@@ -2311,6 +2315,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
+        case GGML_OP_TOP_K:
         case GGML_OP_FLASH_ATTN_EXT:
         case GGML_OP_FLASH_ATTN_BACK:
         case GGML_OP_SSM_CONV:
@@ -2834,6 +2839,10 @@ struct ggml_cplan ggml_graph_plan(
                     cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                     cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
                 } break;
+            case GGML_OP_TOP_K:
+                {
+                    cur += sizeof(int32_t)*node->src[0]->ne[0]*n_tasks;
+                } break;
            case GGML_OP_FLASH_ATTN_EXT:
                {
                    const int64_t ne10 = node->src[1]->ne[0]; // DK
