Commit 98f7290

Merge pull request #42 from menloresearch/update-dev-from-master-2025-04-04-00-08
Sync master with upstream release b5043
2 parents 6e30a6c + c262bed commit 98f7290

File tree

20 files changed: +337 -222 lines changed

ci/run.sh

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
     export ONEAPI_DEVICE_SELECTOR="level_zero:0"
     # Enable sysman for correct memory reporting
     export ZES_ENABLE_SYSMAN=1
+    # to circumvent precision issues on CPY operations
+    export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi

common/minja/minja.hpp

Lines changed: 12 additions & 8 deletions
@@ -2606,14 +2606,18 @@ inline std::shared_ptr<Context> Context::builtins() {
     auto & text = args.at("text");
     return text.is_null() ? text : Value(strip(text.get<std::string>()));
   }));
-  globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
-    auto text = args.at("text");
-    if (text.is_null()) return text;
-    std::string res;
-    auto str = text.get<std::string>();
-    std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
-    return Value(res);
-  }));
+  auto char_transform_function = [](const std::string & name, const std::function<char(char)> & fn) {
+    return simple_function(name, { "text" }, [=](const std::shared_ptr<Context> &, Value & args) {
+      auto text = args.at("text");
+      if (text.is_null()) return text;
+      std::string res;
+      auto str = text.get<std::string>();
+      std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
+      return Value(res);
+    });
+  };
+  globals.set("lower", char_transform_function("lower", ::tolower));
+  globals.set("upper", char_transform_function("upper", ::toupper));
   globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
     args.expectArgs("default", {2, 3}, {0, 1});
     auto & value = args.args[0];
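
The refactor above replaces the dedicated `lower` filter with a small factory that builds per-character transform filters, so `upper` can reuse the same body. As a rough standalone sketch of the same pattern in plain C++ (standard library only, not minja's `simple_function`/`Value` types):

#include <algorithm>
#include <cctype>
#include <functional>
#include <iostream>
#include <iterator>
#include <string>

int main() {
    // Factory that turns a per-character transform into a string filter,
    // mirroring char_transform_function in the diff above.
    auto make_char_filter = [](const std::function<char(char)> & fn) {
        return [fn](const std::string & in) {
            std::string out;
            std::transform(in.begin(), in.end(), std::back_inserter(out), fn);
            return out;
        };
    };

    auto lower = make_char_filter([](char c) { return (char) std::tolower((unsigned char) c); });
    auto upper = make_char_filter([](char c) { return (char) std::toupper((unsigned char) c); });

    std::cout << lower("Hello") << " " << upper("Hello") << "\n";  // prints: hello HELLO
    return 0;
}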

docs/backend/SYCL.md

Lines changed: 7 additions & 0 deletions
@@ -302,6 +302,10 @@ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -
 cmake --build build --config Release -j -v
 ```
 
+It is possible to come across some precision issues when running tests that stem from using faster
+instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
+as `-cl-fp32-correctly-rounded-divide-sqrt`
+
 #### Nvidia GPU
 
 The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
@@ -322,6 +326,9 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=
 cmake --build build --config Release -j -v
 ```
 
+It is possible to come across some precision issues when running tests that stem from using faster
+instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
+
 #### AMD GPU
 
 The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.

ggml/src/ggml-cann/acl_tensor.cpp

Lines changed: 6 additions & 4 deletions
@@ -54,9 +54,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     // added.
     int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
 
-    int64_t acl_storage_len = 0;
     if (ne == nullptr) {
-        acl_storage_len = ggml_nbytes(tensor);
         for (int i = 0; i < GGML_MAX_DIMS; i++) {
             acl_ne[i] = tensor->ne[i];
             // The step size of acl is in elements.
@@ -65,14 +63,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     } else {
         // With bcast
         for (int i = 0; i < dims; i++) {
-            acl_storage_len += (ne[i] - 1) * nb[i];
             acl_ne[i] = ne[i];
             acl_stride[i] = nb[i] / ggml_element_size(tensor);
         }
     }
 
-    // Reverse ne and stride.
     int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t acl_storage_len = 1;
+    for (int i = 0; i < final_dims; i++) {
+        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
+    }
+
+    // Reverse ne and stride.
     std::reverse(acl_ne, acl_ne + final_dims);
     std::reverse(acl_stride, acl_stride + final_dims);

ggml/src/ggml-cann/acl_tensor.h

Lines changed: 5 additions & 5 deletions
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
         tmp_stride[i] = nb[i] / type_size;
     }
 
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
+    int64_t acl_storage_len = 1;
     for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
+        acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
     }
 
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
     aclTensor* acl_tensor =
         aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
                         format, &acl_storage_len, 1, data_ptr);
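
Both acl_tensor changes fix the same issue: the required storage length was previously seeded with 0 and accumulated in bytes via `nb`, while ACL expects an element count; for a strided view the last element sits at offset sum_i (ne[i] - 1) * stride[i], so the storage must hold that offset plus one element. A minimal standalone sketch of the computation, using a made-up dense 2x3 view rather than ggml's real tensors:

#include <cstdint>
#include <iostream>

int main() {
    // Hypothetical 2x3 view with strides given in elements (not bytes).
    const int     dims     = 2;
    const int64_t ne[]     = {3, 2};   // extent per dimension
    const int64_t stride[] = {1, 3};   // element stride per dimension (row-major)

    // Offset of the last element is sum_i (ne[i] - 1) * stride[i];
    // the backing storage must hold that offset + 1 elements.
    int64_t storage_len = 1;
    for (int i = 0; i < dims; i++) {
        storage_len += (ne[i] - 1) * stride[i];
    }

    std::cout << "storage_len = " << storage_len << "\n";  // prints 6 for a dense 2x3 view
    return 0;
}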

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 45 additions & 5 deletions
@@ -51,6 +51,7 @@
 #include <aclnnop/aclnn_triu.h>
 #include <aclnnop/aclnn_upsample_nearest_2d.h>
 #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
+#include <aclnnop/aclnn_argmax.h>
 #include <float.h>
 
 #include <cmath>
@@ -358,8 +359,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
 void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     float min;
     float max;
@@ -1090,8 +1089,6 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    GGML_ASSERT(eps > 0.0f);
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -3152,7 +3149,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0]; // input
-    ggml_tensor* src2 = dst->src[2]; // freq_factors
+    // ggml_tensor* src2 = dst->src[2]; // freq_factors, not used now.
 
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -3444,3 +3441,46 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
+
+
+void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnArgMaxGetWorkspaceSize(acl_src, 3, false, acl_dst,
+                                          &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+    ACL_CHECK(aclnnArgMax(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclnn_cos(ctx, acl_src, acl_dst);
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclnn_sin(ctx, acl_src, acl_dst);
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
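
The new `ggml_cann_argmax` follows the two-phase call shape used throughout this file: query the operator's workspace size, allocate scratch from the backend pool only if it is non-zero, launch on the context's stream, then destroy the tensor handles. A generic sketch of that shape with placeholder types (hypothetical names, not the real ACL API):

#include <cstdint>
#include <vector>

// Placeholder stand-ins for the ACL executor and stream handles (hypothetical).
struct Executor {};
struct Stream {};

// Phase 1: the operator reports how much scratch memory it needs.
void op_get_workspace_size(uint64_t * workspace_size, Executor ** executor) {
    *workspace_size = 1024;  // pretend the op needs 1 KiB of scratch
    static Executor e;
    *executor = &e;
}

// Phase 2: the operator runs with the (possibly empty) workspace.
void op_execute(void * workspace, uint64_t workspace_size, Executor *, Stream *) {
    (void) workspace;
    (void) workspace_size;
}

int main() {
    uint64_t workspace_size = 0;
    Executor * executor = nullptr;
    op_get_workspace_size(&workspace_size, &executor);

    // Allocate scratch only when the op actually needs it,
    // mirroring the ggml_cann_pool_alloc branch in the diff above.
    std::vector<char> workspace;
    void * workspace_addr = nullptr;
    if (workspace_size > 0) {
        workspace.resize(workspace_size);
        workspace_addr = workspace.data();
    }

    Stream stream;
    op_execute(workspace_addr, workspace_size, executor, &stream);
    return 0;
}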

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 41 additions & 6 deletions
@@ -484,6 +484,47 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+/**
+ * @brief Computes the index of the maximum value along the specified dimension
+ *        of a ggml tensor using the CANN backend.
+ *
+ * @details This function performs an argmax operation on the input tensor.
+ *          It finds the index of the maximum value along the specified axis
+ *          and stores these indices in the destination tensor `dst`. The
+ *          operation is executed using the CANN backend for optimized performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the indices of the maximum values will be stored.
+ *            dst->op is `GGML_OP_ARGMAX`.
+ */
+void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Computes the cosine of each element in a ggml tensor using the CANN backend.
+ *
+ * @details This function applies the cosine function element-wise to the input tensor.
+ *          The computed cosine values are stored in the destination tensor `dst`.
+ *          The operation is optimized using the CANN backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the cosine values will be stored.
+ *            dst->op is `GGML_OP_COS`.
+ */
+void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Computes the sine of each element in a ggml tensor using the CANN backend.
+ *
+ * @details This function applies the sine function element-wise to the input tensor.
+ *          The computed sine values are stored in the destination tensor `dst`.
+ *          The operation is optimized using the CANN backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the sine values will be stored.
+ *            dst->op is `GGML_OP_SIN`.
+ */
+void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
                                        aclTensor*, uint64_t*, aclOpExecutor**),
           aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
@@ -535,9 +576,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
 
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
     aclTensor* acl_src = ggml_cann_create_tensor(src);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
@@ -566,9 +604,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
 
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
     aclTensor* acl_src = ggml_cann_create_tensor(src);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 30 additions & 10 deletions
@@ -1420,6 +1420,15 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_ARGSORT:
             ggml_cann_argsort(ctx, dst);
             break;
+        case GGML_OP_ARGMAX:
+            ggml_cann_argmax(ctx, dst);
+            break;
+        case GGML_OP_COS:
+            ggml_cann_cos(ctx, dst);
+            break;
+        case GGML_OP_SIN:
+            ggml_cann_sin(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -1458,11 +1467,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
     ACL_CHECK(aclrtSynchronizeDevice());
     ACL_CHECK(aclrtResetDevice(cann_ctx->device));
 
-    // finalize when last backend freed.
-    if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
-        ACL_CHECK(aclFinalize());
-    }
-
     delete cann_ctx;
     delete backend;
 }
@@ -1688,11 +1692,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
-                case GGML_TYPE_Q8_0:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q4_0:
                     return true;
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
+                    // only support contiguous for quantized types.
+                    return ggml_is_contiguous(op->src[0]) &&
+                           ggml_is_contiguous(op->src[1]);
                 default:
                     return false;
             }
@@ -1738,13 +1745,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_ROPE: {
             // TODO: with ops-test v == 1
-            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float ext_factor = 0.0f;
+            memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
             // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;
             }
             // TODO: ext_factor != 0
-            if (*ext_factor != 0) {
+            if (ext_factor != 0) {
                 return false;
             }
 
@@ -1766,6 +1774,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
             return true;
         }
+        case GGML_OP_POOL_2D: {
+            const int32_t * opts = (const int32_t *) op->op_params;
+            const int k0 = opts[1];
+            const int k1 = opts[2];
+            const int p0 = opts[5];
+            const int p1 = opts[6];
+            // value of paddingH should be at most half of kernelH
+            // value of paddingW should be at most half of kernelW
+            return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
+        }
         case GGML_OP_DUP:
         case GGML_OP_IM2COL:
         case GGML_OP_CONCAT:
@@ -1785,7 +1803,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_CLAMP:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
@@ -1794,6 +1811,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
+       case GGML_OP_ARGMAX:
+       case GGML_OP_COS:
+       case GGML_OP_SIN:
            return true;
        default:
            return false;
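
The ROPE and POOL_2D checks above both read operator options out of `op_params`, a raw int32 array: integer options are read directly, while the float `ext_factor` stored in slot 7 is now copied out with `memcpy` instead of being dereferenced through a casted pointer, avoiding a type-punned read. A small standalone sketch of the same reading pattern, using a hypothetical parameter layout rather than ggml's actual one:

#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
    // Hypothetical op_params blob: slots 0-6 hold ints, slot 7 holds a float's bits.
    int32_t op_params[16] = {};
    op_params[1] = 3;   // e.g. a kernel size
    op_params[5] = 1;   // e.g. a padding value
    float ext_factor_in = 0.25f;
    memcpy(&op_params[7], &ext_factor_in, sizeof(float));  // store the float's bits in an int slot

    // Integer options can be read directly from the array.
    const int k0 = op_params[1];
    const int p0 = op_params[5];

    // Float options are copied out with memcpy rather than via a (float *) cast.
    float ext_factor = 0.0f;
    memcpy(&ext_factor, &op_params[7], sizeof(float));

    std::cout << "k0=" << k0 << " p0=" << p0 << " ext_factor=" << ext_factor << "\n";
    return 0;
}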

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 1 deletion
@@ -729,7 +729,13 @@ struct ggml_cuda_graph {
     bool disable_due_to_failed_graph_capture = false;
     int number_consecutive_updates = 0;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
-    std::vector<char **> updated_kernel_arg;
+    bool use_cpy_indirection = false;
+    std::vector<char *> cpy_dest_ptrs;
+    char ** dest_ptrs_d;
+    int dest_ptrs_size = 0;
+    // Index to allow each cpy kernel to be aware of it's position within the graph
+    // relative to other cpy nodes.
+    int graph_cpynode_index = -1;
 #endif
 };
