Merged
Changes from all commits · 80 commits
b4b7e15
CUDA: don't convert BF16 weights to FP32 (ggml/1174)
CISC Apr 4, 2025
53506af
ggml : simplify Arm fp16 CPU logic (ggml/1177)
ggerganov Apr 7, 2025
614c087
llama : add option to override model tensor buffers (llama/11397)
slaren Apr 2, 2025
8a6981a
Vulkan: Fix mmq int dot float cache size (llama/12722)
0cc4m Apr 2, 2025
ab9d608
vulkan: Implement grouped query attention in the coopmat2 FA shader (…
jeffbolznv Apr 2, 2025
3502755
cmake: remove caching from vulkan coopmat checks (llama/12719)
bandoti Apr 2, 2025
2f935a8
vulkan: Implement split_k for coopmat2 flash attention. (llama/12627)
jeffbolznv Apr 2, 2025
953a903
opencl: use `max_alloc_size` in backend ctx instead of querying again…
lhez Apr 3, 2025
ec21d02
CANN: Fix failed test cases (llama/12708)
hipudding Apr 3, 2025
2d0b11b
Simplify and improve CUDA graphs through use of indirect copy pointer…
agray3 Apr 3, 2025
ade66e6
CANN: Support operator SIN COS ARGMAX (llama/12709)
noemotiovon Apr 3, 2025
d52ab9b
fix MUSA compiler warning (llama/12704)
A3shTnT Apr 3, 2025
96a8cc5
vulkan: Fix missing cmake logic for dot product extension (llama/12721)
jeffbolznv Apr 3, 2025
f88fe40
CUDA: Prefer vector flash decoding kernel for Gemma models (llama/12738)
gaugarg-nv Apr 3, 2025
4773f07
vulkan: set cmake minimum and project name in vulkan-shaders (llama/1…
jeffbolznv Apr 4, 2025
bceee71
vulkan: Hybrid waitForFences/getFenceStatus to reduce fence latency (…
jeffbolznv Apr 4, 2025
7cff350
cmake: fix ggml-shaders-gen compiler paths containing spaces (llama/1…
hydroo Apr 4, 2025
cb85299
sycl: allow ggml-sycl configuration and compilation using Visual Stud…
s-Nick Apr 4, 2025
c5b7ca9
Vulkan: Tune Vulkan mmq int dot shader for performance (llama/12767)
0cc4m Apr 5, 2025
1426dfd
vulkan: Use unclamped loads for flash attention mask (llama/12720)
jeffbolznv Apr 6, 2025
bf9d152
vulkan: fix NaN issue in flash attention shader (llama/12776)
jeffbolznv Apr 6, 2025
2653196
musa: fix compilation warnings in mp_22/31 (llama/12780)
yeahdongcn Apr 6, 2025
1751e80
CANN: Refactor to reduce duplicate code (llama/12731)
hipudding Apr 7, 2025
9dcd7ff
CANN: fix typo in ggml-cann (llama/12733)
jeffzhou2000 Apr 7, 2025
b1306b9
sycl: remove redundant memcopy in function ggml_backend_sycl_buffer_s…
jeffzhou2000 Apr 7, 2025
981432f
cuda : fix HIP and MUSA BF16 (llama/0)
ggerganov Apr 7, 2025
db561f2
opencl: better identify Adreno GPU (llama/12760)
lhez Apr 7, 2025
72ff196
Revert "sycl:remove redundant memcopy in function ggml_backend_sycl_b…
NeoZhangJianyu Apr 8, 2025
7574918
ggml : add more generic custom op, remove deprecated custom ops (ggml…
slaren Apr 9, 2025
3c4f20e
ggml : add bilinear upscale support (ggml/1185)
slaren Apr 9, 2025
8c36c7b
ggml: don't include arm_neon.h when using CUDA 12 with ARM Neon (ggml…
cmdr2 Apr 10, 2025
1f36f27
llama : fix FA when KV cache is not used (i.e. embeddings) (llama/12825)
ggerganov Apr 8, 2025
f96f36f
cuda : add f32 to bf16 copy op (llama/12806)
CISC Apr 8, 2025
d25499b
vulkan: Use fp16 for the flash attention P*V multiplication (llama/12…
jeffbolznv Apr 9, 2025
0f556ed
vulkan: In coopmat2 mmq, load q4_k/q5_k scales through shared memory …
jeffbolznv Apr 9, 2025
147541f
CANN: Support Opt CONV_TRANSPOSE_1D and ELU (llama/12786)
noemotiovon Apr 9, 2025
55e48df
ggml-impl.h: fix build on POWER9 (llama/12855)
pkubaj Apr 9, 2025
27af59e
ggml-cpu-impl.h: do not redefine bool on POWER9 (llama/12856)
pkubaj Apr 9, 2025
2b846dd
Fixes #12823 (llama/12830)
mehendarkarprajwal Apr 9, 2025
38a9901
CANN: Support more ops (llama/12841)
noemotiovon Apr 10, 2025
19575cb
cpu: fix cpu backend's supports-op for GET_ROWS_BACK. fixes a fatal w…
cmdr2 Apr 11, 2025
9f66d67
ggml: fix compilation error s390x (llama/12848)
taronaeo Apr 11, 2025
4eb5294
SYCL: Add fp16 type support to unary op kernels (llama/12788)
qnixsynapse Apr 11, 2025
963470f
sycl: Support sycl_ext_oneapi_limited_graph (llama/12873)
EwanC Apr 11, 2025
e8fe286
vulkan: use aligned loads for flash attention mask (llama/12853)
jeffbolznv Apr 12, 2025
9765632
ggml: disable CUDA graphs for unsupported DUP and CONT node types (ll…
agray3 Apr 13, 2025
31b3ada
ggml: use _mm[512/256]_dpbusd[_avx]_epi32 to directly accumulate into…
SongXiaoXi Apr 14, 2025
319af9f
ggml : Depthwise 2D convolution (ggml/1152)
Acly Apr 17, 2025
00cb979
rpc : use ggml_context_ptr (llama/12938)
rgerganov Apr 14, 2025
d3713ee
SYCL: Fix im2col (llama/12910)
qnixsynapse Apr 14, 2025
378c551
CANN: Optimize CANN buffer pool memory management (llama/12875)
bachelor-dou Apr 15, 2025
79d7842
CANN: Opt ROPE optimization (llama/12865)
noemotiovon Apr 15, 2025
b344957
ggml : Add AVX512 implementation of GEMM - Q4_Kx8 (llama/12829)
Srihari-mcw Apr 15, 2025
6f2a40f
SYCL: Add ROPE vision kernel (llama/12887)
qnixsynapse Apr 15, 2025
752162b
CUDA/HIP: Share the same unified memory allocation logic. (llama/12934)
hjc4869 Apr 15, 2025
6a2273b
CANN: Add x86 build ci (llama/12950)
hipudding Apr 15, 2025
342a49f
metal : add FA-vec kernels for head size 96 (llama/12952)
ggerganov Apr 15, 2025
505e57d
CANN: Add 310P operator support check (llama/12962)
noemotiovon Apr 16, 2025
062375e
vulkan: enable coopmat2 FA gqa and split_k optimizations more often (…
jeffbolznv Apr 16, 2025
fa69e39
opencl: fix incorrect local_size index in profiling log (llama/12868)
kimminsu38oo Apr 16, 2025
c941896
CANN: Add support for async operator submission (llama/12864)
hipudding Apr 17, 2025
9a2ac85
ggml: Re-enable CUDA graphs in presence of CONT and DUP nodes (llama/…
agray3 Apr 17, 2025
056d903
graph : make FA compatible with MLA + add initial Metal kernels (llam…
ggerganov Apr 17, 2025
4a7775c
rpc : add RPC_CMD_HELLO (llama/12955)
rgerganov Apr 18, 2025
ed18c7a
SYCL: Refactor and enable FP16 in binary broadcast OPs (llama/12975)
qnixsynapse Apr 18, 2025
f7eb6eb
metal: add neg operator (llama/13029)
jmorganca Apr 20, 2025
308239d
vulkan: support noncontiguous rms_norm (llama/13031)
jeffbolznv Apr 20, 2025
0612ed6
SYCL: Add non-contiguous support in ROPE (llama/12993)
qnixsynapse Apr 21, 2025
6d63a73
ggml : add SSE 4.2 and x64 base variant for CPUs without AVX (llama/1…
slaren Apr 21, 2025
6e92d2e
CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID (llama/13014)
JohannesGaessler Apr 22, 2025
2299308
vulkan: matmul gcn tuning (llama/13016)
netrunnereve Apr 24, 2025
3bd21ed
metal : fix floating-point range of attention scores in FA kernels (l…
ggerganov Apr 24, 2025
de20f45
CUDA: use switch statements in constexpr functions (llama/13095)
JohannesGaessler Apr 24, 2025
fd137aa
ggml : fix trailing whitespaces (llama/0)
ggerganov Apr 24, 2025
6f9d3e6
opencl: split ggml-opencl.cl into multiple files and cleanup (llama/1…
lhez Apr 24, 2025
775402b
sync : ggml
ggerganov Apr 24, 2025
fc70647
opencl : remove obsolete files (skip) (ggml/1200)
ggerganov Apr 24, 2025
00fb660
sync : ggml
ggerganov Apr 24, 2025
4ae6aa3
cuda : fix unused variable compile warning (#0)
ggerganov Apr 24, 2025
475e93e
ruby : add cmake option (#0)
ggerganov Apr 24, 2025
1 change: 1 addition & 0 deletions bindings/ruby/ext/options.rb
@@ -142,6 +142,7 @@ def configure
bool "GGML_RV_ZFH"
pending "GGML_SCCACHE_FOUND"
string "GGML_SCHED_MAX_COPIES"
bool "GGML_SSE42"
ignored "GGML_STATIC"
bool "GGML_SYCL"
string "GGML_SYCL_DEVICE_ARCH"
2 changes: 1 addition & 1 deletion ggml/CMakeLists.txt
@@ -107,6 +107,7 @@ message(DEBUG "INS_ENB : ${INS_ENB}")
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
@@ -170,7 +171,6 @@ option(GGML_HIP "ggml: use HIP"
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
GGML_HIP_UMA is removed in this sync; HIP unified-memory handling is consolidated into the shared CUDA/HIP allocation logic from llama/12934.
3 changes: 3 additions & 0 deletions ggml/include/ggml-rpc.h
@@ -7,6 +7,9 @@
extern "C" {
#endif

#define RPC_PROTO_MAJOR_VERSION 1
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16

// backend API
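These version macros pair with the RPC_CMD_HELLO handshake added in llama/12955. A minimal sketch of how a client might gate on them (hypothetical helper, not part of the ggml API; the actual handshake lives in ggml-rpc.cpp):

#include <stdbool.h>
#include <stdint.h>
#include "ggml-rpc.h"

// Hypothetical compatibility check: require an exact major match and assume
// minor/patch bumps on the server side stay backward compatible.
static bool rpc_proto_compatible(uint32_t major, uint32_t minor, uint32_t patch) {
    (void) patch; // patch-level differences assumed compatible
    return major == RPC_PROTO_MAJOR_VERSION && minor >= RPC_PROTO_MINOR_VERSION;
}
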
151 changes: 59 additions & 92 deletions ggml/include/ggml.h
@@ -481,6 +481,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@@ -507,17 +508,12 @@ extern "C" {

GGML_OP_UNARY,

GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY,

GGML_OP_MAP_CUSTOM1_F32,
GGML_OP_MAP_CUSTOM2_F32,
GGML_OP_MAP_CUSTOM3_F32,

GGML_OP_MAP_CUSTOM1,
GGML_OP_MAP_CUSTOM2,
GGML_OP_MAP_CUSTOM3,

GGML_OP_CUSTOM,

GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW,
@@ -682,6 +678,9 @@ extern "C" {
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);

GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -1665,7 +1664,7 @@
struct ggml_tensor * a,
struct ggml_tensor * b);

// depthwise
// depthwise (via im2col and mul_mat)
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
@@ -1677,6 +1676,22 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1

// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);

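A minimal usage sketch for the direct kernel, assuming F32 tensors and a prepared ggml_context; shapes follow the comment above, and the helper name is illustrative. Note that not all backends implement GGML_OP_CONV_2D_DW, so support should be checked before building the graph.

#include "ggml.h"

// Depthwise 3x3 convolution over a 64x64, 16-channel, batch-1 input with
// stride 1, padding 1, dilation 1 -> output stays 64x64x16x1.
static struct ggml_tensor * dw3x3_same(struct ggml_context * ctx) {
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, 16);   // KW KH 1 C
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 16, 1); // W  H  C N
    return ggml_conv_2d_dw_direct(ctx, kernel, input,
                                  /*stride*/ 1, 1, /*pad*/ 1, 1, /*dilation*/ 1, 1);
}
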
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -1722,24 +1737,29 @@
float p0,
float p1);

// nearest interpolate
enum ggml_scale_mode {
GGML_SCALE_MODE_NEAREST = 0,
GGML_SCALE_MODE_BILINEAR = 1,
};

// interpolate
// multiplies ne0 and ne1 by scale factor
// used in stable-diffusion
GGML_API struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor);
int scale_factor,
enum ggml_scale_mode mode);

// nearest interpolate
// nearest interpolate to specified dimensions
// used in tortoise.cpp
// interpolate
// interpolate scale to specified dimensions
GGML_API struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3);
int ne3,
enum ggml_scale_mode mode);

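Existing call sites gain a mode argument; a minimal sketch of both entry points, assuming an F32 image tensor in W H C N order (helper names are illustrative):

#include "ggml.h"

// Double ne0/ne1 with the new bilinear filter (ggml/1185); passing
// GGML_SCALE_MODE_NEAREST reproduces the old behavior.
static struct ggml_tensor * upscale2x(struct ggml_context * ctx, struct ggml_tensor * img) {
    return ggml_upscale(ctx, img, 2, GGML_SCALE_MODE_BILINEAR);
}

// Resize to an explicit 224x224 while keeping the channel/batch dims.
static struct ggml_tensor * resize224(struct ggml_context * ctx, struct ggml_tensor * img) {
    return ggml_upscale_ext(ctx, img, 224, 224, (int) img->ne[2], (int) img->ne[3],
                            GGML_SCALE_MODE_BILINEAR);
}
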
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
GGML_API struct ggml_tensor * ggml_pad(
@@ -1916,83 +1936,6 @@

// custom operators

typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_unary_op_f32_t fun),
"use ggml_map_custom1 instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_unary_op_f32_t fun),
"use ggml_map_custom1_inplace instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_binary_op_f32_t fun),
"use ggml_map_custom2 instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_binary_op_f32_t fun),
"use ggml_map_custom2_inplace instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_custom1_op_f32_t fun),
"use ggml_map_custom1 instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_custom1_op_f32_t fun),
"use ggml_map_custom1_inplace instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_custom2_op_f32_t fun),
"use ggml_map_custom2 instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_custom2_op_f32_t fun),
"use ggml_map_custom2_inplace instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
ggml_custom3_op_f32_t fun),
"use ggml_map_custom3 instead");

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
ggml_custom3_op_f32_t fun),
"use ggml_map_custom3_inplace instead");

// custom operators v2

typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
@@ -2048,6 +1991,30 @@ extern "C" {
int n_tasks,
void * userdata);

typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);

GGML_API struct ggml_tensor * ggml_custom_4d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3,
struct ggml_tensor ** args,
int n_args,
ggml_custom_op_t fun,
int n_tasks,
void * userdata);

GGML_API struct ggml_tensor * ggml_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor ** args,
int n_args,
ggml_custom_op_t fun,
int n_tasks,
void * userdata);

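A minimal sketch of the new generic interface that replaces the deprecated ggml_map_* family; it assumes the source tensors are reachable as dst->src[i] (as with other ggml ops) and that both tensors are contiguous F32. Helper names are illustrative.

#include "ggml.h"

// Callback: dst = factor * src0, with the element range split across nth threads.
static void scale_cb(struct ggml_tensor * dst, int ith, int nth, void * userdata) {
    const struct ggml_tensor * src = dst->src[0];
    const float factor = *(const float *) userdata;
    const int64_t n   = ggml_nelements(dst);
    const int64_t per = (n + nth - 1) / nth;
    const int64_t i0  = per * ith;
    const int64_t i1  = i0 + per < n ? i0 + per : n;
    const float * x = (const float *) src->data;
    float       * y = (float *)       dst->data;
    for (int64_t i = i0; i < i1; i++) {
        y[i] = factor * x[i];
    }
}

// Build a custom node with the same shape as `a`; `factor` must stay alive
// until the graph is computed.
static struct ggml_tensor * scale_custom(struct ggml_context * ctx, struct ggml_tensor * a, const float * factor) {
    struct ggml_tensor * args[1] = { a };
    return ggml_custom_4d(ctx, GGML_TYPE_F32,
                          a->ne[0], a->ne[1], a->ne[2], a->ne[3],
                          args, 1, scale_cb, GGML_N_TASKS_MAX, (void *) factor);
}
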
// loss function

GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
15 changes: 9 additions & 6 deletions ggml/src/CMakeLists.txt
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
@@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
if (NOT MSVC)
# MSVC doesn't support AMX
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
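With GGML_BACKEND_DL each variant above becomes its own loadable CPU backend, and the best one the host supports is chosen at runtime; the new x64 and sse42 entries give pre-AVX machines a baseline. A rough illustration of the selection idea (GCC/Clang builtins; not the actual ggml loader, which scores variants itself):

#include <stdio.h>

int main(void) {
    __builtin_cpu_init(); // initialize CPU feature detection (GCC/Clang, x86)
    const char * variant = "x64"; // baseline: no SSE 4.2 required
    if (__builtin_cpu_supports("sse4.2"))  variant = "sse42";
    if (__builtin_cpu_supports("avx"))     variant = "sandybridge";
    if (__builtin_cpu_supports("avx2"))    variant = "haswell";
    if (__builtin_cpu_supports("avx512f")) variant = "skylakex";
    printf("would load ggml-cpu-%s\n", variant);
    return 0;
}
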
12 changes: 8 additions & 4 deletions ggml/src/ggml-cann/acl_tensor.cpp
@@ -41,6 +41,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
return ACL_INT4;
case GGML_TYPE_Q8_0:
return ACL_INT8;
case GGML_TYPE_I64:
return ACL_INT64;
default:
return ACL_DT_UNDEFINED;
}
@@ -54,9 +56,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
// added.
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];

int64_t acl_storage_len = 0;
if (ne == nullptr) {
acl_storage_len = ggml_nbytes(tensor);
for (int i = 0; i < GGML_MAX_DIMS; i++) {
acl_ne[i] = tensor->ne[i];
// The step size of acl is in elements.
@@ -65,14 +65,18 @@
} else {
// With bcast
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
acl_ne[i] = ne[i];
acl_stride[i] = nb[i] / ggml_element_size(tensor);
}
}

// Reverse ne and stride.
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
int64_t acl_storage_len = 1;
for (int i = 0; i < final_dims; i++) {
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
}

// Reverse ne and stride.
std::reverse(acl_ne, acl_ne + final_dims);
std::reverse(acl_stride, acl_stride + final_dims);

10 changes: 5 additions & 5 deletions ggml/src/ggml-cann/acl_tensor.h
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
tmp_stride[i] = nb[i] / type_size;
}

std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);

int64_t acl_storage_len = 0;
int64_t acl_storage_len = 1;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
}

std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);

aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);
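Both hunks fix the same storage-length computation: the length of a (possibly broadcast) view is the offset of its last reachable element plus one, accumulated from extents and element strides rather than byte strides, and summed before the dimension order is reversed. A standalone sketch of the formula with a worked example (not CANN code):

#include <stdint.h>
#include <stdio.h>

// Elements spanned by a view: 1 (for the element at offset 0) plus the
// offset of the last element, i.e. the sum over dims of (ne[i]-1)*stride[i].
static int64_t storage_len_elems(const int64_t * ne, const int64_t * stride, int dims) {
    int64_t len = 1;
    for (int i = 0; i < dims; i++) {
        len += (ne[i] - 1) * stride[i];
    }
    return len;
}

int main(void) {
    // contiguous 3x2 tensor (ggml order: ne[0] innermost): strides {1, 3}
    // -> 1 + (3-1)*1 + (2-1)*3 = 6 elements
    const int64_t ne[2]     = {3, 2};
    const int64_t stride[2] = {1, 3};
    printf("%lld\n", (long long) storage_len_elems(ne, stride, 2));
    return 0;
}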