@@ -1257,12 +1257,20 @@ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
 
 void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                aclTensor* acl_dst) {
-    GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
+    if (acl_dst == nullptr) {
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCos, acl_src);
+    } else {
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
+    }
 }
 
 void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
               aclTensor* acl_dst) {
-    GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
+    if (acl_dst == nullptr) {
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSin, acl_src);
+    } else {
+        GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
+    }
 }
 
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
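Note: with this hunk, aclnn_cos/aclnn_sin treat a null destination as a request for the in-place aclnn operator. A minimal caller sketch (illustrative only, reusing the signatures from the hunk above; the tensor names are placeholders):

    // out-of-place: acl_dst <- cos(acl_src)
    aclnn_cos(ctx, acl_src, acl_dst);
    // in-place: acl_src <- cos(acl_src), dispatched to InplaceCos
    aclnn_cos(ctx, acl_src, nullptr);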
@@ -2221,13 +2229,54 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
     ggml_cann_release_resources(ctx, acl_index, acl_value);
 }
 
+/**
+ * @brief Initializes and caches sine/cosine positional encoding values
+ *        (used in RoPE, Rotary Position Embedding) for attention layers.
+ *
+ * This function computes and caches the sin/cos values of
+ * θ = position * theta_scale for RoPE encoding. The cache is shared
+ * across attention layers, and only the first attention layer will
+ * trigger initialization. The cache includes repeated sin/cos values
+ * with different repeat methods depending on the @param is_neox flag.
+ *
+ * Steps performed by this function:
+ * 1. Identify whether the target tensor belongs to Q/K in attention
+ *    and restrict computation to the first layer only.
+ * 2. Initialize the theta scale array (arange → power → freq scaling).
+ * 3. Allocate sin/cos caches if the max prompt length increases.
+ * 4. Compute θ = position * theta_scale.
+ * 5. Compute sin(θ), cos(θ) and optionally scale by attn_factor.
+ * 6. Expand sin/cos values by repeat or repeat_interleave depending
+ *    on whether @param is_neox is enabled.
+ * 7. Store the computed values into persistent buffers
+ *    (ctx.rope_sin_ptr / ctx.rope_cos_ptr).
+ *
+ * @param ctx          The CANN backend context, holding memory pool,
+ *                     stream, and persistent buffers for rope init/cache.
+ * @param dst          The destination ggml_tensor whose computation
+ *                     depends on the cached RoPE values (usually Qcur/Kcur).
+ * @param theta_scale  Scalar exponent base for computing theta scale values.
+ * @param freq_scale   Frequency scaling factor, applied to theta scale.
+ * @param attn_factor  Attention scaling factor, applied to sin/cos.
+ * @param is_neox      Whether to use Neox-style repeat strategy
+ *                     (dim expansion vs repeat_interleave).
+ */
 static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
-                             aclTensor* acl_cos_repeat_tensor,
-                             aclTensor* acl_sin_repeat_tensor,
                              float theta_scale, float freq_scale,
                              float attn_factor, bool is_neox) {
     // init sin/cos cache, cache has different repeat method depends on
     // @param.is_neox
+    bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0);
+    bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0);
+
+    // used for accuracy testing
+    bool is_attention = is_q || is_k;
+
+    // just compute in first layer in attention
+    bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0);
+    if (is_attention && !is_fisrt_layer) {
+        return;
+    }
 
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src1 = dst->src[1];  // position
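For orientation, a scalar sketch of what the tensor pipeline described in the doxygen block actually caches, per position p and frequency index i in [0, ne00/2). This is illustrative pseudocode under the parameter meanings stated above, not code from this PR; the real implementation performs the same math with aclnn tensor operators:

    // theta_scale_cache[i] = powf(theta_scale, i) * freq_scale;   // arange -> power -> freq scaling
    // theta                = position[p] * theta_scale_cache[i];  // step 4
    // sin_val              = sinf(theta) * attn_factor;           // step 5
    // cos_val              = cosf(theta) * attn_factor;
    // is_neox == false: values duplicated element-wise (repeat_interleave): s0, s0, s1, s1, ...
    // is_neox == true : half-dim block tiled as a whole (repeat):           s0, s1, ..., s0, s1, ...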
@@ -2253,21 +2302,16 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
     }
 
-    bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0);
-    bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0);
-
-    // used for accuracy testing
-    bool is_attention = is_q || is_k;
-
-    if (ctx.init_ptr == nullptr || !is_attention) {
+    // init theta scale, just one time
+    if (ctx.rope_init_ptr == nullptr || !is_attention) {
         // theta_scale arange, [0,1,...,ne00/2 - 1]
-        if (ctx.init_ptr != nullptr){
-            ACL_CHECK(aclrtFree(ctx.init_ptr));
+        if (ctx.rope_init_ptr != nullptr){
+            ACL_CHECK(aclrtFree(ctx.rope_init_ptr));
         }
-        ACL_CHECK(aclrtMalloc(&ctx.init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
+        ACL_CHECK(aclrtMalloc(&ctx.rope_init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
 
         aclTensor* acl_theta_scale_tensor =
-            ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t),
+            ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
                                     theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
         float start = 0;
         float step = 1;
@@ -2297,74 +2341,75 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         ggml_cann_release_resources(ctx, acl_theta_scale_tensor,acl_theta_scale);
     }
 
-    if (ctx.sin_ptr == nullptr) {
-        int64_t theta_length = theta_scale_length * ctx.max_prompt_length;
-        ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
-        ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
-    }
+    // init sin_repeat && cos_repeat, one token just init in 0 layer
     if (position_length > ctx.max_prompt_length) {
         ctx.max_prompt_length = position_length;
-        int64_t theta_length = theta_scale_length * ctx.max_prompt_length;
-        ACL_CHECK(aclrtFree(ctx.sin_ptr));
-        ACL_CHECK(aclrtFree(ctx.cos_ptr));
-        ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
-        ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
+        int64_t repeat_theta_length = theta_scale_length * ctx.max_prompt_length * 2;
+        if (ctx.rope_sin_ptr != nullptr) {
+            ACL_CHECK(aclrtFree(ctx.rope_sin_ptr));
+            ACL_CHECK(aclrtFree(ctx.rope_cos_ptr));
+        }
+        ACL_CHECK(aclrtMalloc(&ctx.rope_sin_ptr, repeat_theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
+        ACL_CHECK(aclrtMalloc(&ctx.rope_cos_ptr, repeat_theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
     }
 
-    bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0);
-
-    if (is_fisrt_layer || !is_attention) {
-
-        aclTensor* acl_theta_scale_tensor =
-            ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t),
+    aclTensor* acl_theta_scale_tensor =
+        ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
                                 theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
 
-        // position
-        aclTensor* acl_position_tensor = ggml_cann_create_tensor(
-            src1->data, ggml_cann_type_mapping(src1->type),
-            ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
-
-        // power * position
-        int64_t theta_length = theta_scale_length * position_length;
-        ggml_cann_pool_alloc theta_allocator(ctx.pool(),
-                                             theta_length * sizeof(float_t));
-        void* theta_buffer = theta_allocator.get();
-
-        aclTensor* acl_theta_tensor =
-            ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
-                                    theta_ne, theta_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
-                  acl_theta_tensor);
-
-        // sin/cos
-        aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
-            ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
-            GGML_MAX_DIMS, ACL_FORMAT_ND);
-        aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
-
-        aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
-            ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
-            GGML_MAX_DIMS, ACL_FORMAT_ND);
-        aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
-
-        // release
-        ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
-                                    acl_theta_tensor, acl_sin_tensor, acl_cos_tensor);
-    }
-
+    // position
+    aclTensor* acl_position_tensor = ggml_cann_create_tensor(
+        src1->data, ggml_cann_type_mapping(src1->type),
+        ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
+
+    // power * position
+    int64_t theta_length = theta_scale_length * position_length;
+    ggml_cann_pool_alloc theta_allocator(ctx.pool(),
+                                         theta_length * sizeof(float_t));
+    void* theta_buffer = theta_allocator.get();
+
+    aclTensor* acl_theta_tensor =
+        ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
+                                theta_ne, theta_nb, GGML_MAX_DIMS);
+    aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
+              acl_theta_tensor);
+
+    // sin/cos
+    ggml_cann_pool_alloc sin_allocator(ctx.pool(),
+                                       theta_length * sizeof(float_t));
+    void* sin_buffer = sin_allocator.get();
     aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
-        ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
-        GGML_MAX_DIMS, ACL_FORMAT_ND);
+        sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_ND);
+    aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
+
+    ggml_cann_pool_alloc cos_allocator(ctx.pool(),
+                                       theta_length * sizeof(float_t));
+    void* cos_buffer = cos_allocator.get();
     aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
-        ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
-        GGML_MAX_DIMS, ACL_FORMAT_ND);
+        cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_ND);
+    aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
 
     // attn_factor
     if (attn_factor != 1) {
         aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
         aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
     }
 
+    int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
+    size_t sin_reshape_nb[GGML_MAX_DIMS];
+    sin_reshape_nb[0] = sizeof(float_t);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
+    }
+    aclTensor* acl_sin_repeat_tensor =
+        ggml_cann_create_tensor(ctx.rope_sin_ptr, ACL_FLOAT, sizeof(float_t),
+                                sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+    aclTensor* acl_cos_repeat_tensor =
+        ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
+                                sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+
     // repeat
     if (is_neox) {
         int64_t repeatsArray[] = {1, 1, 1, 2};
@@ -2380,8 +2425,9 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                                 num_repeats, output_size);
     }
 
-    // release
-    ggml_cann_release_resources(ctx, acl_sin_tensor, acl_cos_tensor);
+    ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
+        acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor,
+        acl_cos_repeat_tensor);
 }
 
 #ifdef __cplusplus
@@ -2435,13 +2481,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
-    // init cos/sin cache
-    ggml_cann_pool_alloc sin_allocator(
-        ctx.pool(), ne00 * ne02 * sizeof(float_t));
-    ggml_cann_pool_alloc cos_allocator(
-        ctx.pool(), ne00 * ne02 * sizeof(float_t));
-    void* sin_buffer = sin_allocator.get();
-    void* cos_buffer = cos_allocator.get();
+    // init ctx.rope_cos/rope_sin cache
+    aclnn_cache_init(ctx, dst, theta_scale, freq_scale, attn_factor, is_neox);
 
     int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
     size_t sin_reshape_nb[GGML_MAX_DIMS];
@@ -2450,13 +2491,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
     }
     aclTensor* acl_sin_reshape_tensor =
-        ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t),
+        ggml_cann_create_tensor(ctx.rope_sin_ptr, ACL_FLOAT, sizeof(float_t),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclTensor* acl_cos_reshape_tensor =
-        ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
+        ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
-    aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
-                     theta_scale, freq_scale, attn_factor, is_neox);
 
     aclTensor* acl_src = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
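The hunks above rely on the backend context now owning persistent RoPE buffers in place of the old init_ptr/sin_ptr/cos_ptr fields. A rough sketch of the members this diff references (illustrative only; the actual declarations live in the CANN backend context header, not in this diff):

    // device buffers reused across layers and tokens,
    // reallocated via aclrtFree/aclrtMalloc in aclnn_cache_init above
    void*   rope_init_ptr;      // theta_scale array, length ne00/2
    void*   rope_sin_ptr;       // repeated sin cache, theta_scale_length * max_prompt_length * 2 floats
    void*   rope_cos_ptr;       // repeated cos cache, same size as rope_sin_ptr
    int64_t max_prompt_length;  // grows when a longer position batch is seen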