
Commit a26dec5

Merge branch 'ggml-org:master' into master
2 parents: e7565e8 + 33daece

8 files changed: +173 additions, -28 deletions

.github/workflows/release.yml

Lines changed: 28 additions & 3 deletions
@@ -544,27 +544,52 @@ jobs:
       run: |
         git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
 
+    - name: Cache ROCm Installation
+      id: cache-rocm
+      uses: actions/cache@v4
+      with:
+        path: C:\Program Files\AMD\ROCm
+        key: rocm-6.1-${{ runner.os }}-v1
+        restore-keys: |
+          rocm-6.1-${{ runner.os }}-
+
     - name: ccache
       uses: ggml-org/[email protected]
       with:
         key: windows-latest-cmake-hip-${{ matrix.name }}-x64
         evict-old-files: 1d
 
-    - name: Install
+    - name: Install ROCm
+      if: steps.cache-rocm.outputs.cache-hit != 'true'
       id: depends
       run: |
         $ErrorActionPreference = "Stop"
         write-host "Downloading AMD HIP SDK Installer"
         Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
         write-host "Installing AMD HIP SDK"
         $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
-        $proc.WaitForExit(600000)
+        $completed = $proc.WaitForExit(600000)
+        if (-not $completed) {
+          Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+          $proc.Kill()
+          exit 1
+        }
+        if ($proc.ExitCode -ne 0) {
+          Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+          exit 1
+        }
         write-host "Completed AMD HIP SDK installation"
 
     - name: Verify ROCm
       id: verify
       run: |
-        & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+        # Find and test ROCm installation
+        $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+        if (-not $clangPath) {
+          Write-Error "ROCm installation not found"
+          exit 1
+        }
+        & $clangPath.FullName --version
 
     - name: Build
       id: cmake_build

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 38 additions & 24 deletions
@@ -2268,26 +2268,30 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
  * stream, and persistent buffers for rope init/cache.
  * @param dst The destination ggml_tensor whose computation
  *            depends on the RoPE values (usually Qcur/Kcur).
- * @param sin_tensor_buffer Pre-allocated buffer for storing repeated sin values.
- * @param cos_tensor_buffer Pre-allocated buffer for storing repeated cos values.
  * @param theta_scale Scalar exponent base for computing theta scale values.
  * @param freq_scale Frequency scaling factor, applied to theta scale.
  * @param attn_factor Attention scaling factor, applied to sin/cos.
  * @param is_neox Whether to use Neox-style repeat strategy
  *                (dim expansion vs repeat_interleave).
  */
 static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
-                             void* sin_tensor_buffer, void* cos_tensor_buffer,
                              float* corr_dims, float ext_factor,
                              float theta_scale, float freq_scale,
                              float attn_factor, bool is_neox) {
-    // int sin/cos cache, cache has different repeat method depond on
-    // @param.is_neox
-
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src1 = dst->src[1];  // position
     ggml_tensor* src2 = dst->src[2];  // freq_factors
 
+    if (src2 == nullptr && ctx.rope_cache.cached
+        && ctx.rope_cache.ext_factor == ext_factor
+        && ctx.rope_cache.theta_scale == theta_scale
+        && ctx.rope_cache.freq_scale == freq_scale
+        && ctx.rope_cache.attn_factor == attn_factor
+        && ctx.rope_cache.is_neox == is_neox) {
+        // use cache.
+        return;
+    }
+
     int64_t theta_scale_length = src0->ne[0] / 2;
     int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
     size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float),
@@ -2316,8 +2320,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         ctx.rope_cache.freq_scale != freq_scale) {
 
         ctx.rope_cache.theta_scale_length = theta_scale_length;
-        ctx.rope_cache.theta_scale = theta_scale;
-        ctx.rope_cache.freq_scale = freq_scale;
 
         if (ctx.rope_cache.theta_scale_cache != nullptr) {
             ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
@@ -2342,7 +2344,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         // return MIN(1, MAX(0, y)) - 1;
         yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
         void* yarn_ramp_buffer = yarn_ramp_allocator.get();
-        acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t),
+        acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float),
             theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
         float zero_value = 0, one_value = 1;
         float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
@@ -2411,6 +2413,20 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
     }
 
+    // init sin_repeat && cos_repeat, only to accelerate first layer on each device
+    if (position_length > ctx.rope_cache.position_length) {
+        ctx.rope_cache.position_length = position_length;
+        if (ctx.rope_cache.sin_cache != nullptr) {
+            ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
+        }
+        if (ctx.rope_cache.cos_cache != nullptr) {
+            ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
+        }
+        int64_t repeat_theta_length = theta_scale_length * position_length * 2;
+        ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+        ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+    }
+
     // position
     aclTensor* acl_position_tensor = ggml_cann_create_tensor(
         src1->data, ggml_cann_type_mapping(src1->type),
@@ -2462,10 +2478,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
     }
     aclTensor* acl_sin_repeat_tensor =
-        ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclTensor* acl_cos_repeat_tensor =
-        ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
 
     // repeat
@@ -2483,6 +2499,14 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                              num_repeats, output_size);
     }
 
+    // Other layers use cache except first layer.
+    ctx.rope_cache.cached = true;
+    ctx.rope_cache.ext_factor = ext_factor;
+    ctx.rope_cache.theta_scale = theta_scale;
+    ctx.rope_cache.freq_scale = freq_scale;
+    ctx.rope_cache.attn_factor = attn_factor;
+    ctx.rope_cache.is_neox = is_neox;
+
     ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
         acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor,
         acl_cos_repeat_tensor);
@@ -2504,10 +2528,7 @@ aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
 #endif
 
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    // TODO: use ascendc
-    // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
-    ggml_tensor* src1 = dst->src[1];
 
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -2538,15 +2559,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
-    // sin/cos tensor length.
-    int64_t repeat_theta_length = src0->ne[0] * src1->ne[0];
-    ggml_cann_pool_alloc sin_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
-    ggml_cann_pool_alloc cos_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
-    void *sin_tensor_buffer = sin_tensor_allocator.get();
-    void *cos_tensor_buffer = cos_tensor_allocator.get();
-
     // init ctx.rope_cos/rope_sin cache
-    aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor,
+    aclnn_cache_init(ctx, dst, corr_dims, ext_factor,
                      theta_scale, freq_scale, attn_factor, is_neox);
 
     int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
@@ -2556,10 +2570,10 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
     }
     aclTensor* acl_sin_reshape_tensor =
-        ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclTensor* acl_cos_reshape_tensor =
-        ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
 
     aclTensor* acl_src = ggml_cann_create_tensor(src0);
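
The reworked aclnn_cache_init above turns the per-call sin/cos scratch buffers into persistent, parameter-keyed device buffers: the first RoPE node of a graph computes and stores the values, and every later layer with matching parameters returns early. The sketch below illustrates that check-then-fill pattern on plain host memory; the field and function names mirror the diff, but the malloc/free buffer handling and the simplified fill loop are illustrative assumptions, not the CANN backend's actual code.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Simplified analogue of ggml_cann_rope_cache: persistent buffers plus the
// parameters that must match before the cached values may be reused.
struct rope_cache {
    float*  sin_cache = nullptr;
    float*  cos_cache = nullptr;
    int64_t position_length = 0;
    bool    cached = false;
    float   theta_scale = 0.0f, freq_scale = 0.0f, attn_factor = 0.0f, ext_factor = 0.0f;
    bool    is_neox = false;
    ~rope_cache() { free(sin_cache); free(cos_cache); }
};

// Check-then-fill, mirroring the diff: early-return on a parameter hit,
// grow the buffers only when the position count increases, then recompute.
static void cache_init(rope_cache& c, int64_t n_dims, int64_t n_pos,
                       float theta_scale, float freq_scale,
                       float attn_factor, float ext_factor, bool is_neox) {
    if (c.cached && c.theta_scale == theta_scale && c.freq_scale == freq_scale &&
        c.attn_factor == attn_factor && c.ext_factor == ext_factor && c.is_neox == is_neox) {
        return;  // later layers reuse the sin/cos computed for the first layer
    }
    if (n_pos > c.position_length) {
        c.position_length = n_pos;
        free(c.sin_cache);
        free(c.cos_cache);
        c.sin_cache = (float*)malloc(sizeof(float) * n_pos * n_dims);
        c.cos_cache = (float*)malloc(sizeof(float) * n_pos * n_dims);
    }
    for (int64_t p = 0; p < n_pos; ++p) {
        for (int64_t d = 0; d < n_dims; ++d) {
            // illustrative fill only: the real kernel also applies freq_factors,
            // YaRN ramps and the neox/normal repeat layout
            float theta = freq_scale * (float)p * powf(theta_scale, (float)(d / 2));
            c.sin_cache[p * n_dims + d] = sinf(theta) * attn_factor;
            c.cos_cache[p * n_dims + d] = cosf(theta) * attn_factor;
        }
    }
    c.cached      = true;
    c.theta_scale = theta_scale;
    c.freq_scale  = freq_scale;
    c.attn_factor = attn_factor;
    c.ext_factor  = ext_factor;
    c.is_neox     = is_neox;
}

int main() {
    rope_cache cache;
    // first "layer" computes and stores; the remaining layers take the early return
    for (int layer = 0; layer < 4; ++layer) {
        cache_init(cache, 128, 32, powf(10000.0f, -2.0f / 128.0f), 1.0f, 1.0f, 0.0f, false);
    }
    printf("cached=%d sin[0]=%f\n", cache.cached, cache.sin_cache[0]);
    return 0;
}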

ggml/src/ggml-cann/common.h

Lines changed: 15 additions & 0 deletions
@@ -425,12 +425,27 @@ struct ggml_cann_rope_cache {
         if(theta_scale_cache != nullptr) {
             ACL_CHECK(aclrtFree(theta_scale_cache));
         }
+        if(sin_cache != nullptr) {
+            ACL_CHECK(aclrtFree(sin_cache));
+        }
+        if(cos_cache != nullptr) {
+            ACL_CHECK(aclrtFree(cos_cache));
+        }
     }
 
     void* theta_scale_cache = nullptr;
     int64_t theta_scale_length = 0;
+    // sin/cos cache, used only to accelerate first layer on each device
+    void* sin_cache = nullptr;
+    void* cos_cache = nullptr;
+    int64_t position_length = 0;
+    // Properties to check before reusing the sincos cache
+    bool cached = false;
+    float ext_factor = 0.0f;
     float theta_scale = 0.0f;
     float freq_scale = 0.0f;
+    float attn_factor = 0.0f;
+    bool is_neox = false;
 };
 
 struct ggml_cann_tensor_cache {

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 3 additions & 0 deletions
@@ -2353,6 +2353,9 @@ static enum ggml_status ggml_backend_cann_graph_compute(
     ggml_cann_set_device(cann_ctx->device);
     g_nz_workspaces[cann_ctx->device].clear();
 
+    // recalculate the rope cache for the first layer on the current device.
+    cann_ctx->rope_cache.cached = false;
+
 #ifdef USE_ACL_GRAPH
     bool use_cann_graph = true;
     bool cann_graph_update_required = false;
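
The one-line reset above is what scopes the reuse to a single forward pass: `cached` is cleared once per graph, so the first RoPE node after the clear recomputes sin/cos (positions usually change between decode steps), and every later node in the same graph takes the early-return path in aclnn_cache_init. Below is a self-contained sketch of that lifecycle, with hypothetical device_ctx/rope_node/graph_compute names standing in for the backend types:

#include <cstdio>

// Minimal model of the per-graph invalidation: the flag lives on the device
// context, is cleared once per graph, and only the first RoPE node after the
// clear pays the recomputation cost.
struct device_ctx {
    bool rope_cached = false;
};

static void rope_node(device_ctx& ctx, int layer) {
    if (ctx.rope_cached) {
        printf("layer %d: reuse cached sin/cos\n", layer);
        return;
    }
    printf("layer %d: recompute sin/cos\n", layer);
    ctx.rope_cached = true;
}

static void graph_compute(device_ctx& ctx, int n_layers) {
    ctx.rope_cached = false;  // mirrors cann_ctx->rope_cache.cached = false
    for (int layer = 0; layer < n_layers; ++layer) {
        rope_node(ctx, layer);
    }
}

int main() {
    device_ctx ctx;
    graph_compute(ctx, 3);  // decode step 1: layer 0 recomputes, layers 1-2 reuse
    graph_compute(ctx, 3);  // decode step 2: flag was cleared, so recompute again
    return 0;
}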

media/llama1-icon-transparent.png

13.9 KB

media/llama1-icon-transparent.svg

Lines changed: 77 additions & 0 deletions

tests/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 !*.*
 *.o
 ggml-common.h
+**/*.swp

tests/test-backend-ops.cpp

Lines changed: 11 additions & 1 deletion
@@ -6807,7 +6807,17 @@ static void list_all_ops() {
 static void show_test_coverage() {
     std::set<std::string> all_ops;
     for (int i = 1; i < GGML_OP_COUNT; i++) {
-        all_ops.insert(ggml_op_name((enum ggml_op)i));
+        auto op = (enum ggml_op)i;
+        if (op == GGML_OP_VIEW ||
+            op == GGML_OP_RESHAPE ||
+            op == GGML_OP_PERMUTE ||
+            op == GGML_OP_TRANSPOSE ||
+            op == GGML_OP_CONT ||
+            op == GGML_OP_GLU ||
+            op == GGML_OP_UNARY) {
+            continue;
+        }
+        all_ops.insert(ggml_op_name(op));
     }
     for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) {
         all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i));
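
The filter above removes the layout ops (VIEW, RESHAPE, PERMUTE, TRANSPOSE, CONT) and the UNARY/GLU umbrella ops from the coverage universe; the unary sub-ops are added by name in the loop that follows, and GLU presumably gets similar treatment, so counting the umbrella and layout entries as well would skew the reported figure. As a hypothetical illustration only (not the actual show_test_coverage body), a coverage percentage can be derived from the resulting sets like this:

#include <cstdio>
#include <set>
#include <string>

int main() {
    // Toy data: the filtered op universe, and the ops seen in test cases.
    std::set<std::string> all_ops    = {"ADD", "MUL_MAT", "ROPE", "SOFT_MAX"};
    std::set<std::string> tested_ops = {"ADD", "MUL_MAT", "ROPE"};

    int covered = 0;
    for (const auto & op : all_ops) {
        covered += tested_ops.count(op) ? 1 : 0;
    }
    printf("coverage: %d/%zu ops (%.1f%%)\n",
           covered, all_ops.size(), 100.0 * covered / all_ops.size());
    return 0;
}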
