
Commit aaae685

Merge pull request #248 from menloresearch/update-dev-from-master-2025-09-11-00-33
Sync master with upstream release b6445
2 parents ed2b499 + 00681df commit aaae685

25 files changed: +1163 −476 lines changed

.github/workflows/build.yml

Lines changed: 28 additions & 3 deletions
@@ -1063,21 +1063,46 @@ jobs:
         run: |
           git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

-      - name: Install
+      - name: Cache ROCm Installation
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-6.1-${{ runner.os }}-v1
+          restore-keys: |
+            rocm-6.1-${{ runner.os }}-
+
+      - name: Install ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
         id: depends
         run: |
           $ErrorActionPreference = "Stop"
           write-host "Downloading AMD HIP SDK Installer"
           Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
           write-host "Installing AMD HIP SDK"
           $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
-          $proc.WaitForExit(600000)
+          $completed = $proc.WaitForExit(600000)
+          if (-not $completed) {
+            Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+            $proc.Kill()
+            exit 1
+          }
+          if ($proc.ExitCode -ne 0) {
+            Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+            exit 1
+          }
           write-host "Completed AMD HIP SDK installation"

       - name: Verify ROCm
         id: verify
         run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version

       - name: Install ccache
         uses: ggml-org/[email protected]
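
Context on the install step above: it now waits on the installer with a 10-minute timeout, kills the process if it hangs, and fails the job on a non-zero exit code. A minimal Python sketch of that same wait/kill/exit-code pattern (the installer path here is hypothetical, not part of the workflow):

```python
import subprocess
import sys

# Hypothetical installer path; stands in for the AMD HIP SDK installer used above.
INSTALLER = r"C:\Temp\rocm-install.exe"

proc = subprocess.Popen([INSTALLER, "-install"])
try:
    # Equivalent of $proc.WaitForExit(600000): give the installer 10 minutes.
    returncode = proc.wait(timeout=600)
except subprocess.TimeoutExpired:
    print("ROCm installation timed out after 10 minutes. Killing the process", file=sys.stderr)
    proc.kill()
    sys.exit(1)

if returncode != 0:
    print(f"ROCm installation failed with exit code {returncode}", file=sys.stderr)
    sys.exit(1)

print("Completed AMD HIP SDK installation")
```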

.github/workflows/release.yml

Lines changed: 28 additions & 3 deletions
@@ -544,27 +544,52 @@ jobs:
         run: |
           git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

+      - name: Cache ROCm Installation
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-6.1-${{ runner.os }}-v1
+          restore-keys: |
+            rocm-6.1-${{ runner.os }}-
+
       - name: ccache
         uses: ggml-org/[email protected]
         with:
           key: windows-latest-cmake-hip-${{ matrix.name }}-x64
           evict-old-files: 1d

-      - name: Install
+      - name: Install ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
         id: depends
         run: |
           $ErrorActionPreference = "Stop"
           write-host "Downloading AMD HIP SDK Installer"
           Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
           write-host "Installing AMD HIP SDK"
           $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
-          $proc.WaitForExit(600000)
+          $completed = $proc.WaitForExit(600000)
+          if (-not $completed) {
+            Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+            $proc.Kill()
+            exit 1
+          }
+          if ($proc.ExitCode -ne 0) {
+            Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+            exit 1
+          }
           write-host "Completed AMD HIP SDK installation"

       - name: Verify ROCm
         id: verify
         run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version

       - name: Build
         id: cmake_build

convert_hf_to_gguf.py

Lines changed: 2 additions & 0 deletions
@@ -6701,6 +6701,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
         self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
+            self.gguf_writer.add_decoder_block_count(dec_n_layer)
         self.gguf_writer.add_head_count(self.hparams["num_heads"])
         self.gguf_writer.add_key_length(self.hparams["d_kv"])
         self.gguf_writer.add_value_length(self.hparams["d_kv"])

docs/backend/CANN.md

Lines changed: 4 additions & 0 deletions
@@ -314,3 +314,7 @@ Converting the matmul weight format from ND to NZ to improve performance. Enable
 ### GGML_CANN_ACL_GRAPH

 Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
+
+### GGML_CANN_GRAPH_CACHE_CAPACITY
+
+Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted.
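
As a rough illustration of the eviction behavior described in the new doc text (not the actual CANN implementation), a capacity-bounded LRU cache can be sketched in Python:

```python
from collections import OrderedDict


class LRUGraphCache:
    """Keeps at most `capacity` compiled graphs; evicts the least recently used one."""

    def __init__(self, capacity: int = 12):  # 12 mirrors the documented default
        self.capacity = capacity
        self._graphs: "OrderedDict[str, object]" = OrderedDict()

    def get(self, key: str):
        graph = self._graphs.get(key)
        if graph is not None:
            self._graphs.move_to_end(key)  # mark as most recently used
        return graph

    def put(self, key: str, graph: object) -> None:
        self._graphs[key] = graph
        self._graphs.move_to_end(key)
        if len(self._graphs) > self.capacity:
            self._graphs.popitem(last=False)  # drop the least recently used graph
```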

ggml/include/ggml-metal.h

Lines changed: 0 additions & 6 deletions
@@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

-GGML_DEPRECATED(
-        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-        "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
-
 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 38 additions & 24 deletions
@@ -2268,26 +2268,30 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
  * stream, and persistent buffers for rope init/cache.
  * @param dst The destination ggml_tensor whose computation
  *            depends on the RoPE values (usually Qcur/Kcur).
- * @param sin_tensor_buffer Pre-allocated buffer for storing repeated sin values.
- * @param cos_tensor_buffer Pre-allocated buffer for storing repeated cos values.
  * @param theta_scale Scalar exponent base for computing theta scale values.
  * @param freq_scale Frequency scaling factor, applied to theta scale.
  * @param attn_factor Attention scaling factor, applied to sin/cos.
  * @param is_neox Whether to use Neox-style repeat strategy
  *                (dim expansion vs repeat_interleave).
  */
 static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
-                             void* sin_tensor_buffer, void* cos_tensor_buffer,
                              float* corr_dims, float ext_factor,
                              float theta_scale, float freq_scale,
                              float attn_factor, bool is_neox) {
-    // int sin/cos cache, cache has different repeat method depond on
-    // @param.is_neox
-
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src1 = dst->src[1];  // position
     ggml_tensor* src2 = dst->src[2];  // freq_factors

+    if(src2 == nullptr && ctx.rope_cache.cached
+        && ctx.rope_cache.ext_factor == ext_factor
+        && ctx.rope_cache.theta_scale == theta_scale
+        && ctx.rope_cache.freq_scale == freq_scale
+        && ctx.rope_cache.attn_factor == attn_factor
+        && ctx.rope_cache.is_neox == is_neox) {
+        // use cache.
+        return;
+    }
+
     int64_t theta_scale_length = src0->ne[0] / 2;
     int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
     size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float),
@@ -2316,8 +2320,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         ctx.rope_cache.freq_scale != freq_scale) {

         ctx.rope_cache.theta_scale_length = theta_scale_length;
-        ctx.rope_cache.theta_scale = theta_scale;
-        ctx.rope_cache.freq_scale = freq_scale;

         if (ctx.rope_cache.theta_scale_cache != nullptr) {
             ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
@@ -2342,7 +2344,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         // return MIN(1, MAX(0, y)) - 1;
         yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
         void* yarn_ramp_buffer = yarn_ramp_allocator.get();
-        acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t),
+        acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float),
                                                        theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
         float zero_value = 0, one_value = 1;
         float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
@@ -2411,6 +2413,20 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
     }

+    // init sin_repeat && cos_repeat, only to accelerate first layer on each device
+    if (position_length > ctx.rope_cache.position_length) {
+        ctx.rope_cache.position_length = position_length;
+        if (ctx.rope_cache.sin_cache != nullptr) {
+            ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
+        }
+        if (ctx.rope_cache.cos_cache != nullptr) {
+            ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
+        }
+        int64_t repeat_theta_length = theta_scale_length * position_length * 2;
+        ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+        ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+    }
+
     // position
     aclTensor* acl_position_tensor = ggml_cann_create_tensor(
         src1->data, ggml_cann_type_mapping(src1->type),
@@ -2462,10 +2478,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
     }
     aclTensor* acl_sin_repeat_tensor =
-        ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclTensor* acl_cos_repeat_tensor =
-        ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);

     // repeat
@@ -2483,6 +2499,14 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                               num_repeats, output_size);
     }

+    // Other layers use cache except first layer.
+    ctx.rope_cache.cached = true;
+    ctx.rope_cache.ext_factor = ext_factor;
+    ctx.rope_cache.theta_scale = theta_scale;
+    ctx.rope_cache.freq_scale = freq_scale;
+    ctx.rope_cache.attn_factor = attn_factor;
+    ctx.rope_cache.is_neox = is_neox;
+
     ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
         acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor,
         acl_cos_repeat_tensor);
@@ -2504,10 +2528,7 @@ aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
 #endif

 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    // TODO: use ascendc
-    // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
-    ggml_tensor* src1 = dst->src[1];

     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -2538,15 +2559,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

-    // sin/cos tensor length.
-    int64_t repeat_theta_length = src0->ne[0] * src1->ne[0];
-    ggml_cann_pool_alloc sin_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
-    ggml_cann_pool_alloc cos_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
-    void *sin_tensor_buffer = sin_tensor_allocator.get();
-    void *cos_tensor_buffer = cos_tensor_allocator.get();
-
     // init ctx.rope_cos/rope_sin cache
-    aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor,
+    aclnn_cache_init(ctx, dst, corr_dims, ext_factor,
                      theta_scale, freq_scale, attn_factor, is_neox);

     int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
@@ -2556,10 +2570,10 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
     }
     aclTensor* acl_sin_reshape_tensor =
-        ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
     aclTensor* acl_cos_reshape_tensor =
-        ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
+        ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
                                 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);

     aclTensor* acl_src = ggml_cann_create_tensor(src0);
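
The aclnn_cache_init change above amounts to a parameter-keyed cache: the sin/cos tables are recomputed only when the RoPE parameters change (and only when no per-tensor freq_factors are present); otherwise the previously computed tables are reused across layers. A simplified Python sketch of that pattern (names and the table formula are illustrative, not the CANN API; yarn/ext_factor handling is omitted):

```python
import numpy as np


class RopeCache:
    """Reuse sin/cos tables while the RoPE parameters stay the same."""

    def __init__(self):
        self.cached = False
        self.params = None
        self.sin = None
        self.cos = None


def rope_sin_cos(cache: RopeCache, positions: np.ndarray, head_dim: int,
                 theta_scale: float, freq_scale: float, attn_factor: float,
                 has_freq_factors: bool):
    params = (theta_scale, freq_scale, attn_factor, head_dim)

    # Fast path: same parameters as the last call and no per-tensor frequency factors.
    if not has_freq_factors and cache.cached and cache.params == params:
        return cache.sin, cache.cos

    # Slow path: rebuild the tables. theta_scale plays the role of the
    # per-dimension decay base; theta[p, i] = p * freq_scale * theta_scale**i.
    dims = np.arange(head_dim // 2)
    theta = np.outer(positions.astype(np.float64), freq_scale * theta_scale ** dims)
    cache.sin = np.sin(theta) * attn_factor
    cache.cos = np.cos(theta) * attn_factor
    cache.params = params
    cache.cached = True
    return cache.sin, cache.cos
```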
