
Commit f56c4b5

Merge b3542

2 parents 9e2b2d7 + 15fa07a, commit f56c4b5

File tree: 8 files changed (+103, -58 lines)

Makefile

Lines changed: 10 additions & 9 deletions
@@ -888,15 +888,16 @@ ggml/src/ggml-metal-embed.o: \
 	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
 	@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
-	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
-	@$(AS) $(TEMP_ASSEMBLY) -o $@
-	@rm -f ${TEMP_ASSEMBLY}
+	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
+	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
+	@rmdir ${TEMP_ASSEMBLY}
 endif
 endif # GGML_METAL
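Note on the embedded blob: the assembly stub above brackets the Metal source between the `_ggml_metallib_start` and `_ggml_metallib_end` symbols. Below is a minimal sketch of how such an `.incbin` blob is typically consumed from C++, assuming Mach-O's leading-underscore mangling (so the C-level identifiers drop the underscore) and linking against `ggml/src/ggml-metal-embed.o`; this is illustrative, not the exact code in `ggml-metal.m`.

```
// Sketch only: referencing a blob embedded via the .incbin stub in the Makefile.
// Assumption: the assembler symbols _ggml_metallib_start/_ggml_metallib_end map to
// these C identifiers on Mach-O (leading underscore added by the toolchain).
#include <cstddef>
#include <cstdio>

extern "C" const char ggml_metallib_start[];
extern "C" const char ggml_metallib_end[];

int main() {
    // The embedded Metal source spans [start, end); its size is the pointer difference.
    const size_t metallib_size = (size_t)(ggml_metallib_end - ggml_metallib_start);
    printf("embedded metallib source: %zu bytes\n", metallib_size);
    return 0;
}
```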

examples/llama-bench/llama-bench.cpp

Lines changed: 29 additions & 0 deletions
@@ -27,6 +27,14 @@
 #include "ggml-cann.h"
 #endif

+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 // utils
 static uint64_t get_time_ns() {
     using clock = std::chrono::high_resolution_clock;
@@ -96,6 +104,27 @@ static std::string get_cpu_info() {
         }
         fclose(f);
     }
+#elif defined(_WIN32)
+    HKEY hKey;
+    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                    TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                    0,
+                    KEY_READ,
+                    &hKey) != ERROR_SUCCESS) {
+        // fail to open registry key
+        return "";
+    }
+    char cpu_brand[256];
+    DWORD cpu_brand_size = sizeof(cpu_brand);
+    if (RegQueryValueExA(hKey,
+                        TEXT("ProcessorNameString"),
+                        NULL,
+                        NULL,
+                        (LPBYTE)cpu_brand,
+                        &cpu_brand_size) == ERROR_SUCCESS) {
+        id.assign(cpu_brand, cpu_brand_size);
+    }
+    RegCloseKey(hKey);
 #endif
     // TODO: other platforms
     return id;
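The new `_WIN32` branch reads the CPU brand string from the registry. Below is a self-contained sketch of the same lookup, handy for testing outside llama-bench; the function name is mine, and for `REG_SZ` values the size reported by `RegQueryValueExA` usually includes the terminating NUL, so the sketch trims it.

```
// Standalone sketch (Windows only, links against advapi32) of the registry lookup
// added to get_cpu_info(). Illustrative helper, not part of llama-bench itself.
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <string>

static std::string get_cpu_brand() {
    HKEY hKey;
    if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
                      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                      0, KEY_READ, &hKey) != ERROR_SUCCESS) {
        return "";  // key not found or not accessible
    }
    char buf[256];
    DWORD size = sizeof(buf);
    std::string brand;
    if (RegQueryValueExA(hKey, "ProcessorNameString", NULL, NULL,
                         (LPBYTE)buf, &size) == ERROR_SUCCESS) {
        // For REG_SZ values the reported size typically includes the terminating NUL.
        brand.assign(buf, (size > 0 && buf[size - 1] == '\0') ? size - 1 : size);
    }
    RegCloseKey(hKey);
    return brand;
}
#endif
```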

examples/quantize/quantize.cpp

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {

examples/sycl/README.md

Lines changed: 9 additions & 15 deletions
@@ -12,9 +12,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.

 List all SYCL devices with ID, compute capability, max work group size, ect.

-1. Build the llama.cpp for SYCL for all targets.
+1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.

-2. Enable oneAPI running environment
+2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)*

 ```
 source /opt/intel/oneapi/setvars.sh
@@ -29,19 +29,13 @@ source /opt/intel/oneapi/setvars.sh
 Check the ID in startup log, like:

 ```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
-    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
-    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 2 SYCL devices:
+|  |                   |                        |       |Max    |        |Max  |Global |               |
+|  |                   |                        |       |compute|Max work|sub  |mem    |               |
+|ID|        Device Type|                    Name|Version|units  |group   |group|size   | Driver version|
+|--|-------------------|------------------------|-------|-------|--------|-----|-------|---------------|
+| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics|    1.3|    512|    1024|   32| 16225M|      1.3.29138|
+| 1| [level_zero:gpu:1]|  Intel UHD Graphics 750|    1.3|     32|     512|   32| 62631M|      1.3.29138|

 ```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

ggml/src/ggml-backend.c

Lines changed: 15 additions & 10 deletions
@@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
     }

     // an async copy would normally happen after all the queued operations on both backends are completed
-    // sync src, set_async dst
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
-    } else {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_copy(src, dst);
-        ggml_backend_synchronize(backend_dst);
-    }
+    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+    ggml_backend_synchronize(backend_src);
+    ggml_backend_synchronize(backend_dst);
+    ggml_backend_tensor_copy(src, dst);
 }

 // events
@@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
-                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    ggml_backend_synchronize(input_backend);
+                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                        ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                    } else {
+                        ggml_backend_synchronize(split_backend);
+                    }
+                    ggml_backend_tensor_copy(input, input_cpy);
+                }
             }
         }
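The scheduler now prefers the destination backend's `cpy_tensor_async` hook and only falls back to a synchronized blocking copy. Here is a sketch of that fallback pattern factored into a helper, assuming it lives inside `ggml-backend.c` where the internal backend interface (`iface`) is visible; the helper name and the `dst_event` parameter are mine, not ggml API (adding such a public function is what the TODO in the diff suggests).

```
// Illustrative helper mirroring the scheduler's new copy path.
// Assumes ggml-backend-impl.h is in scope (internal iface access).
static void copy_tensor_with_fallback(ggml_backend_t backend_src,
                                      ggml_backend_t backend_dst,
                                      struct ggml_tensor * src,
                                      struct ggml_tensor * dst,
                                      ggml_backend_event_t dst_event /* may be NULL */) {
    // 1. Prefer the destination backend's async copy hook when it exists and accepts the pair.
    if (backend_dst->iface.cpy_tensor_async &&
        backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
        return;
    }
    // 2. Otherwise make sure the source data is ready ...
    ggml_backend_synchronize(backend_src);
    // ... and that the destination is no longer using its buffer, either via the
    // event recorded for this copy slot or by a full synchronize.
    if (dst_event != NULL) {
        ggml_backend_event_synchronize(dst_event);
    } else {
        ggml_backend_synchronize(backend_dst);
    }
    // 3. Blocking copy as a last resort.
    ggml_backend_tensor_copy(src, dst);
}
```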

ggml/src/ggml-cuda.cu

Lines changed: 22 additions & 19 deletions
@@ -2358,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
 }

 GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
-
     ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
     ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

-    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
         return false;
     }

-    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
         return false;
     }

-    // device -> device
+    // device -> device copy
     ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
     ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

-    if (backend_src != backend_dst) {
-        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
-        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;

-        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
-        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+#ifndef NDEBUG
+        GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+#endif
+        return false;
+    }

+    if (backend_src != backend_dst) {
         // copy on src stream
         if (cuda_ctx_src->device == cuda_ctx_dst->device) {
-            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
         } else {
 #ifdef GGML_CUDA_NO_PEER_COPY
             return false;
@@ -2393,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
 #endif
         }

-        // record event on src stream
+        // record event on src stream after the copy
         if (!cuda_ctx_src->copy_event) {
             ggml_cuda_set_device(cuda_ctx_src->device);
             CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
@@ -2405,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
     } else {
         // src and dst are on the same backend
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
     }
     return true;
 }
@@ -2742,11 +2744,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_MUL_MAT_ID:
             {
                 struct ggml_tensor * a = op->src[0];
-                if (op->op == GGML_OP_MUL_MAT) {
-                    struct ggml_tensor * b = op->src[1];
-                    if (a->ne[3] != b->ne[3]) {
-                        return false;
-                    }
+                struct ggml_tensor * b = op->src[1];
+                if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+                    return false;
+                }
+                if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
+                    return false;
                 }
                 switch (a->type) {
                     case GGML_TYPE_F32:
@@ -2877,7 +2880,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             return true;
         case GGML_OP_FLASH_ATTN_EXT:
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+            return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
 #else
             if (op->src[0]->ne[0] == 128) {
                 return true;
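Both memcpy calls now run on the source stream, and the cross-backend path orders the destination stream behind them with an event. A minimal CUDA sketch of that record/wait pattern, with illustrative names and error checking omitted:

```
// Sketch of the stream ordering used above: enqueue the copy on the source stream,
// record an event behind it, and make the destination stream wait on that event
// before touching the data. Names (stream_src/stream_dst) are illustrative.
#include <cuda_runtime.h>

void ordered_copy(void * dst, const void * src, size_t nbytes,
                  cudaStream_t stream_src, cudaStream_t stream_dst) {
    cudaEvent_t copy_done;
    cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming);

    // copy on the source stream, as the diff now does with cuda_ctx_src->stream()
    cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream_src);

    // record the event on the source stream after the copy ...
    cudaEventRecord(copy_done, stream_src);
    // ... and make the destination stream wait for it, without blocking the host
    cudaStreamWaitEvent(stream_dst, copy_done, 0);

    cudaEventDestroy(copy_done);  // safe: resources are released once the event completes
}
```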

ggml/src/ggml-sycl/dpct/helper.hpp

Lines changed: 16 additions & 3 deletions
@@ -874,7 +874,7 @@ namespace dpct
     inline std::string get_preferred_gpu_platform_name() {
         std::string result;

-        std::string filter = "level-zero";
+        std::string filter = "";
         char* env = getenv("ONEAPI_DEVICE_SELECTOR");
         if (env) {
             if (std::strstr(env, "level_zero")) {
@@ -892,11 +892,24 @@ namespace dpct
             else {
                 throw std::runtime_error("invalid device filter: " + std::string(env));
             }
+        } else {
+            auto default_device = sycl::device(sycl::default_selector_v);
+            auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
+
+            if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
+                filter = "level-zero";
+            }
+            else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
+                filter = "cuda";
+            }
+            else if (std::strstr(default_platform_name.c_str(), "HIP")) {
+                filter = "hip";
+            }
         }

-        auto plaform_list = sycl::platform::get_platforms();
+        auto platform_list = sycl::platform::get_platforms();

-        for (const auto& platform : plaform_list) {
+        for (const auto& platform : platform_list) {
             auto devices = platform.get_devices();
             auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
                 return d.is_gpu();
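When `ONEAPI_DEVICE_SELECTOR` is unset, the new branch derives the platform filter from whatever SYCL picks as the default device. A small standalone check (assuming a SYCL 2020 compiler such as `icpx -fsycl`) that prints the default device and the platform name this logic matches against ("Level-Zero", "CUDA", "HIP"):

```
// Standalone sketch: inspect the SYCL default device, mirroring the queries the
// new else-branch performs to choose the platform filter.
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    sycl::device dev(sycl::default_selector_v);
    const auto dev_name      = dev.get_info<sycl::info::device::name>();
    const auto platform_name = dev.get_platform().get_info<sycl::info::platform::name>();

    std::cout << "default device  : " << dev_name << "\n";
    std::cout << "default platform: " << platform_name << "\n";
    std::cout << "is_cpu          : " << (dev.is_cpu() ? "yes" : "no") << "\n";
    return 0;
}
```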

include/llama.h

Lines changed: 1 addition & 1 deletion
@@ -345,7 +345,7 @@ extern "C" {
         int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
-        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
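For context, a minimal sketch of filling these parameters, assuming the `llama_model_quantize_default_params()` and `llama_model_quantize()` entry points declared elsewhere in `llama.h`; the chosen ftype, types, and paths are illustrative.

```
// Sketch: quantize a model while pinning the token embeddings to a specific type.
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread              = 8;                        // <=0 would use hardware_concurrency()
    params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.token_embedding_type = GGML_TYPE_Q8_0;           // field whose comment the diff fixes
    params.allow_requantize     = false;

    // returns 0 on success
    return llama_model_quantize("models/llama/ggml-model.gguf",
                                "models/llama/ggml-model-quant.gguf",
                                &params) == 0 ? 0 : 1;
}
```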
