Skip to content

Commit bff9f6f

Browse files
support to malloc memory on device more than 4GB, update the doc and script
1 parent e509411 commit bff9f6f

File tree

7 files changed

+33
-9
lines changed

7 files changed

+33
-9
lines changed

docs/backend/SYCL.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ The following releases are verified and recommended:
4242

4343
## News
4444

45+
- 2025.11
46+
- Support device memory allocations larger than 4GB.
47+
4548
- 2025.2
4649
- Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
4750
|GPU|Base tokens/s|Increased tokens/s|Percent|
@@ -789,6 +792,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
789792
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
790793
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
791794
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
795+
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support device memory allocations larger than 4GB. |
796+
792797

793798

794799
## Known Issues
@@ -835,6 +840,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
835840
| The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
836841
| The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
837842

843+
- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
844+
845+
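You need to enable support for device memory allocations larger than 4GB by setting the following environment variable (`export` on Linux, `set` on Windows):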
846+
```
847+
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
848+
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
849+
```
850+
838851
### **GitHub contribution**:
839852
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
840853

examples/sycl/run-llama2.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ MODEL_FILE=models/llama-2-7b.Q4_0.gguf
1515
NGL=99
1616
CONTEXT=4096
1717

18+
# Support device memory allocations larger than 4GB.
19+
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
20+
1821
if [ $# -gt 0 ]; then
1922
GGML_SYCL_DEVICE=$1
2023
echo "use $GGML_SYCL_DEVICE as main GPU"

examples/sycl/run-llama3.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
# If you want more control, DPC++ Allows selecting a specific device through the
88
# following environment variable
9-
#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
9+
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
1010
source /opt/intel/oneapi/setvars.sh
1111

1212
#export GGML_SYCL_DEBUG=1
@@ -18,11 +18,14 @@ MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
1818
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
1919
CONTEXT=4096
2020

21+
# Support device memory allocations larger than 4GB.
22+
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
23+
2124
if [ $# -gt 0 ]; then
2225
GGML_SYCL_DEVICE=$1
2326
echo "Using $GGML_SYCL_DEVICE as the main GPU"
24-
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
27+
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
2528
else
2629
#use multiple GPUs with same max compute units
27-
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
30+
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
2831
fi

examples/sycl/win-run-llama2.bat

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,7 @@
55
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
66
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
77

8+
:: Support device memory allocations larger than 4GB.
9+
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
810

911
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0

examples/sycl/win-run-llama3.bat

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,7 @@
55
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
66
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
77

8+
:: Support device memory allocations larger than 4GB.
9+
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
810

9-
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
11+
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99

ggml/src/ggml-sycl/CMakeLists.txt

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,10 @@ if (GGML_SYCL_F16)
9191
add_compile_definitions(GGML_SYCL_F16)
9292
endif()
9393

94-
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
94+
if (GGML_SYCL_TARGET STREQUAL "INTEL")
95+
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
96+
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
97+
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
9598
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
9699
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
97100
# INFO: Allowed Sub_group_sizes are not consistent through all
@@ -100,7 +103,8 @@ elseif (GGML_SYCL_TARGET STREQUAL "AMD")
100103
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
101104
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
102105
else()
103-
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
106+
# Default warp size for any other target.
107+
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
104108
endif()
105109

106110
if (GGML_SYCL_GRAPH)

ggml/src/ggml-sycl/cpy.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -515,9 +515,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
515515
const int64_t ne = ggml_nelements(src0);
516516
GGML_ASSERT(ne == ggml_nelements(src1));
517517

518-
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
519-
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
520-
521518
GGML_TENSOR_BINARY_OP_LOCALS01;
522519

523520
SYCL_CHECK(ggml_sycl_set_device(ctx.device));

0 commit comments

Comments
 (0)