
Commit 8a3856f

vulkan: Add memory detection for Intel GPU using Level Zero Sysman (PR ollama#12654)
Applied PR ollama#12654, which adds Intel GPU memory detection support using the Level Zero Sysman API. Changes:

- Added mem_l0_sysman.cpp for Level Zero Sysman integration
- Updated CMakeLists.txt to include Level Zero support
- Enhanced ggml-vulkan.cpp with improved GPU memory detection
- Added Level Zero runtime libraries to the Docker build
- Created patch file 0032 for the Level Zero integration

This improves Vulkan memory detection accuracy for Intel integrated and discrete GPUs.
1 parent f9422dc commit 8a3856f

File tree

7 files changed: +893 -9 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -24,3 +24,4 @@ logs/crash
 .ccache/
 
 
+temp_pr12654.patch

Dockerfile

Lines changed: 12 additions & 1 deletion
@@ -122,6 +122,11 @@ RUN --mount=type=cache,target=/root/.ccache \
     cmake --preset 'Vulkan' -DOLLAMA_RUNNER_DIR="vulkan" \
     && cmake --build --parallel --preset 'Vulkan' \
     && cmake --install build --component Vulkan --strip --parallel 8
+# Install Intel oneAPI Level Zero runtime
+# https://dgpu-docs.intel.com/driver/installation-lts2.html
+RUN rpm --import https://repositories.intel.com/gpu/intel-graphics.key \
+    && dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/8.10/lts/2523/unified/intel-gpu-8.10.repo \
+    && dnf install -y level-zero intel-level-zero-gpu intel-gmmlib
 
 
 FROM base AS build
@@ -143,6 +148,12 @@ FROM --platform=linux/amd64 scratch AS amd64
 COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
 COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
 COPY --from=vulkan dist/lib/ollama /lib/ollama/
+# Copy over minimal Intel oneAPI Level Zero runtime libraries to run Level Zero Sysman
+COPY --from=vulkan /usr/lib64/libze_loader.so.1 /lib/ollama/level_zero/
+COPY --from=vulkan /usr/lib64/libze_intel_gpu.so.1 /lib/ollama/level_zero/
+COPY --from=vulkan /usr/lib64/libigdgmm.so.12 /lib/ollama/level_zero/
+COPY --from=vulkan /usr/lib64/libze_tracing_layer.so.1 /lib/ollama/level_zero/
+COPY --from=vulkan /usr/lib64/libze_validation_layer.so.1 /lib/ollama/level_zero/
 
 FROM --platform=linux/arm64 scratch AS arm64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
@@ -192,7 +203,7 @@ RUN apt-get update \
 COPY --from=archive /bin /usr/bin
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 COPY --from=archive /lib/ollama /usr/lib/ollama
-ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/lib/ollama/level_zero/
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV OLLAMA_HOST=0.0.0.0:11434
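These libraries are not linked into the ollama binary; like the NVML path, the Level Zero loader is expected to be resolved at runtime through the dynamic linker, which is why staging libze_loader.so.1 under /lib/ollama/level_zero/ and extending LD_LIBRARY_PATH is sufficient. A minimal C++ sketch of that lookup pattern, assuming a plain dlopen of the loader (illustrative only, not the actual mem_l0_sysman.cpp code):

// Hypothetical sketch: locate the Level Zero loader through the dynamic
// linker search path instead of linking it at build time.
#include <dlfcn.h>
#include <cstdio>

// Real signature is ze_result_t zesInit(zes_init_flags_t) from
// <level_zero/zes_api.h>; redeclared minimally to stay self-contained.
typedef int (*zesInit_t)(unsigned int flags);

int main() {
    // dlopen searches LD_LIBRARY_PATH, which the image extends with
    // /lib/ollama/level_zero/.
    void *handle = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL);
    if (!handle) {
        fprintf(stderr, "Level Zero loader not found: %s\n", dlerror());
        return 1;  // no Intel driver stack; Vulkan's memory budget is the fallback
    }
    zesInit_t zes_init = (zesInit_t) dlsym(handle, "zesInit");
    if (zes_init && zes_init(0) == 0) {  // 0 == ZE_RESULT_SUCCESS
        printf("Level Zero Sysman initialized\n");
    }
    dlclose(handle);
    return 0;
}

Because the lookup goes through the dynamic linker, the same image degrades gracefully on hosts without an Intel driver stack: dlopen simply fails and the existing memory-budget fallback is used.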

llama/patches/0032-Add-memory-detection-for-Intel-GPU-using-Level-Zero.patch

Lines changed: 478 additions & 0 deletions
Large diffs are not rendered by default.
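The patch body is not rendered above, but a Sysman memory query follows a fixed sequence in the Level Zero API: initialize Sysman, enumerate drivers and devices, match the requesting device by UUID, then sum the state of its memory modules. A minimal sketch of that sequence follows; it links directly against libze_loader and assumes a lowercase-hex UUID encoding, both simplifications, and l0_sysman_get_device_memory is an illustrative name, not the patch's actual code.

// Minimal sketch of a Level Zero Sysman memory query. Assumes a direct
// link against libze_loader; the UUID hex encoding is an assumption.
#include <level_zero/zes_api.h>
#include <cstdio>
#include <string>
#include <vector>

// Render a Level Zero device UUID as lowercase hex for comparison.
static std::string uuid_to_hex(const ze_device_uuid_t &uuid) {
    char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {};
    for (int i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; i++) {
        snprintf(buf + 2 * i, 3, "%02x", uuid.id[i]);
    }
    return buf;
}

// Returns 0 on success, filling *free_mem / *total_mem for the device
// whose UUID matches uuid_hex.
int l0_sysman_get_device_memory(const char *uuid_hex, size_t *free_mem, size_t *total_mem) {
    if (zesInit(0) != ZE_RESULT_SUCCESS) return -1;

    uint32_t driver_count = 0;
    zesDriverGet(&driver_count, nullptr);
    std::vector<zes_driver_handle_t> drivers(driver_count);
    zesDriverGet(&driver_count, drivers.data());

    for (auto driver : drivers) {
        uint32_t dev_count = 0;
        zesDeviceGet(driver, &dev_count, nullptr);
        std::vector<zes_device_handle_t> devices(dev_count);
        zesDeviceGet(driver, &dev_count, devices.data());

        for (auto device : devices) {
            zes_device_properties_t props = {};
            props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
            if (zesDeviceGetProperties(device, &props) != ZE_RESULT_SUCCESS) continue;
            if (uuid_to_hex(props.core.uuid) != uuid_hex) continue;

            // Sum the state of every memory module on the matched device.
            uint32_t mem_count = 0;
            zesDeviceEnumMemoryModules(device, &mem_count, nullptr);
            std::vector<zes_mem_handle_t> mems(mem_count);
            zesDeviceEnumMemoryModules(device, &mem_count, mems.data());

            *free_mem = *total_mem = 0;
            for (auto mem : mems) {
                zes_mem_state_t state = {};
                state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
                if (zesMemoryGetState(mem, &state) == ZE_RESULT_SUCCESS) {
                    *free_mem  += state.free;
                    *total_mem += state.size;
                }
            }
            return 0;
        }
    }
    return -1;  // no matching device
}

Sysman can expose more than one memory module per device (for example on multi-tile parts), which is why the sketch accumulates across handles rather than reading a single one.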

ml/backend/ggml/ggml/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -211,6 +211,7 @@ add_library(ggml-base
     ggml-quants.h
     mem_hip.cpp
     mem_nvml.cpp
+    mem_l0_sysman.cpp
     gguf.cpp)
 
 target_include_directories(ggml-base PRIVATE .)

ml/backend/ggml/ggml/src/ggml-impl.h

Lines changed: 3 additions & 0 deletions
@@ -645,6 +645,9 @@ GGML_API void ggml_nvml_release();
 GGML_API int ggml_hip_mgmt_init();
 GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
 GGML_API void ggml_hip_mgmt_release();
+GGML_API int ggml_l0_sysman_init();
+GGML_API int ggml_l0_sysman_get_device_memory(const char *uuid, size_t *free, size_t *total);
+GGML_API void ggml_l0_sysman_release();
 
 #ifdef __cplusplus
 }
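The new entry points follow the same init/query/release contract as the existing HIP and NVML helpers above them. A hypothetical standalone caller, mirroring the call sequence the Vulkan backend uses in the next file:

// Hypothetical caller of the new API; the call sequence mirrors the
// Vulkan backend's vendor-specific memory reporting path.
#include <cstddef>
#include <cstdio>

#include "ggml-impl.h"

void report_intel_memory(const char *device_uuid) {
    if (ggml_l0_sysman_init() != 0) {
        return;  // Sysman unavailable; caller falls back to Vulkan's memory budget
    }
    size_t free = 0, total = 0;
    if (ggml_l0_sysman_get_device_memory(device_uuid, &free, &total) == 0) {
        printf("Intel GPU free: %zu total: %zu bytes\n", free, total);
    }
    ggml_l0_sysman_release();
}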

ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 26 additions & 8 deletions
@@ -12450,11 +12450,11 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
     vk::PhysicalDeviceProperties2 props2;
     vkdev.getProperties2(&props2);
 
-    if (!ctx->is_integrated_gpu)
-    {
-        // Use vendor specific management libraries for best VRAM reporting if available
-        switch (props2.properties.vendorID) {
-        case VK_VENDOR_ID_AMD:
+    // Use vendor specific management libraries for best VRAM reporting if available
+    switch (props2.properties.vendorID) {
+    case VK_VENDOR_ID_AMD:
+        if (!ctx->is_integrated_gpu)
+        {
             if (ggml_hip_mgmt_init() == 0) {
                 int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
                 if (status == 0) {
@@ -12464,8 +12464,11 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
                 }
                 ggml_hip_mgmt_release();
             }
-            break;
-        case VK_VENDOR_ID_NVIDIA:
+        }
+        break;
+    case VK_VENDOR_ID_NVIDIA:
+        if (!ctx->is_integrated_gpu)
+        {
             if (ggml_nvml_init() == 0) {
                 int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
                 if (status == 0) {
@@ -12475,8 +12478,23 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
                 }
                 ggml_nvml_release();
             }
-            break;
         }
+        break;
+    case VK_VENDOR_ID_INTEL:
+        // L0 sysman can support both iGPU and dGPU on Windows and Linux.
+        // If the driver is old on Windows we will fail to get memory info for iGPU.
+        // For Linux you need to run ollama with `sudo` or run `sudo setcap cap_perfmon=+ep /path/to/ollama_binary`
+        // to apply perfmon privilege to the ollama binary
+        if (ggml_l0_sysman_init() == 0) {
+            int status = ggml_l0_sysman_get_device_memory(ctx->uuid.c_str(), free, total);
+            if (status == 0) {
+                GGML_LOG_DEBUG("%s utilizing Level Zero Sysman memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                ggml_l0_sysman_release();
+                return;
+            }
+            ggml_l0_sysman_release();
+        }
+        break;
     }
     // else fallback to memory budget if supported
 
