ggml-org · Yangxiaoz · May 28, 2025 · May 28, 2025
diff --git a/docs/build.md b/docs/build.md
@@ -298,6 +298,8 @@ If your GPU is not officially supported you can use the environment variable [`H
 
 On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
+On NVIDIA Jetson series devices, which have an integrated GPU, set the environment variable `GGML_CUDA_JETSON_DEVICE=1` to optimize for the unified memory architecture (UMA).
+
 ## Vulkan
 
 **Windows**

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1064,6 +1064,9 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
 
     GGML_UNUSED(buft);
 }
+// Reports whether `buft` is the CUDA host buffer type, recognized by its get_name callback.
+static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+    const auto name_fn = buft->iface.get_name;
+    return name_fn == ggml_backend_cuda_host_buffer_type_name;
+}
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -3263,7 +3266,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 }
 
 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+    // CUDA host buffers are accepted only while GGML_CUDA_JETSON_DEVICE is present in the environment.
+    const bool jetson  = getenv("GGML_CUDA_JETSON_DEVICE") != nullptr;
+    const bool type_ok = ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft) ||
+                         (jetson && ggml_backend_buft_is_cuda_host(buft));
+    return type_ok && buft->device == dev;
 }
 
 static int64_t get_op_batch_size(const ggml_tensor * op) {