Skip to content

Commit 753d9bd

Browse files
guangyey authored
pytorchmergebot committed
Introduce a new API torch.xpu.set_per_process_memory_fraction (pytorch#165510)
# Motivation

Aligned with other backends, this PR introduces a new API `torch.xpu.set_per_process_memory_fraction` to allow users to customize the allowed memory for a single process.

Pull Request resolved: pytorch#165510
Approved by: https://github.com/EikanWang, https://github.com/ezyang
ghstack dependencies: pytorch#165508, pytorch#165509
1 parent dd1fe7c commit 753d9bd

File tree

8 files changed

+101
-2
lines changed

8 files changed

+101
-2
lines changed

c10/xpu/XPUCachingAllocator.cpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ class DeviceCachingAllocator {
123123
ska::flat_hash_map<xpu::XPUStream, std::deque<std::pair<sycl::event, Block*>>>
124124
xpu_events;
125125
DeviceIndex device_index;
126+
size_t allowed_memory_maximum = 0;
127+
bool set_fraction = false;
126128

127129
size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
128130
if (!src || src->allocated || src->event_count > 0 ||
@@ -245,6 +247,12 @@ class DeviceCachingAllocator {
245247
if (isRetry) {
246248
stats.num_alloc_retries += 1;
247249
}
250+
if (set_fraction &&
251+
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current +
252+
size >
253+
allowed_memory_maximum) {
254+
return false;
255+
}
248256
void* ptr = sycl::aligned_alloc_device(
249257
kDeviceAlignment,
250258
size,
@@ -435,6 +443,11 @@ class DeviceCachingAllocator {
435443
device_free =
436444
raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
437445
}
446+
std::string allowed_info;
447+
if (set_fraction) {
448+
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
449+
}
450+
438451
auto allocated_bytes =
439452
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
440453
.current;
@@ -459,7 +472,9 @@ class DeviceCachingAllocator {
459472
format_size(device_total),
460473
" of which ",
461474
format_size(device_free),
462-
" is free. Of the allocated memory ",
475+
" is free. ",
476+
allowed_info,
477+
"Of the allocated memory ",
463478
format_size(allocated_bytes),
464479
" is allocated by PyTorch, and ",
465480
format_size(reserved_bytes - allocated_bytes),
@@ -538,6 +553,14 @@ class DeviceCachingAllocator {
538553
stats.requested_bytes[statType].reset_peak();
539554
}
540555
}
556+
557+
void setMemoryFraction(double fraction) {
558+
c10::xpu::DeviceProp device_prop;
559+
c10::xpu::get_device_properties(&device_prop, device_index);
560+
auto device_total = device_prop.global_mem_size;
561+
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
562+
set_fraction = true;
563+
}
541564
};
542565

543566
static void local_raw_delete(void* ptr);
@@ -700,6 +723,16 @@ class XPUAllocator : public DeviceAllocator {
700723
assertValidDevice(device);
701724
device_allocators[device]->resetAccumulatedStats();
702725
}
726+
727+
void setMemoryFraction(double fraction, DeviceIndex device) {
728+
assertValidDevice(device);
729+
TORCH_CHECK_VALUE(
730+
0 < fraction && fraction <= 1,
731+
"invalid fraction:",
732+
fraction,
733+
". Please set within (0, 1].");
734+
device_allocators[device]->setMemoryFraction(fraction);
735+
}
703736
};
704737

705738
static XPUAllocator allocator;
@@ -744,6 +777,10 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
744777
return allocator.recordStream(dataPtr, stream);
745778
}
746779

780+
void setMemoryFraction(double fraction, DeviceIndex device) {
781+
return allocator.setMemoryFraction(fraction, device);
782+
}
783+
747784
REGISTER_ALLOCATOR(kXPU, &allocator)
748785

749786
} // namespace c10::xpu::XPUCachingAllocator

c10/xpu/XPUCachingAllocator.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,6 @@ C10_XPU_API void raw_delete(void* ptr);
2525

2626
C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);
2727

28+
C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
29+
2830
} // namespace c10::xpu::XPUCachingAllocator

docs/source/xpu.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
memory_stats_as_nested_dict
8686
reset_accumulated_memory_stats
8787
reset_peak_memory_stats
88+
set_per_process_memory_fraction
8889
```
8990

9091
```{eval-rst}

test/test_xpu.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
IS_LINUX,
2626
IS_WINDOWS,
2727
run_tests,
28+
serialTest,
2829
suppress_warnings,
2930
TEST_XPU,
3031
TestCase,
@@ -482,6 +483,32 @@ def test_raises_oom(self):
482483
with self.assertRaises(torch.OutOfMemoryError):
483484
torch.empty(1024 * 1024 * 1024 * 1024, device="xpu")
484485

486+
@serialTest()
487+
def test_set_per_process_memory_fraction(self):
488+
gc.collect()
489+
torch.xpu.empty_cache()
490+
total_memory = torch.xpu.get_device_properties().total_memory
491+
fraction = 0.5
492+
with self.assertRaisesRegex(ValueError, "invalid fraction:"):
493+
torch.xpu.set_per_process_memory_fraction(-0.1)
494+
with self.assertRaisesRegex(ValueError, "invalid fraction:"):
495+
torch.xpu.set_per_process_memory_fraction(1.1)
496+
497+
torch.xpu.set_per_process_memory_fraction(fraction)
498+
allowed_memory = int(total_memory * 0.49)
499+
reserved_memory = torch.xpu.memory_reserved()
500+
application_memory = allowed_memory - reserved_memory
501+
tensor = torch.empty(application_memory, dtype=torch.int8, device="xpu")
502+
del tensor
503+
gc.collect()
504+
torch.xpu.empty_cache()
505+
506+
application_memory = int(total_memory * 0.51)
507+
with self.assertRaises(torch.OutOfMemoryError):
508+
_ = torch.empty(application_memory, dtype=torch.int8, device="xpu")
509+
510+
torch.xpu.set_per_process_memory_fraction(1.0)
511+
485512
def test_memory_allocation(self):
486513
torch.xpu.empty_cache()
487514
prev_allocated = torch.xpu.memory_allocated()

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2391,6 +2391,7 @@ def _xpu_resetAccumulatedMemoryStats(device: _int) -> None: ...
23912391
def _xpu_resetPeakMemoryStats(device: _int) -> None: ...
23922392
def _xpu_getMemoryInfo(device: _int) -> tuple[_int, _int]: ...
23932393
def _xpu_canDeviceAccessPeer(device: _int, peer: _int) -> _bool: ...
2394+
def _xpu_setMemoryFraction(fraction: _float, device: _int) -> None: ...
23942395

23952396
class _XpuDeviceProperties:
23962397
name: str

torch/csrc/xpu/Module.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,9 @@ static void initXpuMethodBindings(PyObject* module) {
420420
[](c10::DeviceIndex device, c10::DeviceIndex peer) {
421421
return at::xpu::canDeviceAccessPeer(device, peer);
422422
});
423+
m.def("_xpu_setMemoryFraction", [](double fraction, c10::DeviceIndex device) {
424+
c10::xpu::XPUCachingAllocator::setMemoryFraction(fraction, device);
425+
});
423426
}
424427

425428
// Callback for python part. Used for additional initialization of python

torch/xpu/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ def _get_rng_state_offset(device: Union[int, str, torch.device] = "xpu") -> int:
530530
memory_stats_as_nested_dict,
531531
reset_accumulated_memory_stats,
532532
reset_peak_memory_stats,
533+
set_per_process_memory_fraction,
533534
)
534535
from .random import (
535536
get_rng_state,
@@ -584,6 +585,7 @@ def _get_rng_state_offset(device: Union[int, str, torch.device] = "xpu") -> int:
584585
"seed",
585586
"seed_all",
586587
"set_device",
588+
"set_per_process_memory_fraction",
587589
"set_rng_state",
588590
"set_rng_state_all",
589591
"set_stream",

torch/xpu/memory.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import torch
55
from torch.types import Device
66

7-
from . import _get_device_index, is_initialized
7+
from . import _get_device_index, _lazy_init, is_initialized
88

99

1010
_device_t = Union[Device, str, int, None]
@@ -194,6 +194,31 @@ def mem_get_info(device: _device_t = None) -> tuple[int, int]:
194194
return torch._C._xpu_getMemoryInfo(device)
195195

196196

197+
def set_per_process_memory_fraction(fraction: float, device: _device_t = None) -> None:
198+
r"""
199+
Set the memory fraction for a single process on XPU device.
200+
This function limits the amount of memory that the caching allocator can allocate
201+
on the specified XPU device. The allowed memory is computed as:
202+
203+
.. math:: \text{allowed\_memory} = \text{total\_memory} \times \text{fraction}
204+
205+
If the process attempts to allocate more than this allowed memory,
206+
an out-of-memory error will be raised by the allocator.
207+
208+
Arguments:
209+
fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
210+
device (torch.device or int or str, optional): selected device. It uses the current device,
211+
given by :func:`~torch.xpu.current_device`, if :attr:`device` is ``None`` (default).
212+
213+
.. note:: In general, the total available free memory is less than the total capacity.
214+
"""
215+
_lazy_init()
216+
device = _get_device_index(device, optional=True)
217+
if not isinstance(fraction, float):
218+
raise TypeError("Invalid type for fraction argument, must be `float`")
219+
torch._C._xpu_setMemoryFraction(fraction, device)
220+
221+
197222
__all__ = [
198223
"empty_cache",
199224
"max_memory_allocated",
@@ -205,4 +230,5 @@ def mem_get_info(device: _device_t = None) -> tuple[int, int]:
205230
"memory_stats_as_nested_dict",
206231
"reset_accumulated_memory_stats",
207232
"reset_peak_memory_stats",
233+
"set_per_process_memory_fraction",
208234
]

0 commit comments

Comments
 (0)