 from pytabkit.models.utils import FunctionProcess
 
 
+def get_gpu_rams_gb(use_reserved: bool = True):
+    """
+    Returns:
+        gpu_rams_gb: total GPU memory per visible device (GB)
+        gpu_rams_fixed_gb: GPU memory used by this process per visible device (GB)
+            - reserved (default): bytes reserved by the torch caching allocator (often matches "process used" better)
+            - allocated: live tensor bytes only
+    """
+    # use torch: it respects CUDA_VISIBLE_DEVICES and doesn't need the pynvml dependency
+    BYTES_TO_GB = 1024.0 ** 3
+    import torch
+
+    gpu_rams_gb = []
+    gpu_rams_fixed_gb = []
+
+    n = torch.cuda.device_count()  # respects CUDA_VISIBLE_DEVICES ("" => 0)
+    for i in range(n):
+        with torch.cuda.device(i):
+            _free_b, total_b = torch.cuda.mem_get_info()
+
+        gpu_rams_gb.append(total_b / BYTES_TO_GB)
+
+        if use_reserved:
+            used_b = torch.cuda.memory_reserved(i)
+        else:
+            used_b = torch.cuda.memory_allocated(i)
+
+        gpu_rams_fixed_gb.append(used_b / BYTES_TO_GB)
+
+    return gpu_rams_gb, gpu_rams_fixed_gb
+
+
 def measure_node_resources(node_id: int) -> Tuple[NodeResources, NodeResources]:
     """
     Function that measures available resources.
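For context, a quick usage sketch of the new helper (not part of the commit; the import path below is an assumption, adjust it to wherever this module lives in pytabkit):

```python
# Hypothetical usage; the import path is assumed, not taken from this commit.
from pytabkit.models.training.scheduling import get_gpu_rams_gb

totals_gb, used_gb = get_gpu_rams_gb(use_reserved=True)
# with CUDA_VISIBLE_DEVICES="" both lists are empty and the loop body never runs
for i, (total, used) in enumerate(zip(totals_gb, used_gb)):
    print(f'cuda:{i}: {used:.2f} GB reserved by this process / {total:.2f} GB total')
```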
@@ -29,21 +61,22 @@ def measure_node_resources(node_id: int) -> Tuple[NodeResources, NodeResources]:
         # init cuda
         # alloc dummy tensors to know how much memory PyTorch uses for its runtime
         dummy_tensors = [torch.ones(1).to(f'cuda:{i}') for i in range(n_gpus)]
-        import pynvml
-        pynvml.nvmlInit()
-
-        gpu_rams_gb = []
-        gpu_rams_fixed_gb = []
-
-        for i in range(n_gpus):
-            # adapted torch.cuda.list_gpu_processes(gpu)
-            h = pynvml.nvmlDeviceGetHandleByIndex(i)
-            info = pynvml.nvmlDeviceGetMemoryInfo(h)
-            total = info.total
-            # print(f'free : {info.free}')
-            used = info.used
-            gpu_rams_gb.append(total / (1024. ** 3))
-            gpu_rams_fixed_gb.append(used / (1024. ** 3))
+        gpu_rams_gb, gpu_rams_fixed_gb = get_gpu_rams_gb()
     else:
         gpu_rams_gb = []
         gpu_rams_fixed_gb = []
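Why `use_reserved=True` is the default: PyTorch's caching allocator keeps freed blocks reserved, so `torch.cuda.memory_reserved` tracks what the process actually holds on the device, while `torch.cuda.memory_allocated` only counts live tensors. A minimal sketch of the difference (assumes a CUDA device is available; not part of the commit):

```python
import torch

if torch.cuda.is_available():
    x = torch.ones(1024, 1024, device='cuda:0')  # ~4 MB of live tensor data
    print(torch.cuda.memory_allocated(0))  # live tensor bytes
    print(torch.cuda.memory_reserved(0))   # >= allocated: includes cached blocks
    del x
    print(torch.cuda.memory_allocated(0))  # drops back toward 0
    print(torch.cuda.memory_reserved(0))   # stays up until torch.cuda.empty_cache()
```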