
Commit edc1ef0

v1.7.2: removed debug print, scheduler fix
1 parent d944b1c commit edc1ef0

7 files changed: +62 -20 lines changed

README.md

Lines changed: 3 additions & 0 deletions
@@ -200,6 +200,9 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
 
 ## Releases (see git tags)
 
+- v1.7.2:
+  - Removed debug print in RealMLP.
+  - fixed device memory estimation error in the scheduler when `CUDA_VISIBLE_DEVICES` was used.
 - v1.7.1:
   - LightGBM now processes the `extra_trees`, `max_cat_to_onehot`, and `min_data_per_group` parameters
     used in the `'tabarena'` search space, which should improve results.
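
The scheduler entry above relates to a difference in device enumeration: NVML reports all physical GPUs regardless of `CUDA_VISIBLE_DEVICES`, while `torch.cuda` only sees the visible ones. A minimal sketch of the mismatch, assuming a multi-GPU machine where only physical device 1 is exposed (the device id is a made-up example):

import os

# Expose only physical device 1 to this process (hypothetical example);
# this must happen before CUDA is initialized.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import torch
import pynvml

pynvml.nvmlInit()
print(pynvml.nvmlDeviceGetCount())  # NVML ignores CUDA_VISIBLE_DEVICES and counts all physical GPUs
print(torch.cuda.device_count())    # torch respects it and reports 1
pynvml.nvmlShutdown()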

pytabkit/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-__version__ = "1.7.1"
+__version__ = "1.7.2"

pytabkit/bench/scheduling/execution.py

Lines changed: 48 additions & 15 deletions
@@ -13,6 +13,38 @@
 from pytabkit.models.utils import FunctionProcess
 
 
+def get_gpu_rams_gb(use_reserved: bool = True):
+    """
+    Returns:
+        gpu_rams_gb: total GPU memory per visible device (GB)
+        gpu_rams_fixed_gb: this process GPU memory per visible device (GB)
+            - reserved (default): torch caching allocator reserved bytes (often matches "process used" better)
+            - allocated: live tensor bytes only
+    """
+    # do it in torch, it respects CUDA_VISIBLE_DEVICES and doesn't need the pynvml dependency
+    BYTES_TO_GB = 1024.0 ** 3
+    import torch
+
+    gpu_rams_gb = []
+    gpu_rams_fixed_gb = []
+
+    n = torch.cuda.device_count()  # respects CUDA_VISIBLE_DEVICES ("" => 0)
+    for i in range(n):
+        with torch.cuda.device(i):
+            _free_b, total_b = torch.cuda.mem_get_info()
+
+        gpu_rams_gb.append(total_b / BYTES_TO_GB)
+
+        if use_reserved:
+            used_b = torch.cuda.memory_reserved(i)
+        else:
+            used_b = torch.cuda.memory_allocated(i)
+
+        gpu_rams_fixed_gb.append(used_b / BYTES_TO_GB)
+
+    return gpu_rams_gb, gpu_rams_fixed_gb
+
+
 def measure_node_resources(node_id: int) -> Tuple[NodeResources, NodeResources]:
     """
     Function that measures available resources.
@@ -29,21 +61,22 @@ def measure_node_resources(node_id: int) -> Tuple[NodeResources, NodeResources]:
         # init cuda
         # alloc dummy tensors to know how much memory PyTorch uses for its runtime
         dummy_tensors = [torch.ones(1).to(f'cuda:{i}') for i in range(n_gpus)]
-        import pynvml
-        pynvml.nvmlInit()
-
-        gpu_rams_gb = []
-        gpu_rams_fixed_gb = []
-
-        for i in range(n_gpus):
-            # adapted torch.cuda.list_gpu_processes(gpu)
-            h = pynvml.nvmlDeviceGetHandleByIndex(i)
-            info = pynvml.nvmlDeviceGetMemoryInfo(h)
-            total = info.total
-            # print(f'free : {info.free}')
-            used = info.used
-            gpu_rams_gb.append(total / (1024. ** 3))
-            gpu_rams_fixed_gb.append(used / (1024. ** 3))
+        gpu_rams_gb, gpu_rams_fixed_gb = get_gpu_rams_gb()
+        # import pynvml
+        # pynvml.nvmlInit()
+        #
+        # gpu_rams_gb = []
+        # gpu_rams_fixed_gb = []
+        #
+        # for i in range(n_gpus):
+        #     # adapted torch.cuda.list_gpu_processes(gpu)
+        #     h = pynvml.nvmlDeviceGetHandleByIndex(i)
+        #     info = pynvml.nvmlDeviceGetMemoryInfo(h)
+        #     total = info.total
+        #     # print(f'free : {info.free}')
+        #     used = info.used
+        #     gpu_rams_gb.append(total / (1024. ** 3))
+        #     gpu_rams_fixed_gb.append(used / (1024. ** 3))
     else:
         gpu_rams_gb = []
         gpu_rams_fixed_gb = []
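
A minimal usage sketch of the new helper (illustrative only; it assumes at least one visible CUDA device and that `pytabkit` is installed):

import torch
from pytabkit.bench.scheduling.execution import get_gpu_rams_gb

# Touch the GPU so the caching allocator has reserved something to report.
dummy = torch.ones(1, device='cuda:0')

totals_gb, reserved_gb = get_gpu_rams_gb()             # per-visible-device totals and reserved memory (GB)
_, allocated_gb = get_gpu_rams_gb(use_reserved=False)  # live tensor bytes only
print(totals_gb, reserved_gb, allocated_gb)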

pytabkit/bench/scheduling/resources.py

Lines changed: 2 additions & 0 deletions
@@ -147,6 +147,8 @@ def try_assign(self, required_resources: RequiredResources,
         gpu_rams_gb_all = fr.get_gpu_rams_gb() + rr.gpu_ram_gb
         gpu_availability = np.logical_and(gpu_usages_all <= self.get_gpu_usages() + 1e-8,
                                           gpu_rams_gb_all <= self.get_gpu_rams_gb())
+        # print(f'{fr.get_gpu_rams_gb()=}, {rr.gpu_ram_gb=}')
+        # print(f'{gpu_usages_all=}, {gpu_rams_gb_all=}, {self.get_gpu_usages()=}, {self.get_gpu_rams_gb()=}, {gpu_availability=}')
         available_gpus = np.argwhere(gpu_availability)[:, 0]  # squeeze second dimension
         # sort available gpus by usage
         available_gpu_usages = self.get_gpu_usages()[available_gpus]
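
For reference, the availability mask in the context lines above combines a usage check (with a small tolerance) and a GPU-RAM check per visible device. A self-contained numpy sketch with made-up numbers, using local arrays in place of `fr`, `rr`, and the `self.get_*` accessors:

import numpy as np

gpu_usages_all = np.array([0.999, 1.2])   # already-assigned usage plus the new request, per GPU
usage_limits = np.array([1.0, 1.0])       # stands in for self.get_gpu_usages()
gpu_rams_gb_all = np.array([9.5, 12.0])   # already-assigned GPU RAM plus the new request (GB)
ram_limits_gb = np.array([11.0, 11.0])    # stands in for self.get_gpu_rams_gb()

gpu_availability = np.logical_and(gpu_usages_all <= usage_limits + 1e-8,
                                  gpu_rams_gb_all <= ram_limits_gb)
available_gpus = np.argwhere(gpu_availability)[:, 0]  # squeeze second dimension -> array([0])
print(available_gpus)  # only GPU 0 has both enough usage headroom and enough memory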

pytabkit/bench/scheduling/schedulers.py

Lines changed: 4 additions & 0 deletions
@@ -318,7 +318,11 @@ def _submit_more_jobs(self) -> None:
 
         # otherwise, try assigning the job
         for node_idx, r in enumerate(free_resources.resources):
+            # print(f'{fixed_resources.__dict__=}')
+            # print(f'{job_info.required_resources.__dict__=}')
+            # print(f'{r.data=}, {r.get_resource_vector()=}, {node_idx=}')
             assigned_resources = r.try_assign(job_info.required_resources, fixed_resources)
+            # print(f'{bool(assigned_resources)=}')
             if assigned_resources is not None:
                 job_info.set_started(assigned_resources)
                 self.job_manager.submit_job(job_info)

pytabkit/models/alg_interfaces/other_interfaces.py

Lines changed: 3 additions & 3 deletions
@@ -1250,7 +1250,7 @@ def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str
         # print(f'{gpu_devices=}')
         if self.n_classes > 0:
             if self.config.get('use_tabiclex', False):
-                from tabiclex import TabICLClassifier
+                from tabiclv2 import TabICLClassifier
             else:
                 from tabicl import TabICLClassifier
             return TabICLClassifier(random_state=seed,
@@ -1304,9 +1304,9 @@ def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_spl
         assert n_splits == 1
         updated_config = utils.join_dicts(dict(n_estimators=100), self.config)
         time_params = {'': 0.5, 'ds_size_gb': 10.0, '1/n_threads*n_samples*n_estimators*n_tree_repeats': 4e-8}
-        ram_params = {'': 0.5, 'ds_size_gb': 3.0, 'n_samples*n_estimators*n_tree_repeats': 3e-9}
+        ram_params = {'': 0.5}
         rc = ResourcePredictor(config=updated_config, time_params=time_params,
-                               cpu_ram_params=ram_params, n_gpus=1, gpu_usage=1.0, gpu_ram_params={'': 10.0})
+                               cpu_ram_params=ram_params, n_gpus=1, gpu_usage=0.999, gpu_ram_params={'': 10.0})
         return rc.get_required_resources(ds)
 
 
pytabkit/models/training/lightning_modules.py

Lines changed: 1 addition & 1 deletion
@@ -341,5 +341,5 @@ def on_predict_model_eval(self) -> None: # redundant with on_predict_start()
 
     def to(self, *args: Any, **kwargs: Any) -> 'TabNNModule':
         super().to(*args, **kwargs)
-        print(f'moving static model to {args} {kwargs}')
+        # print(f'moving static model to {args} {kwargs}')
         self.creator.static_model.to(*args, **kwargs)
