From 199ece64c4699583401fdb879b714e652f2bea6f Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 14 Jun 2022 11:45:56 -0700 Subject: [PATCH 1/3] FEAT-#4574: Warn when pre-initialized Ray cluster is not using all available memory Signed-off-by: Rehan Durrani --- modin/core/execution/ray/common/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/modin/core/execution/ray/common/utils.py b/modin/core/execution/ray/common/utils.py index b08b30c7c9b..dde04e30cbe 100644 --- a/modin/core/execution/ray/common/utils.py +++ b/modin/core/execution/ray/common/utils.py @@ -219,6 +219,17 @@ def initialize_ray( if not GPU_MANAGERS: for i in range(GpuCount.get()): GPU_MANAGERS.append(GPUManager.remote(i)) + else: + ray_obj_store_mem = ray.available_resources()["object_store_memory"] + virtual_memory = psutil.virtual_memory().total + if (ray_obj_store_mem // 1e9) < (0.6 * virtual_memory) // 1e9: + warnings.warn( + "Modin has detected that it is running on a pre-initialized Ray cluster. " + + f"This cluster has currently allocated {ray_obj_store_mem // 1e9} GB for its " + + f"object store, but the device has {virtual_memory // 1e9} GB of RAM available. " + + "Modin recommends initializing Ray with at least 60% of available RAM to prevent " + + "Out Of Memory errors." 
+ ) _move_stdlib_ahead_of_site_packages() ray.worker.global_worker.run_function_on_all_workers( _move_stdlib_ahead_of_site_packages From 487f563321c2a429199e47909c3887d9f0467ca5 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 14 Jun 2022 11:47:37 -0700 Subject: [PATCH 2/3] update release notes Signed-off-by: Rehan Durrani --- docs/release_notes/release_notes-0.16.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/release_notes/release_notes-0.16.0.rst b/docs/release_notes/release_notes-0.16.0.rst index 1d1adaf0105..6ce3b491450 100644 --- a/docs/release_notes/release_notes-0.16.0.rst +++ b/docs/release_notes/release_notes-0.16.0.rst @@ -20,7 +20,7 @@ Key Features and Updates * XGBoost enhancements * * Developer API enhancements - * + * FEAT-#4574: Warn users when pre-initialized Ray cluster is not using all available memory (#4575) * Update testing suite * TEST-#4508: Reduce test_partition_api pytest threads to deflake it (#4551) * TEST-#4550: Use much less data in test_partition_api (#4554) @@ -34,3 +34,4 @@ Contributors ------------ @mvashishtha @prutskov +@RehanSD From 3286cccce8ca5bfe0a05d801ecea8ef4a250b52d Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 14 Jun 2022 12:11:11 -0700 Subject: [PATCH 3/3] Update sys mem calc Signed-off-by: Rehan Durrani --- modin/core/execution/ray/common/utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/modin/core/execution/ray/common/utils.py b/modin/core/execution/ray/common/utils.py index dde04e30cbe..c93b86361a2 100644 --- a/modin/core/execution/ray/common/utils.py +++ b/modin/core/execution/ray/common/utils.py @@ -221,12 +221,19 @@ def initialize_ray( GPU_MANAGERS.append(GPUManager.remote(i)) else: ray_obj_store_mem = ray.available_resources()["object_store_memory"] - virtual_memory = psutil.virtual_memory().total - if (ray_obj_store_mem // 1e9) < (0.6 * virtual_memory) // 1e9: + system_memory = psutil.virtual_memory().total + if 
sys.platform.startswith("linux"): + shm_fd = os.open("/dev/shm", os.O_RDONLY) + try: + shm_stats = os.fstatvfs(shm_fd) + system_memory = shm_stats.f_bsize * shm_stats.f_bavail + finally: + os.close(shm_fd) + if (ray_obj_store_mem // 1e9) < (0.6 * system_memory) // 1e9: warnings.warn( "Modin has detected that it is running on a pre-initialized Ray cluster. " + f"This cluster has currently allocated {ray_obj_store_mem // 1e9} GB for its " - + f"object store, but the device has {virtual_memory // 1e9} GB of RAM available. " + + f"object store, but the device has {system_memory // 1e9} GB of RAM available. " + "Modin recommends initializing Ray with at least 60% of available RAM to prevent " + "Out Of Memory errors." )