Fix RDMA memory leak by properly deregistering buffers #64
@@ -59,6 +59,10 @@ async def read_into(self, tensor: Optional[torch.Tensor]) -> torch.Tensor:
     async def write_from(self, tensor: Optional[torch.Tensor]) -> None:
         raise NotImplementedError()
 
+    async def drop(self) -> None:
+        """Clean up any resources held by this buffer. Override in subclasses if needed."""
+        pass
+
 
 class RDMATransportBuffer(TransportBuffer):
     # TODO: when we try this with rdma, I should be able to write rdma directly to the tensor
@@ -72,6 +76,31 @@ def __init__(self) -> None:
         self.shape: Optional[torch.Size] = None
         self.dtype: Optional[torch.dtype] = None
 
+    async def drop(self) -> None:
+        """Explicitly clean up RDMA buffers to prevent a kernel memory leak.
+
+        When RDMA buffers are created, they register memory regions with the RDMA
+        hardware, which pins pages in kernel memory. Without explicit cleanup, these
+        pages remain pinned even after the Python objects are garbage collected,
+        leading to a memory leak that manifests as unbounded Inactive(anon) growth.
+        """
+        if self.rdma_buffers is not None:
+            for rdma_buf in self.rdma_buffers:
+                try:
+                    # Drop the RDMA buffer to deregister the memory region
+                    await rdma_buf.drop()
+                except Exception as e:
+                    # Log but don't raise - cleanup should be best-effort
+                    logging.warning(f"Failed to drop RDMA buffer during cleanup: {e}")
+            self.rdma_buffers = None
+            self.tensor_refs = None
+
+    def __del__(self) -> None:
+        """Intentionally a no-op; cleanup must happen via an explicit drop()."""
+        # Note: Not calling drop() here to avoid issues with destructor timing
+        # and to make cleanup explicit only where we control the lifecycle
+        pass
+
 
     def __getstate__(self) -> Dict[str, Any]:
         # Any time that we serialize the transport buffer, the idea is
         # that tensors will be transported via tensor_enginer.RDMABuffer, so it makes
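For callers, the fix implies an explicit lifecycle: whoever creates the buffer is responsible for awaiting `drop()` once the transfer is done. A minimal sketch of that contract, assuming a caller that owns an `RDMATransportBuffer` and a working `write_from` implementation (`send_tensor` and the transport hand-off are stand-ins, not this repo's API):

```python
import torch

async def send_tensor(tensor: torch.Tensor) -> None:
    buf = RDMATransportBuffer()
    try:
        await buf.write_from(tensor)  # registers RDMA memory regions (pins kernel pages)
        ...  # hand the serialized buffer to the remote side here
    finally:
        # Explicit deregistration: garbage collection alone will not
        # unpin the kernel pages backing the RDMA memory regions.
        await buf.drop()
```

The leak symptom named in the docstring can be watched directly from `/proc/meminfo` when verifying the fix; a hypothetical Linux-only helper:

```python
def inactive_anon_kb() -> int:
    """Return the current Inactive(anon) figure from /proc/meminfo, in kB."""
    with open("/proc/meminfo") as f:
        for line in f:
            if line.startswith("Inactive(anon):"):
                return int(line.split()[1])  # second field is the size in kB
    raise RuntimeError("Inactive(anon) not found in /proc/meminfo")
```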
Lucas: maybe remove `__del__` altogether if it's a no-op.
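The no-op is deliberate in one sense: `drop()` is a coroutine, and a synchronous `__del__` has no reliable way to run one. A sketch of the hazard, using a hypothetical `Resource` class rather than the repo's code:

```python
import asyncio

class Resource:
    async def drop(self) -> None:
        ...  # deregister memory regions

    def __del__(self) -> None:
        # A destructor cannot await. Scheduling the coroutine is the best it
        # could do, and even that fails when no event loop is running (e.g.
        # during interpreter shutdown), so the cleanup is silently lost.
        try:
            asyncio.get_running_loop().create_task(self.drop())
        except RuntimeError:
            pass  # no running event loop: nothing we can safely do here
```

Given that there is no safe async path from a destructor, removing `__del__` entirely, as suggested, loses nothing.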