[release/2.7][SWDEV-544125] update test buffer fudge factor for hipblaslt for test_fully_shard_training_memory test (#2493)

akashveramd · ethanwee1 · web-flow · commit b0c5b245fa47 · 2025-08-15T14:58:06.000-04:00
This PR cherry-picks upstream commit 78300c8, which fixes the test_fully_shard_training_memory test in test/distributed/_composable/fsdp/test_fully_shard_memory.py by increasing the test's memory buffer allowance when hipblaslt is in use. The test was reported as failing in Jira: https://ontrack-internal.amd.com/browse/SWDEV-544125 Co-authored-by: Ethan Wee <Ethan.Wee@amd.com>
diff --git a/test/distributed/_composable/fsdp/test_fully_shard_memory.py b/test/distributed/_composable/fsdp/test_fully_shard_memory.py
@@ -117,6 +117,9 @@ def _test_fully_shard_training_memory(
         # number is kept much smaller than the actual memory usage, which is on
         # the order of 100-200+ MB)
         buffer_mb = 16
+        # The default workspace for hipblaslt is larger than for cublas/cublaslt
+        # which requires a slight increase to this buffer value.
+        buffer_mb = 16 if torch.version.cuda else 18
         if reshard_after_forward:
             # 3x max unsharded block parameters (current all-gather + copy-out
             # and next all-gather), non-block parameters, and other