
Commit 2eade97: Disable TMA by default (#607)
1 parent: 0b8be89

2 files changed: 8 additions, 8 deletions


ENVs.md

Lines changed: 7 additions & 7 deletions

@@ -1,9 +1,9 @@
 # FLA Environment Variables
 
-| Variable | Default | Options | Description |
-| --- | --- | --- | --- |
-| `FLA_NO_USE_TMA` | `0` | `0` or `1` | Set to `1` to disable Tensor Memory Accelerator (TMA) on Hopper or Blackwell GPUs. |
-| `FLA_CONV_BACKEND` | `cuda` | `triton` or `cuda` | Choose the convolution backend. `cuda` is the default and preferred for most cases. |
-| `FLA_USE_FAST_OPS` | `0` | `0` or `1` | Enable faster, but potentially less accurate, operations when set to `1`. |
-| `FLA_CACHE_RESULTS` | `1` | `0` or `1` | Whether to cache autotune timings to disk. Defaults to `1` (enabled). |
-| `FLA_TRIL_PRECISION` | `ieee` | `ieee`, `tf32`, `tf32x3` | Controls the precision for triangular operations. `tf32x3` is only available on NV GPUs. |
+| Variable             | Default | Options                  | Description                                                                                |
+| -------------------- | ------- | ------------------------ | ------------------------------------------------------------------------------------------ |
+| `FLA_CONV_BACKEND`   | `cuda`  | `triton` or `cuda`       | Choose the convolution backend. `cuda` is the default and preferred for most cases.         |
+| `FLA_USE_TMA`        | `0`     | `0` or `1`               | Set to `1` to enable Tensor Memory Accelerator (TMA) on Hopper or Blackwell GPUs.           |
+| `FLA_USE_FAST_OPS`   | `0`     | `0` or `1`               | Enable faster, but potentially less accurate, operations when set to `1`.                   |
+| `FLA_CACHE_RESULTS`  | `1`     | `0` or `1`               | Whether to cache autotune timings to disk. Defaults to `1` (enabled).                       |
+| `FLA_TRIL_PRECISION` | `ieee`  | `ieee`, `tf32`, `tf32x3` | Controls the precision for triangular operations. `tf32x3` is only available on NV GPUs.    |
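The boolean flags in the table above all follow the same convention: the variable holds the string `'0'` or `'1'`, and only `'1'` turns the feature on. A minimal sketch of reading them (`env_flag` is a hypothetical helper name, not part of FLA's API):

```python
import os

def env_flag(name: str, default: str = '0') -> bool:
    """Read a 0/1-style environment variable as a boolean."""
    return os.environ.get(name, default) == '1'

# Mirrors the documented defaults: TMA is opt-in, result caching is opt-out.
use_tma = env_flag('FLA_USE_TMA')                    # default '0': disabled
cache_results = env_flag('FLA_CACHE_RESULTS', '1')   # default '1': enabled
conv_backend = os.environ.get('FLA_CONV_BACKEND', 'cuda')
```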

fla/utils.py

Lines changed: 1 addition & 1 deletion

@@ -399,7 +399,7 @@ def map_triton_backend_to_torch_device() -> str:
 is_tf32_supported = (is_nvidia and torch.cuda.get_device_capability(0)[0] >= 8)
 is_gather_supported = hasattr(triton.language, 'gather')
 is_tma_supported = (is_nvidia and torch.cuda.get_device_capability(0)[0] >= 9) \
-    and os.environ.get('FLA_NO_USE_TMA', '0') != '1' and \
+    and os.environ.get('FLA_USE_TMA', '0') == '1' and \
     (hasattr(triton.language, '_experimental_make_tensor_descriptor') or hasattr(triton.language, 'make_tensor_descriptor'))
 
 if is_nvidia and not is_tf32_supported:
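The gate above combines three conditions: an NVIDIA GPU of compute capability 9 or higher (Hopper/Blackwell), an explicit `FLA_USE_TMA=1` opt-in, and Triton tensor-descriptor support. A self-contained sketch of the same logic with the torch/triton probes replaced by plain parameters (`tma_supported` is a hypothetical name for illustration):

```python
import os

def tma_supported(is_nvidia: bool, compute_capability: int,
                  has_tensor_descriptor: bool) -> bool:
    """Sketch of the post-commit gating in fla/utils.py: TMA is used
    only on NVIDIA compute capability >= 9, when FLA_USE_TMA=1 is set,
    and when Triton exposes a tensor-descriptor API."""
    return (is_nvidia
            and compute_capability >= 9
            and os.environ.get('FLA_USE_TMA', '0') == '1'
            and has_tensor_descriptor)
```

With the environment variable unset, the function is false even on a Hopper-class GPU, which is what "disable TMA by default" means here.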
