Skip to content

Commit c7ff835

Browse files
Author: pytorchbot
Commit message: "2026-03-21 nightly release (0e17236)"
Commit: c7ff835 (1 parent: 6438e23)

File tree

8 files changed

+50
-8
lines changed

8 files changed

+50
-8
lines changed

.github/scripts/nova_dir.bash

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,15 @@
99
MSLK_DIR="/__w/MSLK/MSLK"
1010
export MSLK_REPO="${MSLK_DIR}/${REPOSITORY}"
1111

12-
export BUILD_FROM_NOVA=0
12+
################################################################################
13+
# Because we have a custom setup.py with extra flags, we have to do clean /
14+
# build_wheel during the pre-script stage, since we have no control over the
15+
# invocation of setup.py in the Nova build stage.
16+
#
17+
# As such, set the flag here so that setup.py will skip these steps in Nova
18+
# workflow in the build stage.
19+
################################################################################
20+
export BUILD_FROM_NOVA=1
1321

1422
# Disable HIP FMHA build in the manywheel CI (the runner is too small)
1523
export MSLK_BUILD_HIP_FMHA=0

.github/scripts/nova_prescript.bash

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,14 @@ if [[ ${CHANNEL} == "" ]]; then
136136
export CHANNEL="nightly"
137137
fi
138138

139+
################################################################################
140+
# Build the wheel
141+
#
142+
# The build is performed in the pre-script stage of the build workflow since we
143+
# have no control over the invocation of setup.py in the actual build stage.
144+
################################################################################
145+
146+
build_mslk_package "${BUILD_ENV_NAME}" "${CHANNEL}" "${mslk_build_target}/${mslk_build_variant}"
139147
end_time=$(date +%s)
140148
runtime=$((end_time-start_time))
141-
start_time=${end_time}
142-
echo "[NOVA] Time taken to prepare to build the package: ${runtime} seconds / $(display_time ${runtime})"
149+
echo "[NOVA] Time taken to build the package: ${runtime} seconds / $(display_time ${runtime})"

mslk/attention/fmha/utils/op_common.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
import torch
1111

12+
from . import cpp_lib as _cpp_lib # noqa: F401 -- loads _C_hip native extension
13+
1214

1315
def get_operator(library: str, name: str):
1416
def no_such_operator(*args, **kwargs):

mslk/quantize/triton/fp4_quantize.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5370,6 +5370,9 @@ def triton_quantize_nvfp4(
53705370
# Pass a dummy pointer; the kernel won't load from it.
53715371
global_scale = x.new_empty(())
53725372

5373+
# Use int64 indexing when pointer offsets can exceed INT32_MAX
5374+
use_int64_indexing = M * N > 2**31 - 1
5375+
53735376
triton_quantize_nvfp4_kernel[grid](
53745377
x,
53755378
global_scale,
@@ -5389,6 +5392,8 @@ def triton_quantize_nvfp4(
53895392
USE_PRECISE_MATH=use_precise_math,
53905393
# pyre-ignore[6]
53915394
USE_GLOBAL_SCALE=use_global_scale,
5395+
# pyre-ignore[6]
5396+
USE_INT64_INDEXING=use_int64_indexing,
53925397
)
53935398

53945399
# reshape back to original shape
@@ -5413,6 +5418,7 @@ def triton_quantize_nvfp4_kernel(
54135418
USE_E8M0_SCALE: tl.constexpr,
54145419
USE_PRECISE_MATH: tl.constexpr,
54155420
USE_GLOBAL_SCALE: tl.constexpr,
5421+
USE_INT64_INDEXING: tl.constexpr,
54165422
):
54175423
E4M3_EPS = 1.5258789e-05
54185424
FP8_E4M3_MAX = 448.0
@@ -5444,6 +5450,10 @@ def triton_quantize_nvfp4_kernel(
54445450

54455451
offs_m = pid_m * M_PER_BLOCK + tl.arange(0, M_PER_BLOCK)[:, None]
54465452
offs_n = pid_n * 64 + tl.arange(0, 64)[None, :]
5453+
if USE_INT64_INDEXING:
5454+
offs_m = offs_m.to(tl.int64)
5455+
offs_n = offs_n.to(tl.int64)
5456+
54475457
if USE_MASK:
54485458
mask = (offs_m < M) & (offs_n < N)
54495459
other = 0.0
@@ -5456,9 +5466,8 @@ def triton_quantize_nvfp4_kernel(
54565466
else:
54575467
global_scale = 1.0
54585468

5459-
x = tl.load(
5460-
x_ptr + offs_m * stride_xm + offs_n * stride_xn, mask=mask, other=other
5461-
) # [M_PER_BLOCK, 64]
5469+
load_offsets = offs_m * stride_xm + offs_n * stride_xn
5470+
x = tl.load(x_ptr + load_offsets, mask=mask, other=other) # [M_PER_BLOCK, 64]
54625471
x_blocks = x.to(tl.float32).reshape(M_PER_BLOCK, 4, 16) # [M_PER_BLOCK, 4, 16]
54635472

54645473
# Block-wise max
@@ -5519,7 +5528,13 @@ def triton_quantize_nvfp4_kernel(
55195528
mask = (offs_m < M) & (offs_n < N // 2)
55205529
else:
55215530
mask = None
5522-
tl.store(q_ptr + offs_m * (N // 2) + offs_n, x_fp4x2, mask=mask)
5531+
5532+
if USE_INT64_INDEXING:
5533+
offs_m = offs_m.to(tl.int64)
5534+
offs_n = offs_n.to(tl.int64)
5535+
5536+
store_offsets = offs_m * (N // 2) + offs_n
5537+
tl.store(q_ptr + store_offsets, x_fp4x2, mask=mask)
55235538

55245539

55255540
@triton.jit

test/attention/fmha/test_fmha_merge_attentions.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from .utils import (
2424
assert_allclose,
25+
cuda_only,
2526
disable_on_rocm,
2627
sm80_or_better_only,
2728
UNSUPPORTED_OP_PASSES,
@@ -479,6 +480,7 @@ def test_merge_attentions_sharedinput(
479480
)
480481

481482

483+
@cuda_only
482484
@sm80_or_better_only
483485
@pytest.mark.parametrize("bmghk", (False, True))
484486
def test_merge_attentions_against_ref(bmghk: bool):
@@ -685,6 +687,7 @@ def test_merge_training_zilch():
685687

686688

687689
@sm80_or_better_only
690+
@cuda_only
688691
def test_merge_training_undilate():
689692
torch.manual_seed(1)
690693

test/attention/fmha/test_fmha_split_blocks_fairinternal.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ def test_split_blocks_for_decoding():
4646
assert (chunked_bias.k_seqinfo.seqstart >= attn_bias.k_seqinfo.seqstart).all()
4747

4848

49+
@cuda_only
4950
def test_split_blocks_for_decoding_with_paged():
5051
torch.manual_seed(0)
5152
max_len_kv = 2048

test/attention/fmha/test_mem_eff_attention.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def test_dropout_ck(q_len, kv_len, batch_size, k_len, p, seed, attn_bias):
313313
def test_dropout_backward_ck(q_len, kv_len, batch_size, k, p):
314314
op = fmha.ck.FwOp
315315
dtype = torch.float16
316-
if not op.is_available():
316+
if not fmha.ck.BwOp.is_available():
317317
if UNSUPPORTED_OP_PASSES:
318318
return
319319
pytest.skip()
@@ -614,6 +614,7 @@ def test_unsupported_stride_alignment(op: Type[fmha.AttentionFwOpBase]):
614614

615615

616616
@sm75_or_better_only
617+
@cuda_only
617618
def test_unsupported_dropout_combine_flash_cutlass() -> None:
618619
q = torch.empty(
619620
[1, 4, 1, 16], device="cuda", dtype=torch.float16, requires_grad=True
@@ -1893,6 +1894,10 @@ def test_memeff_compile(bias_t, create_bias_inside_compiled: bool, op) -> None:
18931894
if UNSUPPORTED_OP_PASSES:
18941895
return
18951896
pytest.skip("Op is not available")
1897+
if (not not torch.version.hip) and not fmha.ck.BwOp.is_available():
1898+
if UNSUPPORTED_OP_PASSES:
1899+
return
1900+
pytest.skip("Op is not available")
18961901
torch._dynamo.reset_code_caches() # avoids hitting recompilation limit
18971902
B, M, H, K = 1, 256, 2, 64
18981903
q, k, v, bias = create_tensors(

test/quantize/triton/fp4_quantize_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ def test_fake_quantize_nvfp4_per_tensor(
284284
(4000, 4096), # large matrix with m padding
285285
(4096, 4080), # large square matrix with n padding
286286
(4000, 4080), # large square matrix with m and n padding
287+
(147456, 15360), # > int32 addressing
287288
],
288289
)
289290
@pytest.mark.parametrize("use_global_scale", [True, False])

0 commit comments

Comments (0)