mlc-ai
diff --git a/3rdparty/tvm b/3rdparty/tvm
diff --git a/python/mlc_llm/compiler_pass/attach_logit_processor.py b/python/mlc_llm/compiler_pass/attach_logit_processor.py
Lines changed: 5 additions & 5 deletions
diff --git a/python/mlc_llm/compiler_pass/attach_sampler.py b/python/mlc_llm/compiler_pass/attach_sampler.py
Lines changed: 2 additions & 2 deletions
diff --git a/python/mlc_llm/compiler_pass/attach_softmax_with_temperature.py b/python/mlc_llm/compiler_pass/attach_softmax_with_temperature.py
Lines changed: 8 additions & 8 deletions
diff --git a/python/mlc_llm/compiler_pass/attach_spec_decode_aux_funcs.py b/python/mlc_llm/compiler_pass/attach_spec_decode_aux_funcs.py
Lines changed: 2 additions & 2 deletions
diff --git a/python/mlc_llm/compiler_pass/fuse_add_norm.py b/python/mlc_llm/compiler_pass/fuse_add_norm.py
Lines changed: 12 additions & 12 deletions
diff --git a/python/mlc_llm/compiler_pass/fuse_dequantize_transpose.py b/python/mlc_llm/compiler_pass/fuse_dequantize_transpose.py
Lines changed: 3 additions & 3 deletions
diff --git a/python/mlc_llm/compiler_pass/lift_global_buffer_alloc.py b/python/mlc_llm/compiler_pass/lift_global_buffer_alloc.py
Lines changed: 3 additions & 3 deletions
diff --git a/python/mlc_llm/compiler_pass/low_batch_specialization.py b/python/mlc_llm/compiler_pass/low_batch_specialization.py
Lines changed: 2 additions & 2 deletions
diff --git a/python/mlc_llm/model/phi3v/phi3v_image.py b/python/mlc_llm/model/phi3v/phi3v_image.py
Lines changed: 3 additions & 3 deletions
@@ -101,7 +101,7 @@ def _apply_logit_bias_inplace(
 
         for p0 in T.thread_binding(0, (num_token + tx - 1) // tx, "blockIdx.x"):
             for p1 in T.thread_binding(0, tx, "threadIdx.x"):
-                with T.block("block"):
+                with T.sblock("block"):
                     vp = T.axis.spatial(num_token, p0 * tx + p1)
                     T.where(p0 * tx + p1 < num_token)
                     logits[pos2seq_id[vp], token_ids[vp]] += logit_bias[vp]
@@ -139,7 +139,7 @@ def _apply_penalty_inplace(  # pylint: disable=too-many-arguments,too-many-local
         penalties = T.match_buffer(var_penalties, (num_seq, 3), "float32")
 
         for token in T.serial(num_token):
-            with T.block("block"):
+            with T.sblock("block"):
                 vp = T.axis.spatial(num_token, token)
                 logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] -= (
                     penalties[pos2seq_id[vp], 0] + token_cnt[vp] * penalties[pos2seq_id[vp], 1]
@@ -189,7 +189,7 @@ def _apply_penalty_inplace(  # pylint: disable=too-many-arguments,too-many-local
 
         for p0 in T.thread_binding(0, (num_token + tx - 1) // tx, "blockIdx.x"):
             for p1 in T.thread_binding(0, tx, "threadIdx.x"):
-                with T.block("block"):
+                with T.sblock("block"):
                     vp = T.axis.spatial(num_token, p0 * tx + p1)
                     T.where(p0 * tx + p1 < num_token)
                     # Penalties: (presence_penalty, frequency_penalty, repetition_penalty)
@@ -230,7 +230,7 @@ def _apply_bitmask_inplace(
         bitmask = T.match_buffer(var_bitmask, (batch_size, (vocab_size + 31) // 32), "int32")
 
         for token in T.serial(num_seq * vocab_size):
-            with T.block("block"):
+            with T.sblock("block"):
                 vs = T.axis.spatial(num_seq, (token) // vocab_size)
                 vv = T.axis.spatial(vocab_size, (token) % vocab_size)
 
@@ -272,7 +272,7 @@ def _apply_bitmask_inplace(
 
         for fused_s_v_0 in T.thread_binding(0, (num_seq * vocab_size + tx - 1) // tx, "blockIdx.x"):
             for fused_s_v_1 in T.thread_binding(0, tx, "threadIdx.x"):
-                with T.block("block"):
+                with T.sblock("block"):
                     vs = T.axis.spatial(num_seq, (fused_s_v_0 * tx + fused_s_v_1) // vocab_size)
                     vv = T.axis.spatial(vocab_size, (fused_s_v_0 * tx + fused_s_v_1) % vocab_size)
                     T.where(fused_s_v_0 * tx + fused_s_v_1 < num_seq * vocab_size)
 
@@ -144,7 +144,7 @@ def full(var_result: T.handle, value: T.int32):
     batch_size = T.int32(is_size_var=True)
     result = T.match_buffer(var_result, (batch_size, 1), "int32")
     for i in T.serial(batch_size):
-        with T.block("block"):
+        with T.sblock("block"):
             vi = T.axis.spatial(batch_size, i)
             result[vi, 0] = value
 
@@ -305,7 +305,7 @@ def sampler_take_probs_tir(  # pylint: disable=too-many-locals,too-many-argument
         top_prob_probs = T.match_buffer(var_top_prob_probs, (num_positions,), "float32")
         top_prob_indices = T.match_buffer(var_top_prob_indices, (num_positions,), "int32")
         for i in T.serial(num_positions + num_samples):
-            with T.block("block"):
+            with T.sblock("block"):
                 vi = T.axis.spatial(num_positions + num_samples, i)
                 if vi < num_positions:
                     row = T.floordiv(top_prob_offsets[vi], vocab_size)
 
@@ -131,7 +131,7 @@ def chunk_lse(  # pylint: disable=too-many-locals
         temp_sum = T.alloc_buffer((batch_size, num_chunks), dtype="float32")
 
         for l0, l1, l2 in T.grid(batch_size, num_chunks, T.int64(chunk_size)):
-            with T.block("pad"):
+            with T.sblock("pad"):
                 v0, v1, v2 = T.axis.remap("SSS", [l0, l1, l2])
                 A_pad[v0, v1, v2] = T.Select(
                     v1 * T.int64(chunk_size) + v2
@@ -144,13 +144,13 @@ def chunk_lse(  # pylint: disable=too-many-locals
                     T.min_value("float32"),
                 )
         for l0, l1, l2 in T.grid(batch_size, num_chunks, T.int64(chunk_size)):
-            with T.block("max"):
+            with T.sblock("max"):
                 v0, v1, v2 = T.axis.remap("SSR", [l0, l1, l2])
                 with T.init():
                     temp_max[v0, v1] = T.min_value("float32")
                 temp_max[v0, v1] = T.max(temp_max[v0, v1], A_pad[v0, v1, v2])
         for l0, l1, l2 in T.grid(batch_size, num_chunks, T.int64(chunk_size)):
-            with T.block("sum_exp"):
+            with T.sblock("sum_exp"):
                 v0, v1, v2 = T.axis.remap("SSR", [l0, l1, l2])
                 with T.init():
                     temp_sum[v0, v1] = T.float32(0)
@@ -165,7 +165,7 @@ def chunk_lse(  # pylint: disable=too-many-locals
                     T.float32(0),
                 )
         for l0, l1, l2 in T.grid(batch_size, num_chunks, T.int64(1)):
-            with T.block("log"):
+            with T.sblock("log"):
                 v0, v1, v2 = T.axis.remap("SSS", [l0, l1, l2])
                 chunked_sum[v0, v1] = T.Select(
                     temperature[v0] > T.float32(1e-5),
@@ -194,13 +194,13 @@ def softmax_with_chunked_sum(
         temp_max = T.alloc_buffer((batch_size,), dtype="float32")
         temp_sum = T.alloc_buffer((batch_size,), dtype="float32")
         for l0, l1 in T.grid(batch_size, num_chunks):
-            with T.block("max"):
+            with T.sblock("max"):
                 v0, v1 = T.axis.remap("SR", [l0, l1])
                 with T.init():
                     temp_max[v0] = T.min_value("float32")
                 temp_max[v0] = T.max(temp_max[v0], chunked_max[v0, v1])
         for l0, l1 in T.grid(batch_size, num_chunks):
-            with T.block("sum_exp"):
+            with T.sblock("sum_exp"):
                 v0, v1 = T.axis.remap("SR", [l0, l1])
                 with T.init():
                     temp_sum[v0] = T.float32(0)
@@ -210,7 +210,7 @@ def softmax_with_chunked_sum(
                     T.cast(chunked_max[v0, v1] == temp_max[v0], "float32") * chunked_sum[v0, v1],
                 )
         for l0, l1, l2 in T.grid(batch_size, num_chunks, T.int64(chunk_size)):
-            with T.block("log_pad"):
+            with T.sblock("log_pad"):
                 v0, v1, v2 = T.axis.remap("SSS", [l0, l1, l2])
                 if v1 * T.int64(chunk_size) + v2 < vocab_size:
                     softmax[v0, v1 * T.int64(chunk_size) + v2] = T.Select(
@@ -248,7 +248,7 @@ def apply_gpu_schedule(target, sch):
         sch.annotate(unroll, ann_key="pragma_unroll_explicit", ann_val=1)
 
         for block_name in ["sum_exp", "max"]:
-            block = sch.get_block(block_name)
+            block = sch.get_sblock(block_name)
             sch.set_scope(block, buffer_index=0, storage_scope="shared")
             sch.compute_at(block, bx)
             r_loop = sch.get_loops(block)[-1]
 
@@ -46,7 +46,7 @@ def _scatter_2d(var_src: T.handle, var_indices: T.handle, var_dst: T.handle):
         indices = T.match_buffer(var_indices, (batch_size,), "int32")
         dst = T.match_buffer(var_dst, (m, n), dtype)
         for b, j in T.grid(batch_size, n):
-            with T.block("scatter_2d"):
+            with T.sblock("scatter_2d"):
                 vb, vj = T.axis.remap("SS", [b, j])
                 dst[indices[vb], vj] = src[vb, vj]
 
@@ -64,7 +64,7 @@ def _gather_2d(var_src: T.handle, var_indices: T.handle, var_dst: T.handle):
         indices = T.match_buffer(var_indices, (batch_size,), "int32")
         dst = T.match_buffer(var_dst, (batch_size, n), dtype)
         for b, j in T.grid(batch_size, n):
-            with T.block("gather_2d"):
+            with T.sblock("gather_2d"):
                 vb, vj = T.axis.remap("SS", [b, j])
                 dst[vb, vj] = src[indices[vb], vj]
 
 
@@ -41,25 +41,25 @@ def decode_add_rms(  # pylint: disable=too-many-locals
                 annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1},
             ):
                 for i in range(add_local_size):
-                    with T.block("T_add"):
+                    with T.sblock("T_add"):
                         bx = T.axis.spatial(batch_size, v_bx)
                         h = T.axis.spatial(hidden_size, i * TX + v_tx)
                         add_local[h // TX] = A[bx, 0, h] + B[bx, 0, h]
-                    with T.block("T_write_back"):
+                    with T.sblock("T_write_back"):
                         bx = T.axis.spatial(batch_size, v_bx)
                         v_ax1 = T.axis.spatial(1, 0)
                         h = T.axis.spatial(hidden_size, i * TX + v_tx)
                         add[bx, v_ax1, h] = add_local[h // TX]
-                with T.block("T_multiply_red_rf_init"):
+                with T.sblock("T_multiply_red_rf_init"):
                     tx, bx = T.axis.remap("SS", [v_tx, v_bx])
                     sum_local[tx, bx, 0] = T.float32(0)
                 for v_i, _j in T.grid(add_local_size, 1):
-                    with T.block("T_multiply_red_rf_update"):
+                    with T.sblock("T_multiply_red_rf_update"):
                         tx, bx, i = T.axis.remap("SSR", [v_tx, v_bx, v_i])
                         sum_local[tx, bx, 0] += T.float32(add_local[i]) * T.float32(add_local[i])
             for _j in range(1):
                 for v_tx_2 in T.thread_binding(TX, thread="threadIdx.x"):
-                    with T.block("T_multiply_red"):
+                    with T.sblock("T_multiply_red"):
                         tx, bx = T.axis.remap("RS", [v_tx_2, v_bx])
                         T.reads(sum_local[tx, bx, 0])
                         T.writes(sum_shared[bx, 0])
@@ -68,7 +68,7 @@ def decode_add_rms(  # pylint: disable=too-many-locals
                         sum_shared[bx, 0] += sum_local[tx, bx, 0]
             for i in range(add_local_size):
                 for v_tx_2 in T.thread_binding(TX, thread="threadIdx.x"):
-                    with T.block("T_cast_2"):
+                    with T.sblock("T_cast_2"):
                         bx = T.axis.spatial(batch_size, v_bx)
                         h = T.axis.spatial(hidden_size, i * TX + v_tx_2)
                         O[bx, 0, h] = T.cast(
@@ -109,31 +109,31 @@ def prefill_add_rms(  # pylint: disable=too-many-locals
                 annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1},
             ):
                 for v_i in range(add_local_size):
-                    with T.block("T_add"):
+                    with T.sblock("T_add"):
                         bx = T.axis.spatial(seq_len, v_bx)
                         h = T.axis.spatial(hidden_size, v_i * TX + v_tx)
                         add_local[h // TX] = A[0, bx, h] + B[0, bx, h]
-                    with T.block("T_write_back"):
+                    with T.sblock("T_write_back"):
                         bx = T.axis.spatial(seq_len, v_bx)
                         h = T.axis.spatial(hidden_size, v_i * TX + v_tx)
                         add[0, bx, h] = add_local[h // TX]
-                with T.block("T_multiply_red_rf_init"):
+                with T.sblock("T_multiply_red_rf_init"):
                     tx, bx = T.axis.remap("SS", [v_tx, v_bx])
                     sum_local[tx, 0, bx] = T.float32(0)
                 for v_i, _j in T.grid(add_local_size, 1):
-                    with T.block("T_multiply_red_rf_update"):
+                    with T.sblock("T_multiply_red_rf_update"):
                         tx, bx, i = T.axis.remap("SSR", [v_tx, v_bx, v_i])
                         sum_local[tx, 0, bx] += T.float32(add_local[i]) * T.float32(add_local[i])
             for _j in range(1):
                 for v_tx_2 in T.thread_binding(TX, thread="threadIdx.x"):
-                    with T.block("T_multiply_red"):
+                    with T.sblock("T_multiply_red"):
                         tx, bx = T.axis.remap("RS", [v_tx_2, v_bx])
                         with T.init():
                             sum_shared[0, bx] = T.float32(0)
                         sum_shared[0, bx] = sum_shared[0, bx] + sum_local[tx, 0, bx]
             for v_i in range(add_local_size):
                 for v_tx_2 in T.thread_binding(TX, thread="threadIdx.x"):
-                    with T.block("T_cast_2"):
+                    with T.sblock("T_cast_2"):
                         bx = T.axis.spatial(seq_len, v_bx)
                         v1 = T.axis.spatial(hidden_size, v_i * TX + v_tx_2)
                         O[0, bx, v1] = T.cast(
 
@@ -74,7 +74,7 @@ def visit_call_(  # pylint: disable=arguments-renamed
             or not isinstance(dequantize_tir_func.body.block.body, tir.SeqStmt)
             or len(dequantize_tir_func.body.block.body) != 2
             or not isinstance(dequantize_tir_func.body.block.body[1], tir.For)
-            or not isinstance(dequantize_tir_func.body.block.body[1].body.body, tir.BlockRealize)
+            or not isinstance(dequantize_tir_func.body.block.body[1].body.body, tir.SBlockRealize)
             or dequantize_tir_func.body.block.body[1].body.body.block.name_hint != "T_transpose"
         ):
             return call
@@ -85,10 +85,10 @@ def visit_call_(  # pylint: disable=arguments-renamed
         new_func_buffers[-1] = dequantize_tir_func.body.block.alloc_buffers[0]
         new_func = tir.PrimFunc(
             params=new_func_buffers,
-            body=tir.BlockRealize(
+            body=tir.SBlockRealize(
                 iter_values=[],
                 predicate=True,
-                block=tir.Block(
+                block=tir.SBlock(
                     iter_vars=[],
                     reads=[],
                     writes=[],
 
@@ -93,7 +93,7 @@ def remove_global_buf_alloc(
     func: tir.PrimFunc,
 ) -> Tuple[tir.PrimFunc, List[relax.TensorStructInfo]]:
     """Remove the global buffer allocation for a given TIR PrimFunc."""
-    assert isinstance(func.body, tir.BlockRealize)
+    assert isinstance(func.body, tir.SBlockRealize)
     params = list(func.params)
     buffer_map = dict(func.buffer_map)
     tensor_sinfo = []
@@ -124,7 +124,7 @@ def remove_global_buf_alloc(
     assert len(prev_root_block.match_buffers) == 0
     assert prev_root_block.name_hint == "root"
     assert prev_root_block.init is None
-    root_block = tir.Block(
+    root_block = tir.SBlock(
         iter_vars=[],
         reads=[],
         writes=[],
@@ -136,7 +136,7 @@ def remove_global_buf_alloc(
 
     updated_func = tir.PrimFunc(
         params=params,
-        body=tir.BlockRealize(iter_values=[], predicate=True, block=root_block),
+        body=tir.SBlockRealize(iter_values=[], predicate=True, block=root_block),
         ret_type=func.ret_type,
         buffer_map=buffer_map,
         attrs=func.attrs,
 
@@ -56,8 +56,8 @@ def transform_module(
                         low_batch_funcs[i].body,
                         body,
                     )
-                body = tir.Block([], [], [], "root", body)
-                body = tir.BlockRealize([], True, body)
+                body = tir.SBlock([], [], [], "root", body)
+                body = tir.SBlockRealize([], True, body)
                 new_func = func.with_body(body)
                 new_func = new_func.with_attr("tir.is_scheduled", 1)
                 new_func = new_func.with_attr("tir.HoistIfThenElseExprWithBlock", 1)
 
@@ -98,7 +98,7 @@ def dyn_repeat_4d_tensor_func(  # pylint disable=too-many-locals
                 for n_idx in T.thread_binding(n * ch0, thread="blockIdx.x"):
                     for c_idx in T.thread_binding(c * ch1, thread="blockIdx.y"):
                         for h_idx, w_idx in T.grid(h * ch2, w * ch3):
-                            with T.block("dyn_repeat_4d_tensor"):
+                            with T.sblock("dyn_repeat_4d_tensor"):
                                 T.reads(input_tensor_buf[n_idx, c_idx, h_idx, w_idx])
                                 T.writes(out_buf[n_idx, c_idx, h_idx, w_idx])
                                 out_buf[n_idx, c_idx, h_idx, w_idx] = input_tensor_buf[
@@ -129,7 +129,7 @@ def dyn_concate_dim_2_func(input_1: T.handle, input_2: T.handle, output: T.handl
                 for n_idx in T.thread_binding(n, thread="blockIdx.x"):
                     for c_idx in T.thread_binding(c, thread="blockIdx.y"):
                         for h_idx, w_idx in T.grid(h1 + h2, w):
-                            with T.block("dyn_concate_dim_2"):
+                            with T.sblock("dyn_concate_dim_2"):
                                 T.reads(input_1_buf[n_idx, c_idx, h_idx, w_idx])
                                 T.writes(out_buf[n_idx, c_idx, h_idx, w_idx])
                                 if h_idx < h1:
@@ -167,7 +167,7 @@ def dyn_concate_dim_1_func(input_1: T.handle, input_2: T.handle, output: T.handl
 
                 for c_idx in T.thread_binding(c, thread="blockIdx.y"):
                     for h_idx, w_idx in T.grid(h1 + h2, w):
-                        with T.block("dyn_concate_dim_1"):
+                        with T.sblock("dyn_concate_dim_1"):
                             T.reads(input_1_buf[c_idx, h_idx, w_idx])
                             T.writes(out_buf[c_idx, h_idx, w_idx])
                             if h_idx < h1: