Commit 633d32d

Merge commit '43dbdd1685625ce71daea1caf8a4d90fdea6457f'
2 parents: 5c020ef + 43dbdd1

File tree: 39 files changed (+2005 / -1249 lines)

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 5 additions & 1 deletion
@@ -54,6 +54,7 @@ class CoarseSchedule {
   iterator end() { return orderClusters.end(); }
   const_iterator end() const { return orderClusters.end(); }
   size_t size() const { return orderClusters.size(); }
+  void clear() { orderClusters.clear(); }
   iterator newAtBack() {
     orderClusters.push_back(orderClusters.size());
     return std::prev(orderClusters.end());

@@ -157,7 +158,10 @@ class CoarseSchedule {
   // Set <stage, cluster> based on CoarseSchedule.
   void serialize(scf::ForOp &forOp) const;
   // Create a CoarseSchedule based on forOp's <stage, cluster>.
-  LogicalResult deSerialize(scf::ForOp &forOp);
+  // If normalizeClusterId is true, clusters [minClusterId, maxClusterId] will
+  // be remapped to [0, maxClusterId - minClusterId].
+  // If false, it won't remap and clusters [0, maxClusterId] will be created.
+  LogicalResult deSerialize(scf::ForOp &forOp, bool normalizeClusterId = true);
 
   static ClusterHash hashCluster(Cluster cluster) {
     return reinterpret_cast<ClusterHash>(&*cluster);
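
The remapping described by the new comment on deSerialize can be made concrete with a small sketch. The snippet below is illustrative only and not part of this commit: the helper name remap_cluster_ids is made up, and it models only the cluster-id bookkeeping, not the actual MLIR data structures.

def remap_cluster_ids(cluster_ids, normalize=True):
    # cluster_ids: the cluster values serialized on the loop's ops.
    min_id, max_id = min(cluster_ids), max(cluster_ids)
    if normalize:
        # normalizeClusterId == true: [minClusterId, maxClusterId] is remapped
        # to [0, maxClusterId - minClusterId].
        return {c: c - min_id for c in range(min_id, max_id + 1)}
    # normalizeClusterId == false: clusters [0, maxClusterId] are created and
    # the serialized ids are kept as-is.
    return {c: c for c in range(0, max_id + 1)}

# Example with serialized cluster ids {2, 3, 5}:
#   normalize=True  -> {2: 0, 3: 1, 4: 2, 5: 3}
#   normalize=False -> {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}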

lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp

Lines changed: 11 additions & 3 deletions
@@ -263,7 +263,8 @@ void tt::CoarseSchedule::serialize(scf::ForOp &forOp) const {
 }
 
 // Create a CoarseSchedule based on forOp's <stage, cluster>.
-LogicalResult tt::CoarseSchedule::deSerialize(scf::ForOp &forOp) {
+LogicalResult tt::CoarseSchedule::deSerialize(scf::ForOp &forOp,
+                                              bool normalizeClusterId) {
   auto [minClusterId, maxClusterId] = getMinMaxCluster(forOp);
   std::optional<int> maxStage = tryGetMaxStage(forOp);
   if (!maxStage) {

@@ -272,9 +273,16 @@ LogicalResult tt::CoarseSchedule::deSerialize(scf::ForOp &forOp) {
   numStages = *maxStage + 1;
 
   DenseMap<int, tt::CoarseSchedule::Cluster> clustersMap;
-  for (int i = minClusterId; i < maxClusterId + 1; i++) {
-    clustersMap.insert({i, clusters.newAtBack()});
+  if (normalizeClusterId) {
+    for (int i = minClusterId; i < maxClusterId + 1; i++) {
+      clustersMap.insert({i, clusters.newAtBack()});
+    }
+  } else {
+    for (int i = 0; i < maxClusterId + 1; i++) {
+      clustersMap.insert({i, clusters.newAtBack()});
+    }
   }
+
   for (Operation &op : forOp.getBody()->without_terminator()) {
     if (!op.hasAttr(mlir::triton::kLoopStageAttrName))
       continue;

python/test/gluon/test_consan.py

Lines changed: 65 additions & 36 deletions
@@ -86,40 +86,43 @@ def failing_kernel(input):
     ampere.async_copy.wait_group(0)
 
 
-def run_failing_kernel(device):
+def run_failing_kernel(device, enable_consan, mode):
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)
 
     triton.set_allocator(alloc_fn)
 
+    if enable_consan:
+        if mode == "env":
+            os.environ["TRITON_INSTRUMENTATION_MODE"] = "consan"
+            knobs.refresh_knobs()
+        elif mode == "knob":
+            knobs.compilation.instrumentation_mode = "consan"
+
     input = torch.randn((XBLOCK, XBLOCK), device=device, dtype=torch.float16)
     failing_kernel[(1, )](input)
 
 
 @pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper", run=False)
-def test_cache_miss_knob(device, fresh_knobs, monkeypatch):
+def test_cache_miss_knob(device, monkeypatch):
     # First run without consan
-    knobs.compilation.enable_experimental_consan = False
-    run_failing_kernel(device)
+    run_in_process(run_failing_kernel, (device, False, "knob"))
 
     # Then run with consan and assert that it fails
-    knobs.compilation.enable_experimental_consan = True
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
-    result = run_in_process(run_failing_kernel, (device, ))
+    result = run_in_process(run_failing_kernel, (device, True, "knob"))
     assert "device-side assert" in str(result.exc)
 
 
 @pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper", run=False)
 def test_cache_miss_env(device, monkeypatch):
     # First run without consan
-    knobs.compilation.enable_experimental_consan = False
-    run_failing_kernel(device)
+    run_in_process(run_failing_kernel, (device, False, "env"))
 
     # Then run with consan and assert that it fails
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
-    result = run_in_process(run_failing_kernel, (device, ))
+    result = run_in_process(run_failing_kernel, (device, True, "env"))
     assert "device-side assert" in str(result.exc)
 
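The tests above turn ConSan on in one of two ways: by setting TRITON_INSTRUMENTATION_MODE=consan in the environment and then calling knobs.refresh_knobs() so already-loaded knobs pick up the change, or by assigning knobs.compilation.instrumentation_mode directly. A minimal standalone sketch of that pattern follows; it reuses only calls that appear in the diff above, while the import path and the enable_consan helper are assumptions, not part of the commit.

import os
from typing import Optional

import torch
import triton
from triton import knobs


def enable_consan(mode: str) -> None:
    # Same switching logic as run_failing_kernel above (helper name is made up).
    if mode == "env":
        os.environ["TRITON_INSTRUMENTATION_MODE"] = "consan"
        knobs.refresh_knobs()  # re-read environment-backed knobs
    elif mode == "knob":
        knobs.compilation.instrumentation_mode = "consan"


# ConSan needs a global scratch allocation, so register a simple allocator.
def alloc_fn(size: int, alignment: int, stream: Optional[int]):
    return torch.empty(size, device="cuda", dtype=torch.int8)


triton.set_allocator(alloc_fn)
enable_consan("knob")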

@@ -134,8 +137,9 @@ def test_async_tma_kernel(FAILURE, device, run_wrapper, monkeypatch):
         assert "Buffer being accessed has outstanding writes" in result.driver_stderr_output
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -184,8 +188,9 @@ def test_tma_interleave_kernel(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -243,8 +248,9 @@ def test_async_copy(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -296,8 +302,9 @@ def test_tcgen5_mma(FAILURE, MEM_ACCESS_KIND, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -359,8 +366,9 @@ def test_warpgroup_mma(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -403,8 +411,9 @@ def test_warpgroup_mma2(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -452,8 +461,9 @@ def test_tcgen5_mma_multibar(BUF_IDX, BAR_IDX, device, run_wrapper, monkeypatch)
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -511,8 +521,9 @@ def test_multibuffered_loop(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -625,8 +636,9 @@ def test_multibuffered_wgmma_loop(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -705,8 +717,9 @@ def test_ws_store_wait_load(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -758,8 +771,9 @@ def test_ws_load_wait_store(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -811,8 +825,9 @@ def test_ws_two_loads_two_bars(MISSING_BAR, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -873,8 +888,9 @@ def test_ws_two_loads_one_bar(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -935,8 +951,9 @@ def test_ws_two_loads_two_bars_loop(MISSING_BAR, device, run_wrapper, monkeypatc
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1015,8 +1032,9 @@ def test_ws_load_ordering(FAILURE, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1079,8 +1097,9 @@ def test_ws_two_producers_two_consumers(MISSING_BAR, device, run_wrapper, monkey
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1166,8 +1185,9 @@ def test_ws_different_warp_sizes(MISSING_BAR, device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1236,8 +1256,9 @@ def test_ws_async_copy_commits(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1300,8 +1321,9 @@ def test_ws_async_copy_wait_visibility(FAILURE, device, run_wrapper, monkeypatch
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1352,8 +1374,9 @@ def test_ws_wgmma_wait_visibility(FAILURE, device, run_wrapper, monkeypatch):
         assert result.driver_stderr_output == ""
         return
 
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         return torch.empty(size, device="cuda", dtype=torch.int8)

@@ -1400,8 +1423,9 @@ def test_deadlock_two_partitions(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1434,8 +1458,9 @@ def test_deadlock_overarrival(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1463,8 +1488,9 @@ def test_deadlock_underarrival(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Deadlock detected" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1499,8 +1525,9 @@ def test_deadlock_different_phases(device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1534,8 +1561,9 @@ def test_deadlock_exempt_when_tma_signals(device, run_wrapper, monkeypatch):
         assert result.exc is None
         assert result.driver_stderr_output == ""
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

@@ -1577,8 +1605,9 @@ def test_barrier_underflow(device, run_wrapper, monkeypatch):
         assert "device-side assert" in str(result.exc)
         assert "Barrier arrive underflow: current count would become negative" in result.driver_stderr_output
         return
-    monkeypatch.setenv("TRITON_ENABLE_EXPERIMENTAL_CONSAN", "1")
+    monkeypatch.setenv("TRITON_INSTRUMENTATION_MODE", "consan")
     monkeypatch.setenv("CUDA_LAUNCH_BLOCKING", "1")
+    knobs.refresh_knobs()
 
     # ConSan requires a global memory allocation
     def alloc_fn(size: int, alignment: int, stream: Optional[int]):

python/test/gluon/test_core.py

Lines changed: 0 additions & 2 deletions
@@ -696,8 +696,6 @@ def kernel(in_ptr, out_ptr, smem_h: ttgl.constexpr, smem_w: ttgl.constexpr, num_
         tmem_alias: ttgl.constexpr = TensorMemoryLayout((num_rows, num_cols), col_stride=1)
         tmem = tmem._reinterpret(ttgl.int8, (num_rows, num_cols), tmem_alias)
         value = tmem.load(blocked)
-        ttgl.static_print(ttgl.to_linear_layout(blocked, (smem_h, smem_w)))
-        ttgl.static_print(ttgl.to_linear_layout(blocked, (num_rows, num_cols)))
         ttgl.store(ttgl.set_auto_layout(out_ptrs, blocked), value)
 
     torch.manual_seed(0)
