We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 1d99b61 · commit 0cd5b90 · Copy full SHA for 0cd5b90
python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py
@@ -157,7 +157,8 @@ def make_default_opt_flags_nvidia(
     elif enforce_bitwise_invariance:
         block_m = 128
     else:
-        block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
+        min_block_m = 64 if torch.cuda.get_device_capability()[0] == 10 else 16
+        block_m = max(min_block_m, min(triton.next_power_of_2(tokens_per_expt), 128))
     # block n
     arch = None
     block_n = opt_flags_nvidia.compute_block_n(n, arch, precision_config)
0 commit comments