[Common] Fix long compile time in padding.cu on arch 75 (#2562)

jberchtold-nvidia · pre-commit-ci[bot] · web-flow · commit df69100c3bbb · 2026-01-06T10:49:27.000-08:00
* Fix long compile time in padding.cu

  Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/transformer_engine/common/util/padding.cu b/transformer_engine/common/util/padding.cu
@@ -94,15 +94,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP
 #pragma unroll
     for (int i2 = 0; i2 < nvec; ++i2) {
       const int row = tile_row + i1 * nvec + i2;
-      size_t row_offset = static_cast<size_t>(row) * row_length;
       const int col = tile_col + j1 * nvec;
       Vec local_input;
       Vec local_output;
       local_input.clear();
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            local_input.data.elt[j2] = input[row_offset + col + j2];
+            local_input.data.elt[j2] = input[static_cast<size_t>(row) * row_length + col + j2];
           }
         }
       }
@@ -113,14 +112,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            output[row_offset + col + j2] = local_output.data.elt[j2];
+            output[static_cast<size_t>(row) * row_length + col + j2] = local_output.data.elt[j2];
           }
         }
       } else if (row < padded_num_rows) {
         // padding
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            output[row_offset + col + j2] = local_zero;
+            output[static_cast<size_t>(row) * row_length + col + j2] = local_zero;
           }
         }
       }
@@ -179,15 +178,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult
 #pragma unroll
     for (int i2 = 0; i2 < nvec; ++i2) {
       const int row = tile_row + i1 * nvec + i2;
-      size_t row_offset = static_cast<size_t>(row) * row_length;
       const int col = tile_col + j1 * nvec;
       Vec local_input;
       Vec local_output;
       local_input.clear();
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            local_input.data.elt[j2] = input[row_offset + col + j2];
+            local_input.data.elt[j2] = input[static_cast<size_t>(row) * row_length + col + j2];
           }
         }
       }
@@ -198,7 +196,7 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult
       if (row < num_rows) {
         for (int j2 = 0; j2 < nvec; ++j2) {
           if (col + j2 < row_length) {
-            output[row_offset + col + j2] = local_output.data.elt[j2];
+            output[static_cast<size_t>(row) * row_length + col + j2] = local_output.data.elt[j2];
           }
         }
       }

Original file line number	Diff line number	Diff line change
`@@ -94,15 +94,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP`
`94`	`94`	`#pragma unroll`
`95`	`95`	`for (int i2 = 0; i2 < nvec; ++i2) {`
`96`	`96`	`const int row = tile_row + i1 * nvec + i2;`
`97`		`- size_t row_offset = static_cast<size_t>(row) * row_length;`
`98`	`97`	`const int col = tile_col + j1 * nvec;`
`99`	`98`	`Vec local_input;`
`100`	`99`	`Vec local_output;`
`101`	`100`	`local_input.clear();`
`102`	`101`	`if (row < num_rows) {`
`103`	`102`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`104`	`103`	`if (col + j2 < row_length) {`
`105`		`- local_input.data.elt[j2] = input[row_offset + col + j2];`
	`104`	`+ local_input.data.elt[j2] = input[static_cast<size_t>(row) * row_length + col + j2];`
`106`	`105`	`}`
`107`	`106`	`}`
`108`	`107`	`}`
`@@ -113,14 +112,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_padding_kernel(MultiP`
`113`	`112`	`if (row < num_rows) {`
`114`	`113`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`115`	`114`	`if (col + j2 < row_length) {`
`116`		`- output[row_offset + col + j2] = local_output.data.elt[j2];`
	`115`	`+ output[static_cast<size_t>(row) * row_length + col + j2] = local_output.data.elt[j2];`
`117`	`116`	`}`
`118`	`117`	`}`
`119`	`118`	`} else if (row < padded_num_rows) {`
`120`	`119`	`// padding`
`121`	`120`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`122`	`121`	`if (col + j2 < row_length) {`
`123`		`- output[row_offset + col + j2] = local_zero;`
	`122`	`+ output[static_cast<size_t>(row) * row_length + col + j2] = local_zero;`
`124`	`123`	`}`
`125`	`124`	`}`
`126`	`125`	`}`
`@@ -179,15 +178,14 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult`
`179`	`178`	`#pragma unroll`
`180`	`179`	`for (int i2 = 0; i2 < nvec; ++i2) {`
`181`	`180`	`const int row = tile_row + i1 * nvec + i2;`
`182`		`- size_t row_offset = static_cast<size_t>(row) * row_length;`
`183`	`181`	`const int col = tile_col + j1 * nvec;`
`184`	`182`	`Vec local_input;`
`185`	`183`	`Vec local_output;`
`186`	`184`	`local_input.clear();`
`187`	`185`	`if (row < num_rows) {`
`188`	`186`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`189`	`187`	`if (col + j2 < row_length) {`
`190`		`- local_input.data.elt[j2] = input[row_offset + col + j2];`
	`188`	`+ local_input.data.elt[j2] = input[static_cast<size_t>(row) * row_length + col + j2];`
`191`	`189`	`}`
`192`	`190`	`}`
`193`	`191`	`}`
`@@ -198,7 +196,7 @@ __global__ void __launch_bounds__(threads_per_block) multi_unpadding_kernel(Mult`
`198`	`196`	`if (row < num_rows) {`
`199`	`197`	`for (int j2 = 0; j2 < nvec; ++j2) {`
`200`	`198`	`if (col + j2 < row_length) {`
`201`		`- output[row_offset + col + j2] = local_output.data.elt[j2];`
	`199`	`+ output[static_cast<size_t>(row) * row_length + col + j2] = local_output.data.elt[j2];`
`202`	`200`	`}`
`203`	`201`	`}`
`204`	`202`	`}`