pytorch · SS-JIA · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025
@@ -104,16 +104,19 @@ ivec4 tidx_to_4bufi(
 }
 
 ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) {
+  const int nchwi_div_x = nchwi / sizes.x; // nchwi with the x extent divided out
+  const int nchwi_div_y = nchwi_div_x / sizes.y; // ...and the y extent as well
   return ivec4(
       nchwi % sizes.x,
-      (nchwi / (sizes.x)) % sizes.y,
-      (nchwi / (sizes.x * sizes.y)) % sizes.z,
-      (nchwi / (sizes.x * sizes.y * sizes.z)));
+      nchwi_div_x % sizes.y,
+      nchwi_div_y % sizes.z,
+      nchwi_div_y / sizes.z); // chained division replaces the size products
 }
 
 int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) {
-  return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y +
-      tidx.y * sizes.x + tidx.x;
+  const int sizes_xy = sizes.x * sizes.y; // shared by the w and z terms below
+  return tidx.w * sizes_xy * sizes.z + tidx.z * sizes_xy + tidx.y * sizes.x +
+      tidx.x;
 }
 
 // TODO(ssjia): make this function use dim order so that it can work with any
@@ -360,8 +363,8 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
   //  pos[4] is set to a placeholder value
   ivec4 pos = idx.xyzx;
   pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS];
-  pos[packed_dim] /= 4;
-  pos.w = idx[packed_dim] % 4;
+  pos[packed_dim] >>= 2;
+  pos.w = idx[packed_dim] & 0x3;
   return pos;
 }
 

@@ -1,4 +1,4 @@
-slice_channel:
+slice_packed_dim:
   parameter_names_with_default_values:
     DTYPE: float
     NDIM: 3
@@ -8,4 +8,4 @@ slice_channel:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: slice_channel
+    - NAME: slice_packed_dim
@@ -1,4 +1,4 @@
-slice_batch_height_width:
+slice_unpacked_dim:
   parameter_names_with_default_values:
     DTYPE: float
     NDIM: 3
@@ -7,4 +7,4 @@ slice_batch_height_width:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: slice_batch_height_width
+    - NAME: slice_unpacked_dim
@@ -83,7 +83,7 @@ void add_slice_tensor_copy_node(
   // if slice dim is the same as the packed dim, we can use the channel slice
   if (dim_index == packed_dim_idx) {
     // slice by channel
-    std::string kernel_name = "slice_channel";
+    std::string kernel_name = "slice_packed_dim";
     kernel_name.reserve(kShaderNameReserve);
     add_dtype_suffix(kernel_name, *t_out);
 
@@ -108,30 +108,18 @@ void add_slice_tensor_copy_node(
         spec_vars));
 
   } else {
-    // GPU's coordinate is in x, y, z
-    int64_t gpu_dim = -1;
-    int64_t in_channel_stride = 1;
-    if (dim_index == kWidth4D) {
-      gpu_dim = 0; // width: x dimension in gpu
-      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
-    } else if (dim_index == kHeight4D) {
-      gpu_dim = 1; // height: y dimension
-      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
-    } else if (dim_index == kChannel4D) {
-      gpu_dim = 2; // channel: z dimension
-      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
-      in_channel_stride = dim_at(in_sizes, kChannel4D);
-    } else {
-      gpu_dim = 3; // batch: w dimension
-
-      in_channel_stride = dim_at(in_sizes, kChannel4D);
-      if (packed_dim_idx == kChannel4D) {
-        // Due to channel packing, each batch value is span over stride planes
-        in_channel_stride = utils::div_up_4(in_channel_stride);
-      }
+    // GPU coordinate axes are indexed as x = 0, y = 1, z = 2, w = 3
+    const int64_t gpu_dim = -(dim_index + 1);
+    // stride of input tensor's channel dimension
+    int64_t in_channel_stride = dim_at(in_sizes, kChannel4D);
+    VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
+
+    // Due to channel packing, each batch spans div_up_4(channels) planes
+    if (dim_index == kBatch4D && packed_dim_idx == kChannel4D) {
+      in_channel_stride = utils::div_up_4(in_channel_stride);
     }
 
-    std::string kernel_name = "slice_batch_height_width";
+    std::string kernel_name = "slice_unpacked_dim";
     kernel_name.reserve(kShaderNameReserve);
     add_dtype_suffix(kernel_name, *t_out);