@@ -71,61 +71,68 @@ void add_copy_packed_dim_offset_node(
7171    const  ivec3& range,
7272    const  ivec4& src_offset,
7373    const  ivec4& dst_offset,
74-     const  ValueRef out) {
74+     const  ValueRef out,
75+     bool  repeat) {
7576  vTensorPtr t_in = graph.get_tensor (in);
7677  vTensorPtr t_out = graph.get_tensor (out);
7778
78-   //  Check the packed dimension is same for both tensors, and if the packed
79-   //  dimension is Width or Height. Since the function does not support channel
80-   //  packing.
81-   VK_CHECK_COND (
82-       check_same_packed_dim (*t_in, *t_out) &&
83-       (check_packed_dim_is (*t_in, WHCN::kWidthDim ) ||
84-        check_packed_dim_is (*t_in, WHCN::kHeightDim )));
79+   //  Check that the packed dimension is the same for both tensors
80+   VK_CHECK_COND (check_same_packed_dim (*t_in, *t_out));
81+   if  (!repeat) {
82+     //  For a non-repeat copy, also check that the packed dimension is Width or
83+     //  Height, since this function does not support channel packing.
84+     VK_CHECK_COND (
85+         check_same_packed_dim (*t_in, *t_out) &&
86+         (check_packed_dim_is (*t_in, WHCN::kWidthDim ) ||
87+          check_packed_dim_is (*t_in, WHCN::kHeightDim )));
88+   }
8589
8690  std::string kernel_name = " copy_packed_dim_offset"  ;
8791  kernel_name.reserve (kShaderNameReserve );
8892  add_dtype_suffix (kernel_name, *t_out);
8993
90-   const  auto  packed_dim = t_in->packed_dim ();
9194  //  A copy of range with the last element set to batch size of the input tensor
9295  ivec4 final_range = {
9396      range[0 ], range[1 ], range[2 ], dim_at (t_in->sizes (), kBatch4D )};
9497  ivec3 global_wg_size = t_out->logical_limits ();
95-   //  The starting offset in a texel where this tensor will start copying from
96-   const  auto  src_lane_offset = src_offset[packed_dim] & 0x3 ;
97-   //  The starting offset in a texel where this tensor will start copying to
98-   const  auto  dst_lane_offset = dst_offset[packed_dim] & 0x3 ;
99- 
100-   //  The total packed texels this tensor will be copied from
101-   //  The first texel of tensor data in packed dimension will be copied from
102-   //  remaining lanes from current source Hence (4 - src_lane_offset) is added
103-   //  to tensor size in packed dimension
104-   const  auto  src_packed_size = utils::div_up_4 (
105-       (4  - src_lane_offset) +
106-       dim_at (t_out->sizes (), normalize_to_dim_index (*t_out, packed_dim)));
107- 
108-   //  The total packed texels this tensor will be copied to
109-   //  The first texel of tensor data in packed dimension will be copied to
110-   //  remaining lanes from previous write Hence (4 - dst_lane_offset) is added to
111-   //  tensor size in packed dimension
112-   const  auto  dst_packed_size = utils::div_up_4 (
113-       (4  - dst_lane_offset) +
114-       dim_at (t_in->sizes (), normalize_to_dim_index (*t_in, packed_dim)));
115- 
116-   //  If the starting src offset is not 0, and the total packed texels is greater
117-   //  than the source texel range
118-   const  bool  has_additional_src_work =
119-       src_lane_offset != 0  && src_packed_size > final_range[packed_dim];
120-   //  If the starting dst offset is not 0, and the total packed texels is greater
121-   //  than the source texel range
122-   const  bool  has_additional_dst_work =
123-       dst_lane_offset != 0  && dst_packed_size > final_range[packed_dim];
124- 
125-   if  (has_additional_src_work || has_additional_dst_work) {
126-     global_wg_size[packed_dim]++; //  Increase the global work group size in
127-                                   //  packed dimension
128-     final_range[packed_dim]++; //  Increase the range in packed dimension
98+ 
99+   if  (!repeat) {
100+     const  auto  packed_dim = t_in->packed_dim ();
101+     //  The starting offset in a texel where this tensor will start copying from
102+     const  auto  src_lane_offset = src_offset[packed_dim] & 0x3 ;
103+     //  The starting offset in a texel where this tensor will start copying to
104+     const  auto  dst_lane_offset = dst_offset[packed_dim] & 0x3 ;
105+ 
106+     //  The total packed texels this tensor will be copied from
107+     //  The first texel of tensor data in packed dimension will be copied from
108+     //  remaining lanes from current source Hence (4 - src_lane_offset) is added
109+     //  to tensor size in packed dimension
110+     const  auto  src_packed_size = utils::div_up_4 (
111+         (4  - src_lane_offset) +
112+         dim_at (t_out->sizes (), normalize_to_dim_index (*t_out, packed_dim)));
113+ 
114+     //  The total packed texels this tensor will be copied to
115+     //  The first texel of tensor data in packed dimension will be copied to
116+     //  remaining lanes from previous write Hence (4 - dst_lane_offset) is added
117+     //  to tensor size in packed dimension
118+     const  auto  dst_packed_size = utils::div_up_4 (
119+         (4  - dst_lane_offset) +
120+         dim_at (t_in->sizes (), normalize_to_dim_index (*t_in, packed_dim)));
121+ 
122+     //  If the starting src offset is not 0, and the total packed texels is
123+     //  greater than the source texel range
124+     const  bool  has_additional_src_work =
125+         src_lane_offset != 0  && src_packed_size > final_range[packed_dim];
126+     //  If the starting dst offset is not 0, and the total packed texels is
127+     //  greater than the source texel range
128+     const  bool  has_additional_dst_work =
129+         dst_lane_offset != 0  && dst_packed_size > final_range[packed_dim];
130+ 
131+     if  (has_additional_src_work || has_additional_dst_work) {
132+       global_wg_size[packed_dim]++; //  Increase the global work group size in
133+                                     //  packed dimension
134+       final_range[packed_dim]++; //  Increase the range in packed dimension
135+     }
129136  }
130137
131138  auto  shader = VK_KERNEL_FROM_STR (kernel_name);
@@ -144,7 +151,7 @@ void add_copy_packed_dim_offset_node(
144151      //  Parameter buffers
145152      {},
146153      //  Specialization Constants
147-       {graph.hashed_layout_of (out), graph.hashed_layout_of (in)},
154+       {graph.hashed_layout_of (out), graph.hashed_layout_of (in), repeat },
148155      nullptr ,
149156      {},
150157      {
0 commit comments