@@ -71,61 +71,68 @@ void add_copy_packed_dim_offset_node(
     const ivec3& range,
     const ivec4& src_offset,
     const ivec4& dst_offset,
-    const ValueRef out) {
+    const ValueRef out,
+    bool repeat) {
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  // Check the packed dimension is same for both tensors, and if the packed
-  // dimension is Width or Height. Since the function does not support channel
-  // packing.
-  VK_CHECK_COND(
-      check_same_packed_dim(*t_in, *t_out) &&
-      (check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
-       check_packed_dim_is(*t_in, WHCN::kHeightDim)));
+  // Check that the packed dimension is the same for both tensors
+  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
+  if (!repeat) {
+    // For a non-repeat copy, also check that the packed dimension is Width
+    // or Height, since the function does not support channel packing.
+    VK_CHECK_COND(
+        check_same_packed_dim(*t_in, *t_out) &&
+        (check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
+         check_packed_dim_is(*t_in, WHCN::kHeightDim)));
+  }
 
   std::string kernel_name = "copy_packed_dim_offset";
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  const auto packed_dim = t_in->packed_dim();
   // A copy of range with the last element set to batch size of the input tensor
   ivec4 final_range = {
       range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
   ivec3 global_wg_size = t_out->logical_limits();
-  // The starting offset in a texel where this tensor will start copying from
-  const auto src_lane_offset = src_offset[packed_dim] & 0x3;
-  // The starting offset in a texel where this tensor will start copying to
-  const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
-
-  // The total packed texels this tensor will be copied from
-  // The first texel of tensor data in packed dimension will be copied from
-  // remaining lanes from current source Hence (4 - src_lane_offset) is added
-  // to tensor size in packed dimension
-  const auto src_packed_size = utils::div_up_4(
-      (4 - src_lane_offset) +
-      dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
-
-  // The total packed texels this tensor will be copied to
-  // The first texel of tensor data in packed dimension will be copied to
-  // remaining lanes from previous write Hence (4 - dst_lane_offset) is added to
-  // tensor size in packed dimension
-  const auto dst_packed_size = utils::div_up_4(
-      (4 - dst_lane_offset) +
-      dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
-
-  // If the starting src offset is not 0, and the total packed texels is greater
-  // than the source texel range
-  const bool has_additional_src_work =
-      src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
-  // If the starting dst offset is not 0, and the total packed texels is greater
-  // than the source texel range
-  const bool has_additional_dst_work =
-      dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
-
-  if (has_additional_src_work || has_additional_dst_work) {
-    global_wg_size[packed_dim]++; // Increase the global work group size in
-                                  // packed dimension
-    final_range[packed_dim]++; // Increase the range in packed dimension
+
+  if (!repeat) {
+    const auto packed_dim = t_in->packed_dim();
+    // The starting offset in a texel from which this tensor will be copied
+    const auto src_lane_offset = src_offset[packed_dim] & 0x3;
+    // The starting offset in a texel to which this tensor will be copied
+    const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
+
+    // The total number of packed texels this tensor will be copied from.
+    // The first texel of tensor data in the packed dimension is read from
+    // the remaining lanes of the current source texel, hence
+    // (4 - src_lane_offset) is added to the size in the packed dimension.
+    const auto src_packed_size = utils::div_up_4(
+        (4 - src_lane_offset) +
+        dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
+
+    // The total number of packed texels this tensor will be copied to.
+    // The first texel of tensor data in the packed dimension is written to
+    // the remaining lanes of the previous write, hence
+    // (4 - dst_lane_offset) is added to the size in the packed dimension.
+    const auto dst_packed_size = utils::div_up_4(
+        (4 - dst_lane_offset) +
+        dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
+
+    // True when the starting src offset is not 0 and the total packed
+    // texels exceed the source texel range
+    const bool has_additional_src_work =
+        src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
+    // True when the starting dst offset is not 0 and the total packed
+    // texels exceed the source texel range
+    const bool has_additional_dst_work =
+        dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
+
+    if (has_additional_src_work || has_additional_dst_work) {
+      global_wg_size[packed_dim]++; // Increase the global work group size in
+                                    // the packed dimension
+      final_range[packed_dim]++; // Increase the range in the packed dimension
+    }
   }
 
   auto shader = VK_KERNEL_FROM_STR(kernel_name);
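The block now gated behind `if (!repeat)` decides whether a misaligned copy needs one extra texel of work along the packed dimension. The standalone sketch below walks through that arithmetic with hypothetical numbers; it assumes `utils::div_up_4` is a plain round-up division by four and substitutes plain ints for the tensor metadata, so it illustrates the formula rather than reproducing the backend's implementation.

#include <cstdio>

// Round-up division by four, assumed to mirror utils::div_up_4.
int div_up_4(int x) {
  return (x + 3) / 4;
}

int main() {
  // Hypothetical values: a tensor that is 10 elements long in the packed
  // dimension, copied over a 3-texel range, starting one lane into a texel.
  const int size_in_packed_dim = 10;
  const int range_in_packed_dim = 3;
  const int src_offset_in_packed_dim = 1;

  // Lane within the first source texel where the copy starts (0..3).
  const int src_lane_offset = src_offset_in_packed_dim & 0x3;

  // Total texels read: the first texel only contributes (4 - lane_offset)
  // lanes, so that slack is added before rounding up to whole texels.
  const int src_packed_size =
      div_up_4((4 - src_lane_offset) + size_in_packed_dim);

  // A misaligned start that spills past the nominal range means one extra
  // texel of work, which is what bumps final_range and global_wg_size above.
  const bool has_additional_src_work =
      src_lane_offset != 0 && src_packed_size > range_in_packed_dim;

  std::printf(
      "lane offset %d, %d packed texels, extra texel needed: %s\n",
      src_lane_offset,
      src_packed_size,
      has_additional_src_work ? "yes" : "no");
  return 0;
}

With the values shown it prints "lane offset 1, 4 packed texels, extra texel needed: yes", which is exactly the case where the code above increments final_range[packed_dim] and global_wg_size[packed_dim].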
@@ -144,7 +151,7 @@ void add_copy_packed_dim_offset_node(
       // Parameter buffers
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
       nullptr,
       {},
       {
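Downstream, the new repeat flag reaches the shader as a specialization constant alongside the two hashed layouts, so the kernel can branch on it at pipeline-creation time rather than per invocation. The sketch below shows how call sites might select between the two paths; the leading ComputeGraph&/input ValueRef parameters are cut off by the hunk header, so their exact form is an assumption inferred from the body's use of `graph` and `in`, and the offsets and ranges are purely illustrative, not call sites from the repository.

// Hypothetical call sites (assumed leading parameters, illustrative values).
void copy_region_example(ComputeGraph& graph, ValueRef in, ValueRef out) {
  const ivec3 range = {8, 8, 1};          // texel extent to copy
  const ivec4 src_offset = {2, 0, 0, 0};  // start two lanes into a texel
  const ivec4 dst_offset = {0, 0, 0, 0};
  // Plain offset copy: the Width/Height packing check and the extra-texel
  // arithmetic above both apply.
  add_copy_packed_dim_offset_node(
      graph, in, range, src_offset, dst_offset, out, /*repeat=*/false);
}

void repeat_region_example(ComputeGraph& graph, ValueRef in, ValueRef out) {
  const ivec3 range = {8, 8, 4};          // destination spans several copies
  const ivec4 src_offset = {0, 0, 0, 0};
  const ivec4 dst_offset = {0, 0, 0, 0};
  // Repeat copy: only the shared packed-dim check runs, and the shader sees
  // repeat == true through the specialization constants.
  add_copy_packed_dim_offset_node(
      graph, in, range, src_offset, dst_offset, out, /*repeat=*/true);
}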