@@ -71,21 +71,17 @@ void add_copy_packed_dim_offset_node(
     const ivec3& range,
     const ivec4& src_offset,
     const ivec4& dst_offset,
-    const ValueRef out,
-    bool repeat) {
+    const ValueRef out) {
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);

-  // Check the packed dimension is same for both tensors
-  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
-  if (!repeat) {
-    // For non repeat copy also check if the packed dimension is Width or
-    // Height. Since the function does not support channel packing.
-    VK_CHECK_COND(
-        check_same_packed_dim(*t_in, *t_out) &&
-        (check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
-         check_packed_dim_is(*t_in, WHCN::kHeightDim)));
-  }
+  // Check that the packed dimension is the same for both tensors, and that it
+  // is either Width or Height, since the function does not support channel
+  // packing.
+  VK_CHECK_COND(
+      check_same_packed_dim(*t_in, *t_out) &&
+      (check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
+       check_packed_dim_is(*t_in, WHCN::kHeightDim)));

   std::string kernel_name = "copy_packed_dim_offset";
   kernel_name.reserve(kShaderNameReserve);
@@ -96,43 +92,41 @@ void add_copy_packed_dim_offset_node(
       range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
   ivec3 global_wg_size = t_out->logical_limits();

-  if (!repeat) {
-    const auto packed_dim = t_in->packed_dim();
-    // The starting offset in a texel where this tensor will start copying from
-    const auto src_lane_offset = src_offset[packed_dim] & 0x3;
-    // The starting offset in a texel where this tensor will start copying to
-    const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
-
-    // The total packed texels this tensor will be copied from
-    // The first texel of tensor data in packed dimension will be copied from
-    // remaining lanes from current source Hence (4 - src_lane_offset) is added
-    // to tensor size in packed dimension
-    const auto src_packed_size = utils::div_up_4(
-        (4 - src_lane_offset) +
-        dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
-
-    // The total packed texels this tensor will be copied to
-    // The first texel of tensor data in packed dimension will be copied to
-    // remaining lanes from previous write Hence (4 - dst_lane_offset) is added
-    // to tensor size in packed dimension
-    const auto dst_packed_size = utils::div_up_4(
-        (4 - dst_lane_offset) +
-        dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
-
-    // If the starting src offset is not 0, and the total packed texels is
-    // greater than the source texel range
-    const bool has_additional_src_work =
-        src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
-    // If the starting dst offset is not 0, and the total packed texels is
-    // greater than the source texel range
-    const bool has_additional_dst_work =
-        dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
-
-    if (has_additional_src_work || has_additional_dst_work) {
-      global_wg_size[packed_dim]++; // Increase the global work group size in
-                                    // packed dimension
-      final_range[packed_dim]++; // Increase the range in packed dimension
-    }
+  const auto packed_dim = t_in->packed_dim();
+  // The starting offset in a texel where this tensor will start copying from
+  const auto src_lane_offset = src_offset[packed_dim] & 0x3;
+  // The starting offset in a texel where this tensor will start copying to
+  const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
+
+  // The total packed texels this tensor will be copied from.
+  // The first texel of data in the packed dimension is filled from the
+  // remaining lanes of the current source texel, hence (4 - src_lane_offset)
+  // is added to the tensor size in the packed dimension.
+  const auto src_packed_size = utils::div_up_4(
+      (4 - src_lane_offset) +
+      dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
+
+  // The total packed texels this tensor will be copied to.
+  // The first texel of data in the packed dimension is written into the
+  // remaining lanes of the previous write, hence (4 - dst_lane_offset)
+  // is added to the tensor size in the packed dimension.
+  const auto dst_packed_size = utils::div_up_4(
+      (4 - dst_lane_offset) +
+      dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
+
+  // If the starting src offset is not 0 and the total packed texels exceed
+  // the source texel range
+  const bool has_additional_src_work =
+      src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
+  // If the starting dst offset is not 0 and the total packed texels exceed
+  // the destination texel range
+  const bool has_additional_dst_work =
+      dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
+
+  if (has_additional_src_work || has_additional_dst_work) {
+    global_wg_size[packed_dim]++; // Increase the global work group size in
+                                  // the packed dimension
+    final_range[packed_dim]++; // Increase the range in the packed dimension
   }

   auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -151,7 +145,7 @@ void add_copy_packed_dim_offset_node(
       // Parameter buffers
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
       nullptr,
       {},
       {
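
For reference, the sketch below walks through the boundary arithmetic from the copy node above in isolation. It is a minimal standalone illustration, not ExecuTorch code: div_up_4 and needs_extra_texel are local stand-ins, base_range plays the role of final_range[packed_dim], and the lane offset corresponds to offset[packed_dim] & 0x3.

#include <cstdint>
#include <iostream>

// Stand-in for utils::div_up_4: number of 4-lane texels needed to hold n lanes.
int32_t div_up_4(int32_t n) {
  return (n + 3) / 4;
}

// Mirrors the range-adjustment logic above: given the lane offset within the
// first texel, the tensor size along the packed dimension, and the base copy
// range in texels, decide whether one extra texel of range (and one extra
// slice of the global work group) is needed to cover the partially filled
// boundary texel.
bool needs_extra_texel(
    int32_t lane_offset,
    int32_t packed_dim_size,
    int32_t base_range) {
  const int32_t packed_size = div_up_4((4 - lane_offset) + packed_dim_size);
  return lane_offset != 0 && packed_size > base_range;
}

int main() {
  // A packed dimension of size 7 fits in div_up_4(7) = 2 texels when aligned,
  // but starting at lane 2 spills the data into a third texel, so one extra
  // texel of range is dispatched.
  std::cout << std::boolalpha
            << needs_extra_texel(/*lane_offset=*/2, /*packed_dim_size=*/7,
                                 /*base_range=*/2)
            << '\n'; // true
  // A texel-aligned copy (lane offset 0) needs no extra work.
  std::cout << needs_extra_texel(0, 7, 2) << '\n'; // false
  return 0;
}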