Commit 9e0ac71

feat: Enable non-transposed F32 reorders
Resolves [MLINFSW-1095]

- Enable 4 interleaved non-transposed F32 reorders
- Enable 8 interleaved non-transposed F32 reorders
- Minor fixes to enable non-transposed reorders
- Refactor tests to be able to test reorders
- Add non-transposed reorder tests

Change-Id: I674d592669d5217570c111e486236e8537832c18
Signed-off-by: Ryo Suzuki <[email protected]>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14222
Comments-Addressed: Arm Jenkins <[email protected]>
Benchmark: Arm Jenkins <[email protected]>
Reviewed-by: Gunes Bayir <[email protected]>
Tested-by: Arm Jenkins <[email protected]>
1 parent fcd1b0b commit 9e0ac71
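For orientation, here is a rough, self-contained sketch of what an interleave-by-4, non-transposed reorder produces. The function name and exact layout details are illustrative assumptions of mine, not the library's; the authoritative semantics live in arm_gemm::Transform and the OHWIo4/OHWIo8 weight formats. The idea: rows are grouped into panels of four and written element-interleaved along x, with zero padding for a ragged final panel.

// reorder_interleave4 is a hypothetical toy, not the library kernel.
#include <cstdio>
#include <vector>

// src is kmax x xmax, row-major; dst holds ceil(kmax/4)*4 * xmax floats.
void reorder_interleave4(const float *src, float *dst, int kmax, int xmax)
{
    const int interleave_by = 4;
    float *out = dst;
    for (int k0 = 0; k0 < kmax; k0 += interleave_by) // one panel of 4 rows
    {
        for (int x = 0; x < xmax; ++x) // walk along x
        {
            for (int r = 0; r < interleave_by; ++r) // interleave the panel's rows
            {
                const int k = k0 + r;
                *out++ = (k < kmax) ? src[k * xmax + x] : 0.0f; // zero-pad ragged tail
            }
        }
    }
}

int main()
{
    const int kmax = 3, xmax = 2; // ragged: the panel pads one missing row
    std::vector<float> src = {1, 2, 3, 4, 5, 6}; // 3x2, row-major
    std::vector<float> dst(4 * xmax, 0.0f);
    reorder_interleave4(src.data(), dst.data(), kmax, xmax);
    for (float v : dst)
        std::printf("%g ", v); // prints: 1 3 5 0 2 4 6 0
    std::printf("\n");
    return 0;
}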

7 files changed: +120 −100 lines

src/core/NEON/kernels/NEReorderKernel.cpp

Lines changed: 72 additions & 48 deletions
@@ -60,7 +60,10 @@ struct TransformParams
 
 std::map<TransformParams, void (*)(float *, const float *, int, int, int, int, int)> supported_float_transforms = {
     {{4, 1, true, arm_gemm::VLType::None}, &arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None, float, float>},
+    {{4, 1, false, arm_gemm::VLType::None}, &arm_gemm::Transform<4, 1, false, arm_gemm::VLType::None, float, float>},
+    {{8, 1, false, arm_gemm::VLType::None}, &arm_gemm::Transform<8, 1, false, arm_gemm::VLType::None, float, float>},
 #ifdef ARM_COMPUTE_ENABLE_SVE
+    // When there is an asm kernel, use formula in transform.cpp to get the interleave_by_ number
     {{1, 1, true, arm_gemm::VLType::SVE}, &arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE, float, float>},
 #endif // ARM_COMPUTE_ENABLE_SVE
 };

@@ -72,6 +75,17 @@ std::map<TransformParams, void (*)(bfloat16 *, const float *, int, int, int, int
 #endif // ARM_COMPUTE_ENABLE_SVE
 };
 
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+// Calculate the interleave_by parameter needed for SVE kernels
+// using the formula listed in transform.cpp
+template <typename TOut>
+inline int get_sve_interleave_by(int interleave_by, int block_by)
+{
+    return interleave_by / (get_vector_length<TOut>() / block_by);
+}
+#endif // ARM_COMPUTE_ENABLE_SVE
+
 } // namespace
 
 void NEReorderKernel::run(const Window &window, const ThreadInfo &info)

@@ -84,7 +98,7 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
     const int jump_rows = ksize_rows_elements * window.x().start();
     const int k_start = window.x().start() * _ksize;
     const int k_end = std::min(window.x().end() * _ksize, _kmax);
-    const int stride = _kmax;
+    const int stride = _transpose ? _kmax : _xmax;
     const int block_by = arm_compute::block_by(_output_wf);
     const int interleave_by = arm_compute::interleave_by(_output_wf);
     ARM_COMPUTE_ERROR_ON(interleave_by != 4 && interleave_by != 8);

@@ -96,22 +110,46 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
     {
         case DataType::F32:
         {
-            // Interleave_by is different for SVE cases. Refer to src/core/NEON/kernels/arm_gemm/transform.cpp
-            const int interleave_by_ = interleave_by == 8 ? interleave_by / (8 / block_by) : 4;
-            supported_float_transforms[{interleave_by_, block_by, _transpose,
-                                        interleave_by == 8 ? arm_gemm::VLType::SVE : arm_gemm::VLType::None}](
-                reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()),
-                stride, k_start, k_end, 0, _xmax);
+            void (*transform_func)(float *, const float *, int, int, int, int, int) = nullptr;
+#ifdef ARM_COMPUTE_ENABLE_SVE
+            if (CPUInfo::get().has_sve())
+            {
+                TransformParams tparams = {get_sve_interleave_by<float>(interleave_by, block_by), block_by, _transpose,
+                                           arm_gemm::VLType::SVE};
+                if (supported_float_transforms.count(tparams))
+                {
+                    transform_func = supported_float_transforms[tparams];
+                }
+            }
+#endif // ARM_COMPUTE_ENABLE_SVE
+            if (transform_func == nullptr)
+            {
+                transform_func =
+                    supported_float_transforms[{interleave_by, block_by, _transpose, arm_gemm::VLType::None}];
+            }
+            transform_func(reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+                           reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
             break;
         }
         case DataType::BFLOAT16:
         {
-            // Interleave_by is different for SVE cases. Refer to transform.cpp
-            const int interleave_by_ = interleave_by == 8 ? interleave_by / (16 / block_by) : 4;
-            supported_bf16_transforms[{interleave_by_, block_by, _transpose,
-                                       interleave_by == 8 ? arm_gemm::VLType::SVE : arm_gemm::VLType::None}](
-                reinterpret_cast<bfloat16 *>(_output->buffer()) + jump_rows,
-                reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+            void (*transform_func)(bfloat16 *, const float *, int, int, int, int, int) = nullptr;
+#ifdef ARM_COMPUTE_ENABLE_SVE
+            if (CPUInfo::get().has_sve())
+            {
+                TransformParams tparams = {get_sve_interleave_by<bfloat16>(interleave_by, block_by), block_by,
+                                           _transpose, arm_gemm::VLType::SVE};
+                if (supported_bf16_transforms.count(tparams))
+                    transform_func = supported_bf16_transforms[tparams];
+            }
+#endif // ARM_COMPUTE_ENABLE_SVE
+            if (transform_func == nullptr)
+            {
+                transform_func =
+                    supported_bf16_transforms[{interleave_by, block_by, _transpose, arm_gemm::VLType::None}];
+            }
+            transform_func(reinterpret_cast<bfloat16 *>(_output->buffer()) + jump_rows,
+                           reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
             break;
         }
         default:

@@ -237,52 +275,38 @@ Status NEReorderKernel::validate(const ITensorInfo *input,
     int interleave_by = arm_compute::interleave_by(output_wf);
     int block_by = arm_compute::block_by(output_wf);
     ARM_COMPUTE_RETURN_ERROR_ON(interleave_by != 4 && interleave_by != 8);
-    if (interleave_by == 8)
-    {
-#ifdef ARM_COMPUTE_ENABLE_SVE
-        ARM_COMPUTE_RETURN_ERROR_ON(!Scheduler::get().cpu_info().has_sve() ||
-                                    arm_gemm::utils::get_vector_length<float>() != 8);
-#else  // ARM_COMPUTE_ENABLE_SVE
-        ARM_COMPUTE_RETURN_ERROR_MSG("SVE format requested on non-SVE machine");
-#endif // ARM_COMPUTE_ENABLE_SVE
-    }
     ksize = interleave_by;
 
-    if (transpose)
-    {
-        // output k_dim needs to be same as input but multiple of ksize
-        int32_t rnd_up_input_kdim = arm_compute::ceil_to_multiple<int32_t, int32_t>(input_k_dim, ksize);
-        ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
-        // output x_dim needs to be same as input
-        ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
-    }
-    else
-    {
-        // output x_dim needs to be same as input but multiple of ksize
-        int32_t rnd_up_input_xdim = arm_compute::ceil_to_multiple<int32_t, int32_t>(input_x_dim, ksize);
-        ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_xdim != output_x_dim);
-        // output k_dim needs to be same as input
-        ARM_COMPUTE_RETURN_ERROR_ON(input_k_dim != output_k_dim);
-    }
+    // output k_dim needs to be same as input but multiple of ksize
+    int32_t rnd_up_input_kdim = arm_compute::ceil_to_multiple<int32_t, int32_t>(input_k_dim, ksize);
+    ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
+    // output x_dim needs to be same as input
+    ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
 
     switch (output->data_type())
     {
         case DataType::F32:
         {
-            // Interleave_by is different for SVE cases. Refer to transform.cpp
-            const int interleave_by_ = interleave_by == 8 ? interleave_by / (8 / block_by) : 4;
-            ARM_COMPUTE_RETURN_ERROR_ON(!supported_float_transforms.count(
-                {interleave_by_, block_by, transpose,
-                 interleave_by == 8 ? arm_gemm::VLType::SVE : arm_gemm::VLType::None}));
+#ifdef ARM_COMPUTE_ENABLE_SVE
+            if (CPUInfo::get().has_sve() &&
+                supported_float_transforms.count({get_sve_interleave_by<float>(interleave_by, block_by), block_by,
                                                  transpose, arm_gemm::VLType::SVE}))
+                break;
+#endif // ARM_COMPUTE_ENABLE_SVE
+            ARM_COMPUTE_RETURN_ERROR_ON(
+                !supported_float_transforms.count({interleave_by, block_by, transpose, arm_gemm::VLType::None}));
             break;
         }
         case DataType::BFLOAT16:
        {
-            // Interleave_by is different for SVE cases. Refer to transform.cpp
-            const int interleave_by_ = interleave_by == 8 ? interleave_by / (16 / block_by) : 4;
-            ARM_COMPUTE_RETURN_ERROR_ON(!supported_bf16_transforms.count(
-                {interleave_by_, block_by, transpose,
-                 interleave_by == 8 ? arm_gemm::VLType::SVE : arm_gemm::VLType::None}));
+#ifdef ARM_COMPUTE_ENABLE_SVE
+            if (CPUInfo::get().has_sve() &&
+                supported_bf16_transforms.count({get_sve_interleave_by<bfloat16>(interleave_by, block_by), block_by,
                                                 transpose, arm_gemm::VLType::SVE}))
+                break;
+#endif // ARM_COMPUTE_ENABLE_SVE
+            ARM_COMPUTE_RETURN_ERROR_ON(
+                !supported_bf16_transforms.count({interleave_by, block_by, transpose, arm_gemm::VLType::None}));
            break;
        }
        default:
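Two details above benefit from a worked example. First, the stride now depends on orientation: a transposed reorder walks the source with stride _kmax, while a non-transposed one uses _xmax. Second, get_sve_interleave_by decides whether an SVE table entry exists for the requested format. The snippet below is my standalone restatement of the formula with assumed values; the real vector length comes from arm_gemm's get_vector_length<T>().

// Standalone restatement of the SVE interleave mapping, for illustration only.
constexpr int sve_interleave_by(int interleave_by, int vector_length, int block_by)
{
    return interleave_by / (vector_length / block_by);
}

// On a 256-bit SVE machine a vector holds 8 floats, so a requested
// interleave-by-8 F32 reorder maps to the {1, 1, transpose, VLType::SVE}
// entry of supported_float_transforms:
static_assert(sve_interleave_by(8, 8, 1) == 1, "OHWIo8 F32 on 256-bit SVE");
// If no SVE entry matches, run() and validate() fall back to the generic
// {interleave_by, block_by, transpose, VLType::None} table entry.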

src/core/NEON/kernels/arm_gemm/transform.cpp

Lines changed: 4 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024 Arm Limited.
+ * Copyright (c) 2021-2025 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *

@@ -126,6 +126,9 @@ void Transform(
 
 #include "transforms/list.hpp"
 
+template void Transform<4, 1, false, VLType::None>(float *, const float *, int, int, int, int, int);
+template void Transform<8, 1, false, VLType::None>(float *, const float *, int, int, int, int, int);
+
 // We don't have assembler transforms for AArch32, generate templated ones here.
 #ifdef __arm__
 template void Transform<8, 1, true, VLType::None>(float *, const float *, int, int, int, int, int);
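Context for the two added lines: transform.cpp compiles the generic Transform template body in a single translation unit, so every <IntBy, BlockBy, Transposed, VLType> combination referenced elsewhere (here, by NEReorderKernel's new non-transposed map entries) must be explicitly instantiated there, or the build fails to link with undefined references. A generic sketch of the pattern, with hypothetical names and a simplified signature:

// transform_like.h -- declaration only; the template body is not visible here.
template <int IntBy, bool Transposed, typename T>
void TransformLike(T *out, const T *in, int stride, int k0, int kmax, int x0, int xmax);

// transform_like.cpp -- definition plus explicit instantiations for the
// combinations other translation units are allowed to call.
template <int IntBy, bool Transposed, typename T>
void TransformLike(T *out, const T *in, int stride, int k0, int kmax, int x0, int xmax)
{
    // ... generic implementation ...
}

template void TransformLike<4, false, float>(float *, const float *, int, int, int, int, int);
template void TransformLike<8, false, float>(float *, const float *, int, int, int, int, int);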

tests/datasets/ReorderLayerDataset.h

Lines changed: 27 additions & 34 deletions
@@ -38,20 +38,18 @@ namespace datasets
 class ReorderLayerDataset
 {
 public:
-    using type = std::tuple<TensorShape, TensorShape, WeightFormat, WeightFormat, bool>;
+    using type = std::tuple<TensorShape, TensorShape, WeightFormat, WeightFormat>;
 
     struct iterator
     {
         iterator(std::vector<TensorShape>::const_iterator in_it,
                  std::vector<TensorShape>::const_iterator out_it,
                  std::vector<WeightFormat>::const_iterator _wf_in_it,
-                 std::vector<WeightFormat>::const_iterator _wf_out_it,
-                 std::vector<bool>::const_iterator _transposes_it)
+                 std::vector<WeightFormat>::const_iterator _wf_out_it)
             : _in_it{ std::move(in_it) },
               _out_it{ std::move(out_it) },
              _wf_in_it{ std::move(_wf_in_it) },
-              _wf_out_it{ std::move(_wf_out_it) },
-              _transposes_it{ std::move(_transposes_it) }
+              _wf_out_it{ std::move(_wf_out_it) }
        {
        }
 
@@ -62,13 +60,12 @@ class ReorderLayerDataset
            description << "Out=" << *_out_it << ":";
            description << "Wf_In=" << *_wf_in_it << ":";
            description << "Wf_Out=" << *_wf_out_it;
-            description << "Transpose=" << *_transposes_it;
            return description.str();
        }
 
        ReorderLayerDataset::type operator*() const
        {
-            return std::make_tuple(*_in_it, *_out_it, *_wf_in_it, *_wf_out_it, *_transposes_it);
+            return std::make_tuple(*_in_it, *_out_it, *_wf_in_it, *_wf_out_it);
        }
 
        iterator &operator++()
@@ -77,7 +74,6 @@ class ReorderLayerDataset
            ++_out_it;
            ++_wf_in_it;
            ++_wf_out_it;
-            ++_transposes_it;
 
            return *this;
        }
@@ -87,26 +83,24 @@ class ReorderLayerDataset
        std::vector<TensorShape>::const_iterator _out_it;
        std::vector<WeightFormat>::const_iterator _wf_in_it;
        std::vector<WeightFormat>::const_iterator _wf_out_it;
-        std::vector<bool>::const_iterator _transposes_it;
    };
 
    iterator begin() const
    {
-        return iterator(_in_shapes.begin(), _out_shapes.begin(), _in_wfs.begin(), _out_wfs.begin(), _transposes.begin());
+        return iterator(_in_shapes.begin(), _out_shapes.begin(), _in_wfs.begin(), _out_wfs.begin());
    }
 
    int size() const
    {
-        return std::min(_in_shapes.size(), std::min(_out_shapes.size(), std::min(_in_wfs.size(), std::min(_out_wfs.size(), _transposes.size()))));
+        return std::min(_in_shapes.size(), std::min(_out_shapes.size(), std::min(_in_wfs.size(), _out_wfs.size())));
    }
 
-    void add_config(TensorShape in, TensorShape out, WeightFormat in_wf, WeightFormat out_wf, bool transpose)
+    void add_config(TensorShape in, TensorShape out, WeightFormat in_wf, WeightFormat out_wf)
    {
        _in_shapes.emplace_back(std::move(in));
        _out_shapes.emplace_back(std::move(out));
        _in_wfs.emplace_back(std::move(in_wf));
        _out_wfs.emplace_back(std::move(out_wf));
-        _transposes.emplace_back(transpose);
    }
 
    // protected:
@@ -118,7 +112,6 @@ class ReorderLayerDataset
    std::vector<TensorShape> _out_shapes{};
    std::vector<WeightFormat> _in_wfs{};
    std::vector<WeightFormat> _out_wfs{};
-    std::vector<bool> _transposes{};
 };
 
 /** [ReorderLayer datasets] **/
@@ -128,16 +121,16 @@ class ReorderLayerDatasetBlock4 final : public ReorderLayerDataset
 public:
    ReorderLayerDatasetBlock4()
    {
-        add_config(TensorShape(10U, 9U), TensorShape(10U, 12U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(16U, 16U), TensorShape(16U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(10U, 511U), TensorShape(10U, 512U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(234U, 301U), TensorShape(234U, 304U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(1024U, 1024U), TensorShape(1024U, 1024U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(10U, 9U, 1U, 1U), TensorShape(10U, 12U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(16U, 16U, 1U, 1U), TensorShape(16U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(10U, 511U, 1U, 1U), TensorShape(10U, 512U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(234U, 301U, 1U, 1U), TensorShape(234U, 304U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
-        add_config(TensorShape(1024U, 1024U, 1U, 1U), TensorShape(1024U, 1024U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4, true);
+        add_config(TensorShape(10U, 9U), TensorShape(10U, 12U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(16U, 16U), TensorShape(16U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(10U, 511U), TensorShape(10U, 512U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(234U, 301U), TensorShape(234U, 304U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(1024U, 1024U), TensorShape(1024U, 1024U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(10U, 9U, 1U, 1U), TensorShape(10U, 12U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(16U, 16U, 1U, 1U), TensorShape(16U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(10U, 511U, 1U, 1U), TensorShape(10U, 512U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(234U, 301U, 1U, 1U), TensorShape(234U, 304U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(1024U, 1024U, 1U, 1U), TensorShape(1024U, 1024U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
    }
 };
 
@@ -146,16 +139,16 @@ class ReorderLayerDatasetBlock8 final : public ReorderLayerDataset
 public:
    ReorderLayerDatasetBlock8()
    {
-        add_config(TensorShape(10U, 9U), TensorShape(10U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(16U, 16U), TensorShape(16U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(10U, 511U), TensorShape(10U, 512U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(234U, 301U), TensorShape(234U, 304U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(1024U, 1024U), TensorShape(1024U, 1024U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(10U, 9U, 1U, 1U), TensorShape(10U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(16U, 16U, 1U, 1U), TensorShape(16U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(10U, 511U, 1U, 1U), TensorShape(10U, 512U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(234U, 301U, 1U, 1U), TensorShape(234U, 304U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
-        add_config(TensorShape(1024U, 1024U, 1U, 1U), TensorShape(1024U, 1024U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8, true);
+        add_config(TensorShape(10U, 9U), TensorShape(10U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(16U, 16U), TensorShape(16U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(10U, 511U), TensorShape(10U, 512U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(234U, 301U), TensorShape(234U, 304U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(1024U, 1024U), TensorShape(1024U, 1024U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(10U, 9U, 1U, 1U), TensorShape(10U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(16U, 16U, 1U, 1U), TensorShape(16U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(10U, 511U, 1U, 1U), TensorShape(10U, 512U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(234U, 301U, 1U, 1U), TensorShape(234U, 304U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(1024U, 1024U, 1U, 1U), TensorShape(1024U, 1024U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
    }
 };
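The transpose flag leaves the dataset and is instead crossed in at the test level (see the test changes below), so each shape configuration is written once. Note how every config pairs an input shape with an output shape whose reordered dimension is rounded up to a multiple of the block size, matching the check in NEReorderKernel::validate. A self-checking sketch of that rounding (ceil_to_multiple restated here for illustration; the library's own helper lives in arm_compute):

constexpr int ceil_to_multiple(int value, int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

static_assert(ceil_to_multiple(9, 4) == 12, "(10U, 9U) -> (10U, 12U) for OHWIo4");
static_assert(ceil_to_multiple(301, 4) == 304, "(234U, 301U) -> (234U, 304U) for OHWIo4");
static_assert(ceil_to_multiple(9, 8) == 16, "(10U, 9U) -> (10U, 16U) for OHWIo8");
static_assert(ceil_to_multiple(511, 8) == 512, "(10U, 511U) -> (10U, 512U) for OHWIo8");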

tests/validation/NEON/ReorderLayer.cpp

Lines changed: 2 additions & 2 deletions
@@ -80,7 +80,7 @@ DATA_TEST_CASE(ValidateReorderOHWIo8, framework::DatasetMode::ALL, combine(
    }
 }
 
-FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), make("Transpose", {true, false}), make("DataType", DataType::F32)))
 {
    // Validate output
    if (_hardware_supports)

@@ -90,7 +90,7 @@ FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::Dataset
    }
 }
 #endif // ARM_COMPUTE_ENABLE_SVE
 
-FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), make("Transpose", {true, false}), make("DataType", DataType::F32)))
 {
    // Validate output
    validate(Accessor(_target), _reference);