aarch64: Add -msimd-memops option controlling SIMD usage in memset/memcpy

keith-packard · stephanosio · commit dd01da6c36ae · 2025-08-11T20:53:23.000+09:00
This option (enabled by default) preserves existing behavior by allowing use of Advanced SIMD registers while expanding memset/memcpy/memmove operations into inline instructions. Disabling this option prevents use of these registers for environments where the FPU may be disabled to reduce the cost of saving/restoring the processor state, such as in interrupt handlers. Signed-off-by: Keith Packard <keithp@keithp.com> (cherry picked from commit 65837c3)
diff --git a/gcc/common/config/aarch64/aarch64-common.cc b/gcc/common/config/aarch64/aarch64-common.cc
@@ -133,6 +133,10 @@ aarch64_handle_option (struct gcc_options *opts,
       opts->x_aarch64_flag_outline_atomics = val;
       return true;
 
+    case OPT_msimd_memops:
+      opts->x_aarch64_flag_simd_memops = val;
+      return true;
+
     default:
       return true;
     }
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
@@ -19371,6 +19371,8 @@ static const struct aarch64_attribute_info aarch64_attributes[] =
      OPT_msign_return_address_ },
   { "outline-atomics", aarch64_attr_bool, true, NULL,
      OPT_moutline_atomics},
+  { "simd-memops", aarch64_attr_bool, true, NULL,
+     OPT_msimd_memops},
   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
 };
 
@@ -26652,8 +26654,8 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
     return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
-  bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
-				   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
+  bool use_ldpq = TARGET_SIMD_MEMOPS && !(aarch64_tune_params.extra_tuning_flags
+					  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
 
   /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
   unsigned max_copy_size = use_ldpq ? 256 : 128;
@@ -26673,7 +26675,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
      ??? Although it would be possible to use LDP/STP Qn in streaming mode
      (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
      whether that would improve performance.  */
-  bool use_qregs = size > 24 && TARGET_SIMD;
+  bool use_qregs = size > 24 && TARGET_SIMD_MEMOPS;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26814,7 +26816,7 @@ aarch64_expand_setmem (rtx *operands)
   machine_mode cur_mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align memset may use the MOPS expansion.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD_MEMOPS
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
@@ -125,6 +125,13 @@
    of LSE instructions.  */
 #define TARGET_OUTLINE_ATOMICS (aarch64_flag_outline_atomics)
 
+#ifndef AARCH64_SIMD_MEMOPS_DEFAULT
+#define AARCH64_SIMD_MEMOPS_DEFAULT 1
+#endif
+
+/* Allow use of SIMD registers for memory copy and set expansions.  */
+#define TARGET_SIMD_MEMOPS (TARGET_SIMD && aarch64_flag_simd_memops)
+
 /* Align definitions of arrays, unions and structures so that
    initializations and copies can be made more efficient.  This is not
    ABI-changing, so it only affects places where we can see the
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
@@ -332,6 +332,10 @@ moutline-atomics
 Target Var(aarch64_flag_outline_atomics) Init(2) Save
 Generate local calls to out-of-line atomic operations.
 
+msimd-memops
+Target Var(aarch64_flag_simd_memops) Init(AARCH64_SIMD_MEMOPS_DEFAULT) Save
+Allow use of SIMD registers in memory set/copy expansions.
+
 -param=aarch64-vect-compare-costs=
 Target Joined UInteger Var(aarch64_vect_compare_costs) Init(1) IntegerRange(0, 1) Param
 When vectorizing, consider using multiple different approaches and use
diff --git a/gcc/config/aarch64/aarch64.opt.urls b/gcc/config/aarch64/aarch64.opt.urls
@@ -91,3 +91,5 @@ UrlSuffix(gcc/AArch64-Options.html#index-mstack-protector-guard-reg)
 mstack-protector-guard-offset=
 UrlSuffix(gcc/AArch64-Options.html#index-mstack-protector-guard-offset)
 
+msimd-memops
+UrlSuffix(gcc/AArch64-Options.html#index-msimd-memops)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
@@ -807,7 +807,7 @@ Objective-C and Objective-C++ Dialects}.
 -moverride=@var{string}  -mverbose-cost-dump
 -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{sysreg}
 -mstack-protector-guard-offset=@var{offset} -mtrack-speculation
--moutline-atomics -mearly-ldp-fusion -mlate-ldp-fusion}
+-moutline-atomics -mearly-ldp-fusion -mlate-ldp-fusion -msimd-memops}
 
 @emph{Adapteva Epiphany Options}
 @gccoptlist{-mhalf-reg-file  -mprefer-short-insn-regs
@@ -21357,6 +21357,16 @@ used directly.  The same applies when using @option{-mcpu=} when the
 selected cpu supports the @samp{lse} feature.
 This option is on by default.
 
+@opindex msimd-memops
+@opindex mno-simd-memops
+@item -msimd-memops
+@itemx -mno-simd-memops
+Enable or disable use of Advanced SIMD registers when expanding memory
+copy and memory set operations.  Use of these registers can improve
+performance and reduce instruction count for these operations.  This
+option is ignored unless Advanced SIMD registers are available.
+This option is on by default.
+
 @opindex march
 @item -march=@var{name}
 Specify the name of the target architecture and, optionally, one or