@@ -159,8 +159,7 @@ namespace snmalloc
       std::max(sizeof(uint64_t), sizeof(void*));
 
     /**
-     * Hook for architecture-specific optimisations. Does nothing in the
-     * default case.
+     * Hook for architecture-specific optimisations.
      */
     static SNMALLOC_FAST_PATH_INLINE void
     copy(void* dst, const void* src, size_t len)
@@ -179,6 +178,135 @@ namespace snmalloc
     }
   };
 
+  /**
+   * StrictProvenance architectures are prickly about their pointers. In
+   * particular, they may not permit misaligned loads and stores of
+   * pointer-sized data, even if they can have non-pointers in their
+   * pointer registers. On the other hand, pointers might be hiding anywhere
+   * they are architecturally permitted!
+   */
+  struct GenericStrictProvenance
+  {
+    static_assert(bits::is_pow2(sizeof(void*)));
+    /*
+     * It's not entirely clear what we would do if this were not the case.
+     * Best not think too hard about it now.
+     */
+    static_assert(alignof(void*) == sizeof(void*));
+
+    static constexpr size_t LargestRegisterSize = 16;
+
+    static SNMALLOC_FAST_PATH_INLINE void
+    copy(void* dst, const void* src, size_t len)
+    {
+      /*
+       * As a function of misalignment relative to pointers, how big do we need
+       * to be such that the span could contain an aligned pointer? We'd need
+       * to be big enough to contain the pointer and would need an additional
+       * however many bytes it would take to get us up to alignment. That is,
+       * (sizeof(void*) - src_misalign) except in the case that src_misalign is
+       * 0, when the answer is 0, which we can get with some bit-twiddling.
+       *
+       * Below that threshold, just use a jump table to move bytes around.
+       */
+      if (
+        len < sizeof(void*) +
+          (static_cast<size_t>(-static_cast<ptrdiff_t>(address_cast(src))) &
+           (alignof(void*) - 1)))
+      {
+        small_copies<2 * sizeof(void*) - 1, LargestRegisterSize>(dst, src, len);
+      }
+      /*
+       * Equally-misaligned segments could be holding pointers internally,
+       * assuming they're sufficiently large. In this case, perform unaligned
+       * operations at the top and bottom of the range. This check also
+       * suffices to include the case where both segments are
+       * alignof(void*)-aligned.
+       */
+      else if (
+        address_misalignment<alignof(void*)>(address_cast(src)) ==
+        address_misalignment<alignof(void*)>(address_cast(dst)))
+      {
+        /*
+         * Find the buffers' ends. Do this before the unaligned_start so that
+         * there are fewer dependencies in the instruction stream; it would be
+         * functionally equivalent to do so below.
+         */
+        auto dep = pointer_offset(dst, len);
+        auto sep = pointer_offset(src, len);
+
+        /*
+         * Come up to alignof(void*)-alignment using a jump table. This
+         * operation will move no pointers, since it serves to get us up to
+         * alignof(void*). Recall that unaligned_start takes its arguments by
+         * reference, so they will be aligned hereafter.
+         */
+        unaligned_start<alignof(void*), sizeof(long)>(dst, src, len);
+
+        /*
+         * Move aligned pointer *pairs* for as long as we can (possibly none).
+         * This generates load-pair/store-pair operations where we have them,
+         * and should be benign where we don't, looking like just a bit of loop
+         * unrolling with two loads and stores.
+         */
+        {
+          struct Ptr2
+          {
+            void* p[2];
+          };
+          if (sizeof(Ptr2) <= len)
+          {
+            auto dp = static_cast<Ptr2*>(dst);
+            auto sp = static_cast<const Ptr2*>(src);
+            for (size_t i = 0; i <= len - sizeof(Ptr2); i += sizeof(Ptr2))
+            {
+              *dp++ = *sp++;
+            }
+          }
+        }
+
+        /*
+         * After that copy loop, there can be at most one pointer-aligned and
+         * -sized region left. If there is one, copy it.
+         */
+        len = len & (2 * sizeof(void*) - 1);
+        if (sizeof(void*) <= len)
+        {
+          ptrdiff_t o = -static_cast<ptrdiff_t>(sizeof(void*));
+          auto dp =
+            pointer_align_down<alignof(void*)>(pointer_offset_signed(dep, o));
+          auto sp =
+            pointer_align_down<alignof(void*)>(pointer_offset_signed(sep, o));
+          *static_cast<void**>(dp) = *static_cast<void* const*>(sp);
+        }
+
+        /*
+         * There are up to sizeof(void*)-1 bytes left at the end, aligned at
+         * alignof(void*). Figure out where and how many...
+         */
+        len = len & (sizeof(void*) - 1);
+        dst = pointer_align_down<alignof(void*)>(dep);
+        src = pointer_align_down<alignof(void*)>(sep);
+        /*
+         * ... and use a jump table at the end, too. If we did the copy_end
+         * overlapping store backwards trick, we'd risk damaging the capability
+         * in the cell behind us.
+         */
+        small_copies<sizeof(void*), sizeof(long)>(dst, src, len);
+      }
+      /*
+       * Otherwise, we cannot use pointer-width operations because either the
+       * load or the store is going to be misaligned and so will trap.
+       * So, same dance, but with integer registers only.
+       */
+      else
+      {
+        block_copy<LargestRegisterSize>(dst, src, len);
+        copy_end<LargestRegisterSize>(dst, src, len);
+      }
+    }
+  };
+
 #if defined(__x86_64__) || defined(_M_X64)
   /**
    * x86-64 architecture. Prefers SSE registers for small and medium copies
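The threshold check at the top of GenericStrictProvenance::copy leans on one piece of bit-twiddling: the number of bytes needed to bring an address up to pointer alignment is (-addr) & (alignof(void*) - 1), which collapses to 0 when the address is already aligned. The sketch below restates that arithmetic with plain uintptr_t values instead of snmalloc's address_cast/address_misalignment helpers; the function names pad_to_pointer_alignment and could_contain_aligned_pointer are illustrative, not part of snmalloc.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Bytes needed to bring `addr` up to pointer alignment. This is
// (sizeof(void*) - misalignment), except that an already-aligned address
// needs 0 bytes; two's-complement negation masked to the low bits gives
// exactly that.
constexpr size_t pad_to_pointer_alignment(uintptr_t addr)
{
  return static_cast<size_t>(-static_cast<ptrdiff_t>(addr)) &
    (alignof(void*) - 1);
}

// The pointer-preserving path is only taken when the source span is long
// enough to hold at least one aligned, pointer-sized word.
constexpr bool could_contain_aligned_pointer(uintptr_t addr, size_t len)
{
  return len >= sizeof(void*) + pad_to_pointer_alignment(addr);
}

int main()
{
  // With 8-byte pointers: an 8-byte copy from an aligned address can hold a
  // pointer, but the same 8 bytes starting 3 bytes past alignment cannot;
  // that span would need 5 bytes of padding plus 8 bytes of payload, i.e. 13.
  static_assert(sizeof(void*) != 8 || could_contain_aligned_pointer(0x1000, 8));
  static_assert(sizeof(void*) != 8 || !could_contain_aligned_pointer(0x1003, 8));
  static_assert(sizeof(void*) != 8 || could_contain_aligned_pointer(0x1003, 13));
  assert(pad_to_pointer_alignment(0x1000) == 0);
  return 0;
}

Spans below this bound fall through to small_copies, which never moves a whole pointer and therefore never needs to preserve one.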
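The equally-misaligned branch decomposes the copy into an unaligned head, a run of pointer pairs, at most one lone pointer, and an unaligned tail. The following standalone sketch mirrors that shape under stated assumptions: src and dst are equally misaligned and non-overlapping, std::memcpy stands in for snmalloc's unaligned_start/small_copies jump tables, and copy_equally_misaligned is a hypothetical helper rather than anything in the library. On a CHERI target it is the word-sized assignments in the middle that keep capability tags intact.

#include <cstddef>
#include <cstdint>
#include <cstring>

void copy_equally_misaligned(void* dst, const void* src, size_t len)
{
  auto d = static_cast<unsigned char*>(dst);
  auto s = static_cast<const unsigned char*>(src);

  // Head: bytes needed to bring both (equally misaligned) pointers up to
  // pointer alignment, copied bytewise.
  size_t head = static_cast<size_t>(-reinterpret_cast<uintptr_t>(s)) &
    (alignof(void*) - 1);
  if (head > len)
    head = len;
  std::memcpy(d, s, head);
  d += head;
  s += head;
  len -= head;

  // Main loop: aligned pointer *pairs*, as in the Ptr2 loop above.
  struct Ptr2
  {
    void* p[2];
  };
  while (len >= sizeof(Ptr2))
  {
    *reinterpret_cast<Ptr2*>(d) = *reinterpret_cast<const Ptr2*>(s);
    d += sizeof(Ptr2);
    s += sizeof(Ptr2);
    len -= sizeof(Ptr2);
  }

  // At most one aligned, pointer-sized word can remain after the pair loop.
  if (len >= sizeof(void*))
  {
    *reinterpret_cast<void**>(d) = *reinterpret_cast<void* const*>(s);
    d += sizeof(void*);
    s += sizeof(void*);
    len -= sizeof(void*);
  }

  // Tail: fewer than sizeof(void*) bytes, copied forwards so that no word
  // behind the end of the buffer is rewritten.
  std::memcpy(d, s, len);
}

The forwards-only tail is the analogue of avoiding the copy_end overlapping-store trick: rewriting the word behind the end would clear the tag on any capability already stored there.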
@@ -288,7 +416,10 @@ namespace snmalloc
 #elif defined(__powerpc64__)
     PPC64Arch
 #else
-    GenericArch
+    std::conditional_t<
+      aal_supports<StrictProvenance>,
+      GenericStrictProvenance,
+      GenericArch>
 #endif
     ;
 
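The final hunk plugs the new backend into the architecture selection: when the AAL advertises StrictProvenance, std::conditional_t resolves to GenericStrictProvenance, otherwise the plain GenericArch is kept. A minimal sketch of that mechanism follows; the alias name Arch, the strict_provenance constant, and the empty stand-in structs are assumptions for illustration only.

#include <type_traits>

// Stand-ins for the real backends; in snmalloc the condition would be
// aal_supports<StrictProvenance>.
struct GenericArch {};
struct GenericStrictProvenance {};
constexpr bool strict_provenance = false; // assumption: a non-CHERI build

// Compile-time selection: strict-provenance targets get the pointer-aware
// copy, everything else keeps the generic one.
using Arch = std::conditional_t<
  strict_provenance,
  GenericStrictProvenance,
  GenericArch>;

static_assert(std::is_same_v<Arch, GenericArch>);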