
Commit 2b3897e

nwf-msr and mjp41 committed
memcpy vs. StrictProvenance
StrictProvenance architectures are likely to impose additional alignment requirements on their pointer-sized loads and stores. On the other hand, we must use pointer-sized loads and stores wherever possible to ensure a capability-preserving copy. Add a StrictProvenance-aware memcpy architecture implementation.

Thanks to Matt for suggesting the trick of avoiding even thinking about capability operations in the too-misaligned 16-31 byte cases, as well as for other helpful suggestions.

Co-authored-by: Matthew Parkinson <[email protected]>
1 parent 88a2740 commit 2b3897e
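As a rough illustration of the "too-misaligned 16-31 byte" observation in the message above (a stand-alone sketch, not snmalloc code, assuming 16-byte capabilities as on CHERI-like targets): the smallest copy that could contain an aligned capability is the capability size plus however many bytes it takes to reach alignment, so misaligned copies of 16-31 bytes that fall below this threshold can never carry a capability and may be moved with plain byte/integer operations.

#include <cstddef>
#include <cstdio>

// Hypothetical illustration: smallest length that could contain an aligned
// 16-byte capability, as a function of the source's misalignment m.
constexpr size_t CapSize = 16;

constexpr size_t threshold(size_t m)
{
  // (CapSize - m) bytes to reach alignment, then CapSize bytes of capability;
  // the unsigned bit-twiddle collapses the m == 0 case to exactly CapSize.
  return CapSize + ((0 - m) & (CapSize - 1));
}

int main()
{
  for (size_t m = 0; m < CapSize; m++)
    printf("misalignment %2zu -> needs at least %2zu bytes\n", m, threshold(m));
  // Prints 16 for m == 0 and 17..31 otherwise: any shorter copy cannot hold
  // an aligned capability, so capability handling can be skipped entirely.
}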

File tree

2 files changed (+144, -3 lines)


src/snmalloc/aal/address.h

Lines changed: 10 additions & 0 deletions
@@ -280,4 +280,14 @@ namespace snmalloc
     return pointer_diff_signed(base.unsafe_ptr(), cursor.unsafe_ptr());
   }
 
+  /**
+   * Compute the degree to which an address is misaligned relative to some
+   * putative alignment.
+   */
+  template<size_t alignment>
+  inline size_t address_misalignment(address_t a)
+  {
+    return static_cast<size_t>(a - pointer_align_down<alignment>(a));
+  }
+
 } // namespace snmalloc
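The new helper subtracts an address from its aligned-down self. A minimal stand-alone sketch of its behaviour (align_down and the uintptr_t-based address_t below are hypothetical stand-ins for snmalloc's pointer_align_down and address_t, which are not reproduced here): for power-of-two alignments the result is simply the low bits of the address.

#include <cassert>
#include <cstddef>
#include <cstdint>

using address_t = uintptr_t; // stand-in for snmalloc's address_t

template<size_t alignment>
constexpr address_t align_down(address_t a) // stand-in for pointer_align_down
{
  static_assert((alignment & (alignment - 1)) == 0, "power-of-two alignment");
  return a & ~static_cast<address_t>(alignment - 1);
}

template<size_t alignment>
constexpr size_t address_misalignment(address_t a)
{
  return static_cast<size_t>(a - align_down<alignment>(a));
}

int main()
{
  static_assert(address_misalignment<8>(0x1003) == 3);
  static_assert(address_misalignment<8>(0x1008) == 0);
  // Equivalent to masking the low bits when the alignment is a power of two.
  assert(address_misalignment<16>(0x1013) == (0x1013 & 0xF));
}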

src/snmalloc/global/memcpy.h

Lines changed: 134 additions & 3 deletions
@@ -159,8 +159,7 @@ namespace snmalloc
       std::max(sizeof(uint64_t), sizeof(void*));
 
     /**
-     * Hook for architecture-specific optimisations. Does nothing in the
-     * default case.
+     * Hook for architecture-specific optimisations.
      */
     static SNMALLOC_FAST_PATH_INLINE void
     copy(void* dst, const void* src, size_t len)
@@ -179,6 +178,135 @@ namespace snmalloc
     }
   };
 
+  /**
+   * StrictProvenance architectures are prickly about their pointers. In
+   * particular, they may not permit misaligned loads and stores of
+   * pointer-sized data, even if they can have non-pointers in their
+   * pointer registers. On the other hand, pointers might be hiding anywhere
+   * they are architecturally permitted!
+   */
+  struct GenericStrictProvenance
+  {
+    static_assert(bits::is_pow2(sizeof(void*)));
+    /*
+     * It's not entirely clear what we would do if this were not the case.
+     * Best not think too hard about it now.
+     */
+    static_assert(alignof(void*) == sizeof(void*));
+
+    static constexpr size_t LargestRegisterSize = 16;
+
+    static SNMALLOC_FAST_PATH_INLINE void
+    copy(void* dst, const void* src, size_t len)
+    {
+      /*
+       * As a function of misalignment relative to pointers, how big do we need
+       * to be such that the span could contain an aligned pointer? We'd need
+       * to be big enough to contain the pointer and would need an additional
+       * however many bytes it would take to get us up to alignment. That is,
+       * (sizeof(void*) - src_misalign) except in the case that src_misalign is
+       * 0, when the answer is 0, which we can get with some bit-twiddling.
+       *
+       * Below that threshold, just use a jump table to move bytes around.
+       */
+      if (
+        len < sizeof(void*) +
+          (static_cast<size_t>(-static_cast<ptrdiff_t>(address_cast(src))) &
+           (alignof(void*) - 1)))
+      {
+        small_copies<2 * sizeof(void*) - 1, LargestRegisterSize>(dst, src, len);
+      }
+      /*
+       * Equally-misaligned segments could be holding pointers internally,
+       * assuming they're sufficiently large. In this case, perform unaligned
+       * operations at the top and bottom of the range. This check also
+       * suffices to include the case where both segments are
+       * alignof(void*)-aligned.
+       */
+      else if (
+        address_misalignment<alignof(void*)>(address_cast(src)) ==
+        address_misalignment<alignof(void*)>(address_cast(dst)))
+      {
+        /*
+         * Find the buffers' ends. Do this before the unaligned_start so that
+         * there are fewer dependencies in the instruction stream; it would be
+         * functionally equivalent to do so below.
+         */
+        auto dep = pointer_offset(dst, len);
+        auto sep = pointer_offset(src, len);
+
+        /*
+         * Come up to alignof(void*)-alignment using a jump table. This
+         * operation will move no pointers, since it serves to get us up to
+         * alignof(void*). Recall that unaligned_start takes its arguments by
+         * reference, so they will be aligned hereafter.
+         */
+        unaligned_start<alignof(void*), sizeof(long)>(dst, src, len);
+
+        /*
+         * Move aligned pointer *pairs* for as long as we can (possibly none).
+         * This generates load-pair/store-pair operations where we have them,
+         * and should be benign where we don't, looking like just a bit of loop
+         * unrolling with two loads and stores.
+         */
+        {
+          struct Ptr2
+          {
+            void* p[2];
+          };
+          if (sizeof(Ptr2) <= len)
+          {
+            auto dp = static_cast<Ptr2*>(dst);
+            auto sp = static_cast<const Ptr2*>(src);
+            for (size_t i = 0; i <= len - sizeof(Ptr2); i += sizeof(Ptr2))
+            {
+              *dp++ = *sp++;
+            }
+          }
+        }
+
+        /*
+         * After that copy loop, there can be at most one pointer-aligned and
+         * -sized region left. If there is one, copy it.
+         */
+        len = len & (2 * sizeof(void*) - 1);
+        if (sizeof(void*) <= len)
+        {
+          ptrdiff_t o = -static_cast<ptrdiff_t>(sizeof(void*));
+          auto dp =
+            pointer_align_down<alignof(void*)>(pointer_offset_signed(dep, o));
+          auto sp =
+            pointer_align_down<alignof(void*)>(pointer_offset_signed(sep, o));
+          *static_cast<void**>(dp) = *static_cast<void* const*>(sp);
+        }
+
+        /*
+         * There are up to sizeof(void*)-1 bytes left at the end, aligned at
+         * alignof(void*). Figure out where and how many...
+         */
+        len = len & (sizeof(void*) - 1);
+        dst = pointer_align_down<alignof(void*)>(dep);
+        src = pointer_align_down<alignof(void*)>(sep);
+        /*
+         * ... and use a jump table at the end, too. If we did the copy_end
+         * overlapping store backwards trick, we'd risk damaging the capability
+         * in the cell behind us.
+         */
+        small_copies<sizeof(void*), sizeof(long)>(dst, src, len);
+      }
+      /*
+       * Otherwise, we cannot use pointer-width operations because one of
+       * the load or store is going to be misaligned and so will trap.
+       * So, same dance, but with integer registers only.
+       */
+      else
+      {
+        block_copy<LargestRegisterSize>(dst, src, len);
+        copy_end<LargestRegisterSize>(dst, src, len);
+      }
+    }
+  };
+
 #if defined(__x86_64__) || defined(_M_X64)
   /**
    * x86-64 architecture. Prefers SSE registers for small and medium copies
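One way to convince yourself of the length accounting in the aligned path added above (a hypothetical stand-alone check, not snmalloc code, using 8 as a stand-in for sizeof(void*)): the pair loop, the single pointer-sized copy and the byte tail always partition the post-alignment length exactly.

#include <cassert>
#include <cstddef>

int main()
{
  constexpr size_t P = 8; // stand-in for sizeof(void*) == alignof(void*)

  // len models the remaining length after unaligned_start has brought both
  // pointers up to alignment.
  for (size_t len = 0; len < 256; len++)
  {
    size_t moved = 0;

    // Pair loop: 2*P bytes at a time while a full pair remains.
    if (2 * P <= len)
      for (size_t i = 0; i <= len - 2 * P; i += 2 * P)
        moved += 2 * P;

    // At most one aligned pointer-sized chunk can be left over.
    size_t rest = len & (2 * P - 1);
    if (P <= rest)
      moved += P;

    // Up to P-1 trailing bytes go through the closing jump table.
    moved += rest & (P - 1);

    assert(moved == len); // every byte is accounted for exactly once
  }
}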
@@ -288,7 +416,10 @@ namespace snmalloc
 #elif defined(__powerpc64__)
     PPC64Arch
 #else
-    GenericArch
+    std::conditional_t<
+      aal_supports<StrictProvenance>,
+      GenericStrictProvenance,
+      GenericArch>
 #endif
     ;
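The final hunk swaps the architecture class in at compile time. A minimal sketch of that mechanism (the boolean and the empty classes below are stand-ins, not snmalloc's real trait or implementations): std::conditional_t selects one type or the other, so the choice costs nothing at run time.

#include <cstddef>
#include <type_traits>

struct GenericArch
{
  static void copy(void*, const void*, size_t) { /* integer-register path */ }
};

struct GenericStrictProvenance
{
  static void copy(void*, const void*, size_t) { /* capability-aware path */ }
};

// Stand-in for aal_supports<StrictProvenance>; true on CHERI-like targets.
constexpr bool strict_provenance = false;

using Arch =
  std::conditional_t<strict_provenance, GenericStrictProvenance, GenericArch>;

void my_memcpy(void* dst, const void* src, size_t len)
{
  Arch::copy(dst, src, len); // resolved entirely at compile time
}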
