-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[clang] add array out-of-bounds access constraints using llvm.assume #159046
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
7fdec0a
702d9dd
ec1024d
77e2606
0ed60c2
1c11e60
033a1ce
f46618d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4559,6 +4559,152 @@ void CodeGenFunction::EmitCountedByBoundsChecking( | |
} | ||
} | ||
|
||
/// Emit array bounds constraints using llvm.assume for optimization hints. | ||
/// | ||
/// C Standard (ISO/IEC 9899:2011 - C11) | ||
/// Section J.2 (Undefined behavior): An array subscript is out of range, even | ||
/// if an object is apparently accessible with the given subscript (as in the | ||
/// lvalue expression a[1][7] given the declaration int a[4][5]) (6.5.6). | ||
/// | ||
/// Section 6.5.6 (Additive operators): If both the pointer operand and the | ||
/// result point to elements of the same array object, or one past the last | ||
/// element of the array object, the evaluation shall not produce an overflow; | ||
/// otherwise, the behavior is undefined. | ||
/// | ||
/// C++ Standard (ISO/IEC 14882 - 2017) | ||
/// Section 8.7 (Additive operators): | ||
/// 4 When an expression that has integral type is added to or subtracted from a | ||
/// pointer, the result has the type of the pointer operand. If the expression | ||
/// P points to element x[i] of an array object x with n elements,^86 the | ||
/// expressions P + J and J + P (where J has the value j) point to the | ||
/// (possibly-hypothetical) element x[i + j] if 0 ≤ i + j ≤ n; otherwise, the | ||
/// behavior is undefined. Likewise, the expression P - J points to the | ||
/// (possibly-hypothetical) element x[i − j] if 0 ≤ i − j ≤ n; otherwise, the | ||
/// behavior is undefined. | ||
/// ^86 A pointer past the last element of an array x of n elements is | ||
sebpop marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// considered to be equivalent to a pointer to a hypothetical element x[n] | ||
/// for this purpose; see 6.9.2. | ||
/// | ||
/// This function emits llvm.assume statements to inform the optimizer that | ||
/// array subscripts are within bounds, enabling better optimization without | ||
/// duplicating side effects from the subscript expression. The IndexVal | ||
/// parameter should be the already-emitted index value to avoid re-evaluation. | ||
/// | ||
/// Code that intentionally accesses out-of-bounds (UB) may break with | ||
/// optimizations. Only applies to constant-size arrays (not pointers, VLAs, or | ||
/// flexible arrays.) Disabled when -fsanitize=array-bounds is active. | ||
/// | ||
void CodeGenFunction::EmitArrayBoundsConstraints(const ArraySubscriptExpr *E, | ||
llvm::Value *IndexVal) { | ||
// Disable with -fno-assume-array-bounds. | ||
if (!CGM.getCodeGenOpts().AssumeArrayBounds) | ||
return; | ||
|
||
// Disable at -O0. | ||
if (CGM.getCodeGenOpts().OptimizationLevel == 0) | ||
return; | ||
|
||
// Disable with array-bounds sanitizer. | ||
if (SanOpts.has(SanitizerKind::ArrayBounds)) | ||
return; | ||
|
||
// Use the provided IndexVal to avoid duplicating side effects. | ||
// The caller has already emitted the index expression once. | ||
if (!IndexVal) | ||
return; | ||
|
||
const Expr *Base = E->getBase(); | ||
const Expr *Idx = E->getIdx(); | ||
QualType BaseType = Base->getType(); | ||
|
||
if (const auto *ICE = dyn_cast<ImplicitCastExpr>(Base)) { | ||
if (ICE->getCastKind() == CK_ArrayToPointerDecay) { | ||
BaseType = ICE->getSubExpr()->getType(); | ||
} | ||
} | ||
|
||
// Handle both constant arrays and VLAs (variable-length arrays.) | ||
const ConstantArrayType *CAT = getContext().getAsConstantArrayType(BaseType); | ||
llvm::Value *VLASize = nullptr; | ||
|
||
if (!CAT) { | ||
if (const VariableArrayType *VAT = | ||
getContext().getAsVariableArrayType(BaseType)) | ||
VLASize = getVLASize(VAT).NumElts; | ||
else | ||
return; // Not a constant or VLA. | ||
} | ||
|
||
llvm::APInt ArraySize; | ||
if (CAT) | ||
ArraySize = CAT->getSize(); | ||
|
||
// Don't generate assumes for flexible array member pattern. | ||
// Size-1 arrays: "struct { int len; char data[1]; }" (pre-C99 idiom.) | ||
// Zero-length arrays: "struct { int len; char data[0]; }" (GCC extension | ||
// https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html) | ||
// Both patterns use arrays as placeholders for variable-length data. | ||
if (CAT && (ArraySize == 0 || ArraySize == 1)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should respect StrictFlexArraysLevel, probably. |
||
if (const auto *ME = dyn_cast<MemberExpr>(Base->IgnoreParenImpCasts())) { | ||
if (const auto *FD = dyn_cast<FieldDecl>(ME->getMemberDecl())) { | ||
const RecordDecl *RD = FD->getParent(); | ||
// Check if this field is the last field in the record. | ||
// Only the last field can be a flexible array member. | ||
const FieldDecl *LastField = nullptr; | ||
for (const auto *Field : RD->fields()) | ||
LastField = Field; | ||
if (LastField == FD) | ||
// This is a zero-length or size-1 array as the last field. | ||
// Likely a flexible array member pattern - skip assumes. | ||
return; | ||
} | ||
} | ||
} | ||
|
||
QualType IdxType = Idx->getType(); | ||
llvm::Type *IndexType = ConvertType(IdxType); | ||
llvm::Value *ArraySizeVal; | ||
|
||
if (CAT) | ||
// Constant array: use compile-time size. | ||
ArraySizeVal = | ||
llvm::ConstantInt::get(IndexType, ArraySize.getLimitedValue()); | ||
else | ||
// VLA: use runtime size. | ||
ArraySizeVal = | ||
VLASize->getType() == IndexType | ||
? VLASize | ||
: Builder.CreateIntCast(VLASize, IndexType, false, "vla.size.cast"); | ||
|
||
// Ensure index value has the same type as our constants. | ||
if (IndexVal->getType() != IndexType) { | ||
bool IsSigned = IdxType->isSignedIntegerOrEnumerationType(); | ||
IndexVal = Builder.CreateIntCast(IndexVal, IndexType, IsSigned, "idx.cast"); | ||
} | ||
|
||
// Create bounds constraint: 0 <= index && index < size. | ||
// C arrays are 0-based, so valid indices are [0, size-1]. | ||
// This enforces the C18 standard requirement that array subscripts | ||
// must be "greater than or equal to zero and less than the size of the | ||
// array." | ||
if (IdxType->isSignedIntegerOrEnumerationType()) { | ||
sebpop marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// For signed indices: index >= 0 && index < size. | ||
llvm::Value *Zero = llvm::ConstantInt::get(IndexType, 0); | ||
llvm::Value *LowerBound = | ||
Builder.CreateICmpSGE(IndexVal, Zero, "idx.ge.zero"); | ||
llvm::Value *UpperBound = | ||
Builder.CreateICmpSLT(IndexVal, ArraySizeVal, "idx.lt.size"); | ||
llvm::Value *BoundsConstraint = | ||
Builder.CreateAnd(LowerBound, UpperBound, "bounds.constraint"); | ||
sebpop marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Builder.CreateAssumption(BoundsConstraint); | ||
} else { | ||
// For unsigned indices: index < size (>= 0 is implicit.) | ||
llvm::Value *UpperBound = | ||
Builder.CreateICmpULT(IndexVal, ArraySizeVal, "idx.lt.size"); | ||
Builder.CreateAssumption(UpperBound); | ||
} | ||
} | ||
|
||
LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, | ||
bool Accessed) { | ||
// The index must always be an integer, which is not an aggregate. Emit it | ||
|
@@ -4588,13 +4734,20 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, | |
}; | ||
IdxPre = nullptr; | ||
|
||
// Array bounds constraints will be emitted after index evaluation to avoid | ||
// duplicating side effects from the index expression. | ||
|
||
// If the base is a vector type, then we are forming a vector element lvalue | ||
// with this subscript. | ||
if (E->getBase()->getType()->isSubscriptableVectorType() && | ||
!isa<ExtVectorElementExpr>(E->getBase())) { | ||
// Emit the vector as an lvalue to get its address. | ||
LValue LHS = EmitLValue(E->getBase()); | ||
auto *Idx = EmitIdxAfterBase(/*Promote*/false); | ||
|
||
// Emit array bounds constraints for vector subscripts. | ||
EmitArrayBoundsConstraints(E, Idx); | ||
|
||
assert(LHS.isSimple() && "Can only subscript lvalue vectors here!"); | ||
return LValue::MakeVectorElt(LHS.getAddress(), Idx, E->getBase()->getType(), | ||
LHS.getBaseInfo(), TBAAAccessInfo()); | ||
|
@@ -4635,6 +4788,10 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, | |
Addr = EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); | ||
auto *Idx = EmitIdxAfterBase(/*Promote*/true); | ||
|
||
// Emit array bounds constraints for VLA access (though VLAs typically don't | ||
// have constant bounds). | ||
EmitArrayBoundsConstraints(E, Idx); | ||
|
||
// The element count here is the total number of non-VLA elements. | ||
llvm::Value *numElements = getVLASize(vla).NumElts; | ||
|
||
|
@@ -4659,6 +4816,9 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, | |
Addr = EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); | ||
auto *Idx = EmitIdxAfterBase(/*Promote*/true); | ||
|
||
// Emit array bounds constraints for ObjC interface access. | ||
EmitArrayBoundsConstraints(E, Idx); | ||
|
||
CharUnits InterfaceSize = getContext().getTypeSizeInChars(OIT); | ||
llvm::Value *InterfaceSizeVal = | ||
llvm::ConstantInt::get(Idx->getType(), InterfaceSize.getQuantity()); | ||
|
@@ -4694,6 +4854,9 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, | |
ArrayLV = EmitLValue(Array); | ||
auto *Idx = EmitIdxAfterBase(/*Promote*/true); | ||
|
||
// Emit array bounds constraints for optimization. | ||
EmitArrayBoundsConstraints(E, Idx); | ||
|
||
if (SanOpts.has(SanitizerKind::ArrayBounds)) | ||
EmitCountedByBoundsChecking(Array, Idx, ArrayLV.getAddress(), | ||
E->getIdx()->getType(), Array->getType(), | ||
|
@@ -4737,6 +4900,10 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, | |
Address BaseAddr = | ||
EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); | ||
auto *Idx = EmitIdxAfterBase(/*Promote*/true); | ||
|
||
// Emit array bounds constraints for pointer-based array access. | ||
EmitArrayBoundsConstraints(E, Idx); | ||
|
||
QualType ptrType = E->getBase()->getType(); | ||
Addr = emitArraySubscriptGEP(*this, BaseAddr, Idx, E->getType(), | ||
!getLangOpts().PointerOverflowDefined, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
// RUN: %clang_cc1 -emit-llvm -O2 -fassume-array-bounds %s -o - | FileCheck %s | ||
// Test that array bounds constraints are NOT applied to cases that might | ||
// break real-world code with intentional out-of-bounds access patterns. | ||
|
||
// C18 standard allows one-past-the-end pointers, and some legacy code | ||
// intentionally accesses out-of-bounds for performance or compatibility. | ||
// This test verifies that bounds constraints are only applied to safe cases. | ||
|
||
// CHECK-LABEL: define {{.*}} @test_zero_length_array | ||
struct ZeroLengthData { | ||
int count; | ||
int items[0]; // GNU C extension: zero-length array | ||
}; | ||
|
||
int test_zero_length_array(struct ZeroLengthData *d, int i) { | ||
// CHECK-NOT: call void @llvm.assume | ||
// Zero-length array as last field should not generate bounds constraints. | ||
// See https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html | ||
return d->items[i]; | ||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_flexible_array_member | ||
struct Data { | ||
int count; | ||
int items[1]; // Flexible array member pattern (pre-C99 style) | ||
}; | ||
|
||
int test_flexible_array_member(struct Data *d, int i) { | ||
// CHECK-NOT: call void @llvm.assume | ||
// Flexible array member pattern (size 1 array as last field) should NOT | ||
// generate bounds constraints because items[1] is just a placeholder | ||
// for a larger array allocated with `malloc (sizeof (struct Data) + 42)`. | ||
return d->items[i]; | ||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_not_flexible_array | ||
struct NotFlexible { | ||
int items[1]; // Size 1 array but NOT the last field. | ||
int count; // Something comes after it. | ||
}; | ||
|
||
int test_not_flexible_array(struct NotFlexible *s, int i) { | ||
// CHECK: call void @llvm.assume | ||
// This is NOT a flexible array pattern (not the last field), | ||
// so we're fine generating `assume(i < 1)`. | ||
return s->items[i]; | ||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_pointer_parameter | ||
int test_pointer_parameter(int *arr, int i) { | ||
// CHECK-NOT: call void @llvm.assume | ||
// Pointer parameters should NOT generate bounds constraints | ||
// because we don't know the actual array size. | ||
return arr[i]; | ||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_vla | ||
void init_vla(int *arr, int n); | ||
|
||
int test_vla(int n, int i) { | ||
int arr[n]; // Variable-length array. | ||
init_vla(arr, n); // Initialize to avoid UB. | ||
// CHECK: call void @llvm.assume | ||
// For VLAs, generate bounds constraints using the runtime size: 0 <= i < n. | ||
return arr[i]; | ||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_one_past_end | ||
extern int extern_array[100]; | ||
int *test_one_past_end(void) { | ||
// CHECK-NOT: call void @llvm.assume | ||
// Taking address of one-past-the-end is allowed by C standard. | ||
// We should NOT assume anything about this access. | ||
return &extern_array[100]; // Legal: one past the end. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried extern int extern_array[100];
int *test_extern_array_val(int i) {
return &extern_array[i];
} with this PR an it generates %bounds.constraint = icmp ult i32 %i, 100
tail call void @llvm.assume(i1 %bounds.constraint) if Did you consider C++ references?
I think a reference must always point to valid memory, so here one can apply the stricter |
||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_extern_array | ||
int test_extern_array(int i) { | ||
// CHECK: call void @llvm.assume | ||
// This will generate bounds constraints. | ||
// The array is a constant-size global array. | ||
// This is the safe case where we want optimization hints. | ||
return extern_array[i]; | ||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_local_constant_array | ||
void init_array(int *arr); | ||
int test_local_constant_array(int i) { | ||
int arr[10]; | ||
init_array(arr); // Initialize to avoid UB from uninitialized read. | ||
// CHECK: call void @llvm.assume | ||
// This will generate bounds constraints. | ||
// We know the exact size of this alloca array. | ||
// This is the safe case where we want optimization hints. | ||
return arr[i]; | ||
} | ||
|
||
// CHECK-LABEL: define {{.*}} @test_malloc_array | ||
int *my_malloc(int); | ||
int test_malloc_array(int i) { | ||
// CHECK-NOT: call void @llvm.assume | ||
// Dynamically allocated arrays accessed via pointers do not get bounds | ||
// constraints. | ||
int *x = my_malloc(100 * sizeof(int)); | ||
return x[i]; | ||
} |
Uh oh!
There was an error while loading. Please reload this page.