Skip to content

#108985 optimization doesn't remove strlen like loop body in some cases #134736

@delcypher

Description

@delcypher

In #108985 an optimization was added that tries to replace strlen like coding idioms with a call to strlen.

We ran into problems with this when this is optimization is used in -fbounds-safety.

For a test case like:

**NOTE: requires building the swift fork of llvm-project on the next branch.

// compile with -fbounds-safety
#include <ptrcheck.h>
int *__indexable convert_null_terminated_to_bounded_ptr(int *__null_terminated p) {
    return __unsafe_terminated_by_to_indexable(p);
}

the unoptimized IR looks like:

; Function Attrs: noinline nounwind optnone ssp uwtable(sync)
define [2 x i64] @convert_null_terminated_to_bounded_ptr(ptr noundef %p) #0 {
entry:
  %retval = alloca %"__bounds_safety::wide_ptr.indexable", align 8
  %p.addr = alloca ptr, align 8
  %terminated_by.len = alloca i64, align 8
  store ptr %p, ptr %p.addr, align 8
  %0 = load ptr, ptr %p.addr, align 8
  store i64 0, ptr %terminated_by.len, align 8
  br label %terminated_by.loop_cond

terminated_by.loop_cond:                          ; preds = %terminated_by.loop_cont, %entry
  %terminated_by.len1 = load i64, ptr %terminated_by.len, align 8
  %1 = getelementptr inbounds i32, ptr %0, i64 %terminated_by.len1
  %terminated_by.elem = load i32, ptr %1, align 4
  %terminted_by.check_terminator = icmp eq i32 %terminated_by.elem, 0
  br i1 %terminted_by.check_terminator, label %terminated_by.loop_end, label %terminated_by.loop_cont

terminated_by.loop_cont:                          ; preds = %terminated_by.loop_cond
  %terminated_by.new_len = add i64 %terminated_by.len1, 1
  store i64 %terminated_by.new_len, ptr %terminated_by.len, align 8
  br label %terminated_by.loop_cond

terminated_by.loop_end:                           ; preds = %terminated_by.loop_cond
  %2 = load i64, ptr %terminated_by.len, align 8
  %3 = add i64 %2, 1
  %terminated_by.upper = getelementptr inbounds i32, ptr %0, i64 %3
  %4 = getelementptr inbounds nuw %"__bounds_safety::wide_ptr.indexable", ptr %retval, i32 0, i32 0
  store ptr %0, ptr %4, align 8
  %5 = getelementptr inbounds nuw %"__bounds_safety::wide_ptr.indexable", ptr %retval, i32 0, i32 1
  store ptr %terminated_by.upper, ptr %5, align 8
  %6 = load [2 x i64], ptr %retval, align 8
  ret [2 x i64] %6
}

If we optimize this with -mllvm -disable-loop-idiom-wcslen the IR looks like

define [2 x i64] @convert_null_terminated_to_bounded_ptr(ptr noundef %p) local_unnamed_addr #0 {
entry:
  br label %terminated_by.loop_cond

terminated_by.loop_cond:                          ; preds = %terminated_by.loop_cond, %entry
  %terminated_by.len.0 = phi i64 [ 0, %entry ], [ %terminated_by.new_len, %terminated_by.loop_cond ]
  %0 = getelementptr inbounds i32, ptr %p, i64 %terminated_by.len.0
  %terminated_by.elem = load i32, ptr %0, align 4
  %terminted_by.check_terminator = icmp eq i32 %terminated_by.elem, 0
  %terminated_by.new_len = add i64 %terminated_by.len.0, 1
  br i1 %terminted_by.check_terminator, label %terminated_by.loop_end, label %terminated_by.loop_cond

terminated_by.loop_end:                           ; preds = %terminated_by.loop_cond
  %1 = getelementptr inbounds i32, ptr %p, i64 %terminated_by.len.0
  %terminated_by.upper = getelementptr i8, ptr %1, i64 4
  %2 = ptrtoint ptr %p to i64
  %.fca.0.insert = insertvalue [2 x i64] poison, i64 %2, 0
  %3 = ptrtoint ptr %terminated_by.upper to i64
  %.fca.1.insert = insertvalue [2 x i64] %.fca.0.insert, i64 %3, 1
  ret [2 x i64] %.fca.1.insert
}

Now if we optimize with the pass enabled the result looks like

; Function Attrs: nofree nounwind ssp memory(argmem: read) uwtable(sync)
define [2 x i64] @convert_null_terminated_to_bounded_ptr(ptr noundef %p) local_unnamed_addr #0 {
entry:
  %wcslen = tail call i64 @wcslen(ptr %p)
  br label %terminated_by.loop_cond

terminated_by.loop_cond:                          ; preds = %terminated_by.loop_cond, %entry
  %terminated_by.len.0 = phi i64 [ 0, %entry ], [ %terminated_by.new_len, %terminated_by.loop_cond ]
  %0 = getelementptr inbounds i32, ptr %p, i64 %terminated_by.len.0
  %terminated_by.elem = load i32, ptr %0, align 4
  %terminted_by.check_terminator = icmp eq i32 %terminated_by.elem, 0
  %terminated_by.new_len = add i64 %terminated_by.len.0, 1
  br i1 %terminted_by.check_terminator, label %terminated_by.loop_end, label %terminated_by.loop_cond

terminated_by.loop_end:                           ; preds = %terminated_by.loop_cond
  %1 = getelementptr inbounds i32, ptr %p, i64 %wcslen
  %terminated_by.upper = getelementptr i8, ptr %1, i64 4
  %2 = ptrtoint ptr %p to i64
  %.fca.0.insert = insertvalue [2 x i64] poison, i64 %2, 0
  %3 = ptrtoint ptr %terminated_by.upper to i64
  %.fca.1.insert = insertvalue [2 x i64] %.fca.0.insert, i64 %3, 1
  ret [2 x i64] %.fca.1.insert
}

This IR doesn't seem great because the call to wcslen is pointless because the loop body is still present. I would expect the perf of this code to be worse.

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions