-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Open
Labels
Labels: enhancement (improving things as opposed to bug fixing, e.g. new or missing feature); llvm:SLPVectorizer; question (a question, not a bug report — check out https://llvm.org/docs/GettingInvolved.html instead!)
Description
In the example below, there are two SLP reduction vectorization opportunities that share the same loads. On their own, they are both profitable. However, when together, neither is able to vectorize because the load values have external uses, so a cost is added to extract those elements.
I put together a rough draft using the following strategy:
- When we find a tree that has external uses, cache the tree and its cost without the extraction cost if removing the extraction cost would make it profitable.
- In the future, if we find a tree that has external uses, check whether we have cached those same external uses elsewhere. If we can account for all the external uses in the cache, do not add a cost for the external uses of the current tree. This is because we are assuming that the cached chains can also be vectorized, so there will be no need to extract any elements.
- Run SLP again so that we can now vectorize the cached chains (they didn't vectorize the first time due to external uses).
Would be interested in discussing this further, also happy to post my current work as an MR.
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -mtriple=riscv64 -mattr=+v,+m -passes=slp-vectorizer -slp-threshold=0 -S < %s | FileCheck %s
;define i32 @add_slp(ptr %p, i32 %i_stride) {
;; CHECK-LABEL: define i32 @add_slp(
;; CHECK-SAME: ptr [[P:%.*]], i32 [[I_STRIDE:%.*]]) {
;; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 1
;; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr i32, ptr [[P]], i64 4
;; CHECK-NEXT: [[LD_4:%.*]] = load i32, ptr [[ARRAYIDX_4]], align 1
;; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
;; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[LD_4]]
;; CHECK-NEXT: ret i32 [[OP_RDX]]
;;
; %ld = load i32, ptr %p, align 1
; %arrayidx.1 = getelementptr i32, ptr %p, i64 1
; %ld.1 = load i32, ptr %arrayidx.1, align 1
; %add.1 = add i32 %ld, %ld.1
; %arrayidx.2 = getelementptr i32, ptr %p, i64 2
; %ld.2 = load i32, ptr %arrayidx.2, align 1
; %add.2 = add i32 %add.1, %ld.2
; %arrayidx.3 = getelementptr i32, ptr %p, i64 3
; %ld.3 = load i32, ptr %arrayidx.3, align 1
; %add.3 = add i32 %add.2, %ld.3
; %arrayidx.4 = getelementptr i32, ptr %p, i64 4
; %ld.4 = load i32, ptr %arrayidx.4, align 1
; %add.4 = add i32 %add.3, %ld.4
; ret i32 %add.4
;}
;
;define i32 @mul_slp(ptr %p, i32 %i_stride) {
;; CHECK-LABEL: define i32 @mul_slp(
;; CHECK-SAME: ptr [[P:%.*]], i32 [[I_STRIDE:%.*]]) {
;; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 1
;; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]]
;; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr i32, ptr [[P]], i64 4
;; CHECK-NEXT: [[LD_4:%.*]] = load i32, ptr [[ARRAYIDX_4]], align 1
;; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LD_4]], [[LD_4]]
;; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
;; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP3]], [[MUL_4]]
;; CHECK-NEXT: ret i32 [[OP_RDX]]
;;
; %ld = load i32, ptr %p, align 1
; %mul = mul i32 %ld, %ld
; %arrayidx.1 = getelementptr i32, ptr %p, i64 1
; %ld.1 = load i32, ptr %arrayidx.1, align 1
; %mul.1 = mul i32 %ld.1, %ld.1
; %add11.1 = add i32 %mul.1, %mul
; %arrayidx.2 = getelementptr i32, ptr %p, i64 2
; %ld.2 = load i32, ptr %arrayidx.2, align 1
; %mul.2 = mul i32 %ld.2, %ld.2
; %add11.2 = add i32 %mul.2, %add11.1
; %arrayidx.3 = getelementptr i32, ptr %p, i64 3
; %ld.3 = load i32, ptr %arrayidx.3, align 1
; %mul.3 = mul i32 %ld.3, %ld.3
; %add11.3 = add i32 %mul.3, %add11.2
; %arrayidx.4 = getelementptr i32, ptr %p, i64 4
; %ld.4 = load i32, ptr %arrayidx.4, align 1
; %mul.4 = mul i32 %ld.4, %ld.4
; %add11.4 = add i32 %mul.4, %add11.3
; ret i32 %add11.4
;}
; @partial_slp interleaves the two reduction chains from the commented-out
; @add_slp and @mul_slp above over the SAME five scalar loads (p[0]..p[4]):
;   %add.N   chain: plain add-reduction of the loads
;   %add11.N chain: add-reduction of the squared loads (mul %ld.N, %ld.N)
; Because each %ld.N feeds both chains, every load has an external use from
; the perspective of either reduction tree, so (per the issue description)
; SLP charges an extract cost and neither chain vectorizes. The CHECK lines
; below were autogenerated (UTC v6) and capture the current, mostly-scalar
; output; the desired outcome is two <4 x i32> reductions sharing one load.
define i32 @partial_slp(ptr %p, i32 %i_stride) {
; CHECK-LABEL: define i32 @partial_slp(
; CHECK-SAME: ptr [[P:%.*]], i32 [[I_STRIDE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[P]], align 1
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]]
; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr i32, ptr [[P]], i64 1
; CHECK-NEXT: [[LD_4:%.*]] = load i32, ptr [[ARRAYIDX_4]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[LD]], [[LD_4]]
; CHECK-NEXT: [[MUL_5:%.*]] = mul i32 [[LD_4]], [[LD_4]]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[MUL_5]], [[MUL]]
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr i32, ptr [[P]], i64 2
; CHECK-NEXT: [[ADD_2:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 1
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[TMP5]], [[ADD_2]]
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[ADD_2]], [[ADD_2]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr i32, ptr [[P]], i64 3
; CHECK-NEXT: [[LD_3:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 1
; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX1]], [[LD_3]]
; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LD_3]], [[LD_3]]
; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[OP_RDX2]]
; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr i32, ptr [[P]], i64 4
; CHECK-NEXT: [[OP_RDX4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 1
; CHECK-NEXT: [[OP_RDX5:%.*]] = add i32 [[OP_RDX3]], [[OP_RDX4]]
; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[OP_RDX4]], [[OP_RDX4]]
; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[OP_RDX5]], [[ADD11_4]]
; CHECK-NEXT: ret i32 [[ADD]]
;
  ; p[0]: feeds both the add chain (%add.1) and the mul chain (%mul).
  %ld = load i32, ptr %p, align 1
  %mul = mul i32 %ld, %ld
  ; p[1]
  %arrayidx.1 = getelementptr i32, ptr %p, i64 1
  %ld.1 = load i32, ptr %arrayidx.1, align 1
  %add.1 = add i32 %ld, %ld.1
  %mul.1 = mul i32 %ld.1, %ld.1
  %add11.1 = add i32 %mul.1, %mul
  ; p[2]
  %arrayidx.2 = getelementptr i32, ptr %p, i64 2
  %ld.2 = load i32, ptr %arrayidx.2, align 1
  %add.2 = add i32 %add.1, %ld.2
  %mul.2 = mul i32 %ld.2, %ld.2
  %add11.2 = add i32 %mul.2, %add11.1
  ; p[3]
  %arrayidx.3 = getelementptr i32, ptr %p, i64 3
  %ld.3 = load i32, ptr %arrayidx.3, align 1
  %add.3 = add i32 %add.2, %ld.3
  %mul.3 = mul i32 %ld.3, %ld.3
  %add11.3 = add i32 %mul.3, %add11.2
  ; p[4]: fifth element — both reference functions above reduce the first
  ; four lanes as a <4 x i32> and fold this one in scalar (OP_RDX).
  %arrayidx.4 = getelementptr i32, ptr %p, i64 4
  %ld.4 = load i32, ptr %arrayidx.4, align 1
  %add.4 = add i32 %add.3, %ld.4
  %mul.4 = mul i32 %ld.4, %ld.4
  %add11.4 = add i32 %mul.4, %add11.3
  ; Combine the two reduction results into the single return value.
  %add = add i32 %add.4, %add11.4
  ret i32 %add
}
Metadata
Metadata
Assignees
Labels
Labels: enhancement (improving things as opposed to bug fixing, e.g. new or missing feature); llvm:SLPVectorizer; question (a question, not a bug report — check out https://llvm.org/docs/GettingInvolved.html instead!)