diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll index c869d0e3032b9..9a717e7dbef73 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll @@ -1,55 +1,71 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mcpu=slm | FileCheck %s --check-prefixes=SLM define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:4 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:8 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:16 SizeLat:16 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:5 SizeLat:9 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:18 Lat:9 SizeLat:21 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:4 SizeLat:5 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:9 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-LABEL: 'reduce_i64' +; AVX512F-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:8 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:9 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i64' +; AVX512BW-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i64' +; AVX512DQ-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:8 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:9 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SLM-LABEL: 'reduce_i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SLM-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; SLM-NEXT: Cost Model: Found costs of 5 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:7 SizeLat:7 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; SLM-NEXT: Cost Model: Found costs of RThru:17 CodeSize:8 Lat:11 SizeLat:11 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; SLM-NEXT: Cost Model: Found costs of RThru:33 CodeSize:12 Lat:19 SizeLat:19 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) @@ -61,44 +77,60 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:10 Lat:7 SizeLat:11 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:11 SizeLat:23 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i32' +; AVX512F-NEXT: Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i32' +; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-LABEL: 'reduce_i32' +; AVX512DQ-NEXT: Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SLM-LABEL: 'reduce_i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SLM-NEXT: Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; SLM-NEXT: Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; SLM-NEXT: Cost Model: Found costs of 4 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; SLM-NEXT: Cost Model: Found costs of 6 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; SLM-NEXT: Cost Model: Found costs of 10 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) @@ -110,67 +142,67 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:10 Lat:7 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:11 SizeLat:23 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SLM-LABEL: 'reduce_i16' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SLM-NEXT: Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; SLM-NEXT: Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; SLM-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; SLM-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; SLM-NEXT: Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; SLM-NEXT: Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) @@ -183,74 +215,74 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:9 Lat:6 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:10 SizeLat:22 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:10 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SLM-LABEL: 'reduce_i8' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SLM-NEXT: Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; SLM-NEXT: Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; SLM-NEXT: Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; SLM-NEXT: Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; SLM-NEXT: Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; SLM-NEXT: Cost Model: Found costs of 6 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; SLM-NEXT: Cost Model: Found costs of 10 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll index 5da1109cabe58..21338a12381fc 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll @@ -1,37 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) @@ -43,28 +43,28 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) @@ -76,31 +76,40 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:6 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:8 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:10 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:11 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:13 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) @@ -113,34 +122,44 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:11 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:13 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:14 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:16 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 11 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) @@ -154,70 +173,70 @@ define i32 @reduce_i8(i32 %arg) { define i32 @reduce_i1(i32 %arg) { ; SSE-LABEL: 'reduce_i1' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:10 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:3 SizeLat:4 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:8 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 11 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 13 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 14 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll b/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll index 225d79b82ab77..122903d1e7a1c 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll @@ -1,70 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 define void @reduce_f64(double %arg) { ; SSE2-LABEL: 'reduce_f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:40 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:40 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:38 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:76 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:78 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) @@ -76,67 +76,67 @@ define void @reduce_f64(double %arg) { define void @reduce_f32(float %arg) { ; SSE2-LABEL: 'reduce_f32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:44 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:88 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:22 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:44 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:88 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f32' -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:22 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:44 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:88 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:47 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:94 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:60 CodeSize:60 Lat:188 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:78 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:60 CodeSize:60 Lat:156 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:31 CodeSize:31 Lat:79 SizeLat:31 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:62 CodeSize:62 Lat:158 SizeLat:62 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) @@ -151,60 +151,60 @@ define void @reduce_f32(float %arg) { define void @reduce_f64_fast(double %arg) { ; SSE2-LABEL: 'reduce_f64_fast' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f64_fast' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:16 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f64_fast' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:16 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f64_fast' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f64_fast' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:8 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:18 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f64_fast' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:15 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f64_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:21 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) @@ -216,67 +216,67 @@ define void @reduce_f64_fast(double %arg) { define void @reduce_f32_fast(float %arg) { ; SSE2-LABEL: 'reduce_f32_fast' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f32_fast' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f32_fast' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:18 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f32_fast' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f32_fast' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:9 SizeLat:6 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:19 SizeLat:10 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f32_fast' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:6 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:16 SizeLat:10 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f32_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of 4 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:28 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll index bd8ba23e93297..922a18fe47a6f 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll @@ -1,46 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 define i32 @reduce_f64(i32 %arg) { -; SSE-LABEL: 'reduce_f64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'reduce_f64' +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE41-LABEL: 'reduce_f64' +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE42-LABEL: 'reduce_f64' +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:11 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:17 CodeSize:17 Lat:17 SizeLat:21 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:33 Lat:33 SizeLat:41 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:11 Lat:21 SizeLat:22 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:23 CodeSize:17 Lat:35 SizeLat:42 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:23 SizeLat:18 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:37 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:6 SizeLat:8 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:12 Lat:9 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:15 Lat:12 SizeLat:15 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) @@ -51,41 +67,59 @@ define i32 @reduce_f64(i32 %arg) { } define i32 @reduce_f32(i32 %arg) { -; SSE-LABEL: 'reduce_f32' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'reduce_f32' +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE41-LABEL: 'reduce_f32' +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE42-LABEL: 'reduce_f32' +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:22 Lat:22 SizeLat:27 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:38 CodeSize:38 Lat:38 SizeLat:47 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_f32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:21 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:17 CodeSize:15 Lat:28 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:21 Lat:42 SizeLat:48 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_f32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:12 Lat:24 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:31 SizeLat:24 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:21 Lat:45 SizeLat:36 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:8 SizeLat:8 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:12 Lat:10 SizeLat:12 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:16 Lat:13 SizeLat:16 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:13 CodeSize:19 Lat:17 SizeLat:19 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) @@ -100,28 +134,28 @@ define i32 @reduce_f32(i32 %arg) { define i32 @reduce_f64_fast(i32 %arg) { ; SSE-LABEL: 'reduce_f64_fast' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 9 for: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_f64_fast' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 7 for: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:8 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) @@ -133,31 +167,31 @@ define i32 @reduce_f64_fast(i32 %arg) { define i32 @reduce_f32_fast(i32 %arg) { ; SSE-LABEL: 'reduce_f32_fast' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 11 for: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_f32_fast' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 6 for: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 9 for: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:10 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:12 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll index c857104746094..23e11cd812f4d 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll @@ -1,46 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 define i32 @reduce_f64(i32 %arg) { -; SSE-LABEL: 'reduce_f64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'reduce_f64' +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE41-LABEL: 'reduce_f64' +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE42-LABEL: 'reduce_f64' +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:11 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:17 CodeSize:17 Lat:17 SizeLat:21 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:33 Lat:33 SizeLat:41 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:11 Lat:21 SizeLat:22 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:23 CodeSize:17 Lat:35 SizeLat:42 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:23 SizeLat:18 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:37 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:6 SizeLat:8 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:12 Lat:9 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:15 Lat:12 SizeLat:15 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) @@ -51,41 +67,59 @@ define i32 @reduce_f64(i32 %arg) { } define i32 @reduce_f32(i32 %arg) { -; SSE-LABEL: 'reduce_f32' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'reduce_f32' +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE41-LABEL: 'reduce_f32' +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; SSE42-LABEL: 'reduce_f32' +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:22 Lat:22 SizeLat:27 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:38 CodeSize:38 Lat:38 SizeLat:47 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_f32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:21 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:17 CodeSize:15 Lat:28 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:21 Lat:42 SizeLat:48 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_f32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:12 Lat:24 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:31 SizeLat:24 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:21 Lat:45 SizeLat:36 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:8 SizeLat:8 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:12 Lat:10 SizeLat:12 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:16 Lat:13 SizeLat:16 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:13 CodeSize:19 Lat:17 SizeLat:19 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) @@ -100,28 +134,28 @@ define i32 @reduce_f32(i32 %arg) { define i32 @reduce_f64_fast(i32 %arg) { ; SSE-LABEL: 'reduce_f64_fast' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; SSE-NEXT: Cost Model: Found costs of 9 for: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_f64_fast' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 7 for: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:8 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) @@ -133,31 +167,31 @@ define i32 @reduce_f64_fast(i32 %arg) { define i32 @reduce_f32_fast(i32 %arg) { ; SSE-LABEL: 'reduce_f32_fast' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; SSE-NEXT: Cost Model: Found costs of 11 for: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_f32_fast' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 6 for: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 9 for: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:10 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:12 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll index 14da7f5e539a2..06ff41e8f7101 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll @@ -1,70 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512 define void @reduce_f64(double %arg) { ; SSE2-LABEL: 'reduce_f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:40 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:40 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:78 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) @@ -76,67 +76,67 @@ define void @reduce_f64(double %arg) { define void @reduce_f32(float %arg) { ; SSE2-LABEL: 'reduce_f32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:88 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:88 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f32' -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:88 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:47 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:94 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:60 CodeSize:60 Lat:188 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:47 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:94 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:60 CodeSize:60 Lat:188 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:31 CodeSize:31 Lat:79 SizeLat:31 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:62 CodeSize:62 Lat:158 SizeLat:62 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) @@ -151,60 +151,60 @@ define void @reduce_f32(float %arg) { define void @reduce_f64_fast(double %arg) { ; SSE2-LABEL: 'reduce_f64_fast' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f64_fast' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:9 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:17 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f64_fast' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:9 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:17 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f64_fast' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f64_fast' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:7 Lat:27 SizeLat:10 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f64_fast' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:27 SizeLat:10 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f64_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:21 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) @@ -216,67 +216,67 @@ define void @reduce_f64_fast(double %arg) { define void @reduce_f32_fast(float %arg) { ; SSE2-LABEL: 'reduce_f32_fast' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSSE3-LABEL: 'reduce_f32_fast' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:12 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:20 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE41-LABEL: 'reduce_f32_fast' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE41-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:12 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:20 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE41-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'reduce_f32_fast' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'reduce_f32_fast' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:23 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:9 Lat:33 SizeLat:12 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'reduce_f32_fast' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:33 SizeLat:12 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'reduce_f32_fast' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:19 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:28 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll index 93d32466136d7..cebc3e577297c 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll @@ -1,77 +1,77 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:22 Lat:22 SizeLat:22 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:42 Lat:42 SizeLat:42 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:58 CodeSize:82 Lat:82 SizeLat:82 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:9 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:16 CodeSize:22 Lat:22 SizeLat:22 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:30 CodeSize:42 Lat:42 SizeLat:42 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:58 CodeSize:82 Lat:82 SizeLat:82 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:22 Lat:22 SizeLat:22 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:42 Lat:42 SizeLat:42 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:50 CodeSize:82 Lat:82 SizeLat:82 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:23 Lat:23 SizeLat:23 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:42 Lat:38 SizeLat:43 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:80 Lat:68 SizeLat:83 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:12 SizeLat:10 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:23 SizeLat:19 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:27 Lat:33 SizeLat:32 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:43 Lat:53 SizeLat:58 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i64' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:10 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:25 SizeLat:19 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:36 SizeLat:33 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:45 SizeLat:41 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i64' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:10 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:25 SizeLat:19 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:36 SizeLat:33 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:45 SizeLat:41 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i64' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:19 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:35 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:51 SizeLat:13 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:8 Lat:66 SizeLat:16 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) @@ -83,52 +83,52 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:9 Lat:10 SizeLat:9 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:19 SizeLat:17 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:24 Lat:27 SizeLat:24 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:38 Lat:43 SizeLat:38 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:57 CodeSize:66 Lat:75 SizeLat:66 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of RThru:8 CodeSize:9 Lat:10 SizeLat:9 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:19 SizeLat:17 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:21 CodeSize:24 Lat:27 SizeLat:24 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:33 CodeSize:38 Lat:43 SizeLat:38 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:57 CodeSize:66 Lat:75 SizeLat:66 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:13 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:25 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:36 SizeLat:6 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:8 Lat:58 SizeLat:8 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:12 Lat:102 SizeLat:12 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:7 SizeLat:5 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:13 SizeLat:9 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:19 SizeLat:13 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:12 Lat:27 SizeLat:23 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:22 Lat:43 SizeLat:43 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:12 SizeLat:4 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:23 SizeLat:7 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:34 SizeLat:10 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:8 Lat:44 SizeLat:12 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:10 Lat:64 SizeLat:16 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:14 SizeLat:4 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:27 SizeLat:7 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:38 SizeLat:10 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:49 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:59 SizeLat:15 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) @@ -140,58 +140,58 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:13 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:34 SizeLat:10 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:54 SizeLat:14 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:8 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:14 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:20 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:26 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:34 SizeLat:15 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:21 CodeSize:24 Lat:50 SizeLat:27 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:13 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:25 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:10 Lat:30 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:12 Lat:40 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:29 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:12 CodeSize:11 Lat:35 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:12 Lat:36 SizeLat:13 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:29 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:12 CodeSize:11 Lat:35 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:13 CodeSize:12 Lat:40 SizeLat:13 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:29 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:12 CodeSize:11 Lat:35 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:12 Lat:36 SizeLat:13 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) @@ -204,64 +204,64 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:26 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:38 SizeLat:14 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:22 Lat:62 SizeLat:22 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:38 CodeSize:38 Lat:110 SizeLat:38 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:9 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:15 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:21 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:27 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:15 Lat:35 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:35 CodeSize:25 Lat:51 SizeLat:28 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:65 CodeSize:45 Lat:83 SizeLat:52 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:10 Lat:26 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:11 Lat:31 SizeLat:12 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:13 Lat:41 SizeLat:16 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:17 Lat:61 SizeLat:24 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:30 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:12 Lat:36 SizeLat:13 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:21 CodeSize:13 Lat:37 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:32 CodeSize:15 Lat:39 SizeLat:16 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:30 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:13 CodeSize:12 Lat:36 SizeLat:13 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:41 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:21 CodeSize:15 Lat:51 SizeLat:16 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:30 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:15 CodeSize:12 Lat:36 SizeLat:13 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:21 CodeSize:13 Lat:37 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:32 CodeSize:15 Lat:39 SizeLat:16 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll index 799a49fce26ba..4b82bb9c685e5 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll @@ -1,37 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) @@ -43,28 +43,28 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) @@ -76,31 +76,40 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:6 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:8 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:10 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:11 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:13 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) @@ -113,34 +122,44 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:11 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:13 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:14 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:16 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 11 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) @@ -154,70 +173,70 @@ define i32 @reduce_i8(i32 %arg) { define i32 @reduce_i1(i32 %arg) { ; SSE-LABEL: 'reduce_i1' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:10 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:3 SizeLat:4 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:8 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 11 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 13 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 14 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll index e036a22ba2647..f11229e313d3a 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll @@ -1,62 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i64' -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:16 SizeLat:8 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:30 SizeLat:14 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:26 CodeSize:18 Lat:58 SizeLat:26 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:6 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:7 Lat:17 SizeLat:11 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:13 Lat:26 SizeLat:23 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:25 Lat:44 SizeLat:47 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:24 SizeLat:12 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:38 SizeLat:18 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) @@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) @@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) @@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) @@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SSE41: {{.*}} -; SSE42: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll index c40f2fd2f2967..5ae31798de6c5 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll @@ -1,62 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i64' -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:16 SizeLat:8 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:30 SizeLat:14 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:26 CodeSize:18 Lat:58 SizeLat:26 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:7 Lat:17 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:13 Lat:26 SizeLat:21 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:25 Lat:44 SizeLat:45 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:24 SizeLat:12 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:38 SizeLat:18 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) @@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) @@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) @@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) @@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SSE41: {{.*}} -; SSE42: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll index 5b0e0cd103843..3025192ddf35b 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll @@ -1,62 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i64' -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:13 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:6 CodeSize:14 Lat:24 SizeLat:16 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:10 CodeSize:26 Lat:46 SizeLat:30 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:18 CodeSize:50 Lat:90 SizeLat:58 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:10 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:13 Lat:19 SizeLat:17 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:29 SizeLat:34 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:49 SizeLat:68 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:10 SizeLat:8 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:19 SizeLat:15 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:18 Lat:27 SizeLat:23 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:28 Lat:43 SizeLat:39 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) @@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) @@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) @@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) @@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SSE41: {{.*}} -; SSE42: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll index acd38421ba937..51d06a925f4c9 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll @@ -1,62 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i64' -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:13 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:6 CodeSize:14 Lat:24 SizeLat:16 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:10 CodeSize:26 Lat:46 SizeLat:30 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:18 CodeSize:50 Lat:90 SizeLat:58 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:10 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:13 Lat:19 SizeLat:17 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:29 SizeLat:34 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:49 SizeLat:68 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:10 SizeLat:8 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:13 Lat:19 SizeLat:15 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:18 Lat:27 SizeLat:23 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:28 Lat:43 SizeLat:39 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) @@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) @@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) @@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) @@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SSE41: {{.*}} -; SSE42: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll index 757db4a5a41be..c9cb6ef547e86 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll @@ -1,37 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: opt < %s -passes="print" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 4 for: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) @@ -43,28 +43,28 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) @@ -76,31 +76,40 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:6 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:8 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:10 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:11 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:13 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) @@ -113,34 +122,44 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:11 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:13 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:14 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:16 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; -; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of 11 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-NEXT: Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) @@ -154,92 +173,92 @@ define i32 @reduce_i8(i32 %arg) { define i32 @reduce_i1(i32 %arg) { ; SSE2-LABEL: 'reduce_i1' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:19 Lat:19 SizeLat:19 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:46 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:44 CodeSize:52 Lat:52 SizeLat:52 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i1' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; SSE42-LABEL: 'reduce_i1' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE42-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:46 CodeSize:46 Lat:46 SizeLat:51 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:47 CodeSize:47 Lat:47 SizeLat:53 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:49 CodeSize:49 Lat:49 SizeLat:57 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX2-NEXT: Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:26 Lat:26 SizeLat:31 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:27 Lat:27 SizeLat:33 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:29 Lat:29 SizeLat:37 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 19 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 52 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 133 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of 134 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:136 CodeSize:136 Lat:135 SizeLat:135 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:140 CodeSize:140 Lat:136 SizeLat:136 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 19 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 52 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 133 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 326 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 775 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of 776 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 19 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 52 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 133 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of 134 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:136 CodeSize:136 Lat:135 SizeLat:135 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:140 CodeSize:140 Lat:136 SizeLat:136 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll index 4ad0887a27884..0e0ad7e14f4eb 100644 --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -1,54 +1,62 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -passes="print" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -passes="print" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -passes="print" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -passes="print" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -passes="print" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mcpu=slm | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mcpu=slm | FileCheck %s --check-prefixes=SLM ; These are old tests for matching reduction costs from extract elements - something that has now been removed. define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SSE2-LABEL: 'reduction_cost_float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSSE3-LABEL: 'reduction_cost_float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSE42-LABEL: 'reduction_cost_float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r -; -; AVX-LABEL: 'reduction_cost_float' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r +; +; AVX1-LABEL: 'reduction_cost_float' +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r +; +; AVX2-LABEL: 'reduction_cost_float' +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SLM-LABEL: 'reduction_cost_float' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf @@ -61,44 +69,44 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; SSE-LABEL: 'reduction_cost_int' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SSE-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 +; SSE-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; AVX1-LABEL: 'reduction_cost_int' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; AVX1-NEXT: Cost Model: Found costs of 4 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; AVX2-LABEL: 'reduction_cost_int' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; SLM-LABEL: 'reduction_cost_int' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> %rdx) { define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSE2-LABEL: 'pairwise_hadd' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SSSE3-LABEL: 'pairwise_hadd' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SSE42-LABEL: 'pairwise_hadd' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 -; -; AVX-LABEL: 'pairwise_hadd' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 +; +; AVX1-LABEL: 'pairwise_hadd' +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %r2 = fadd float %r, %f1 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 +; +; AVX2-LABEL: 'pairwise_hadd' +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %r2 = fadd float %r, %f1 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SLM-LABEL: 'pairwise_hadd' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -191,59 +210,70 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSE2-LABEL: 'pairwise_hadd_assoc' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SSSE3-LABEL: 'pairwise_hadd_assoc' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SSE42-LABEL: 'pairwise_hadd_assoc' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 -; -; AVX-LABEL: 'pairwise_hadd_assoc' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 +; +; AVX1-LABEL: 'pairwise_hadd_assoc' +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %r2 = fadd float %r, %f1 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 +; +; AVX2-LABEL: 'pairwise_hadd_assoc' +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %r2 = fadd float %r, %f1 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SLM-LABEL: 'pairwise_hadd_assoc' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -263,54 +293,64 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSE2-LABEL: 'pairwise_hadd_skip_first' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SSSE3-LABEL: 'pairwise_hadd_skip_first' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SSE42-LABEL: 'pairwise_hadd_skip_first' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 -; -; AVX-LABEL: 'pairwise_hadd_skip_first' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 +; +; AVX1-LABEL: 'pairwise_hadd_skip_first' +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %r2 = fadd float %r, %f1 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 +; +; AVX2-LABEL: 'pairwise_hadd_skip_first' +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %r2 = fadd float %r, %f1 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; ; SLM-LABEL: 'pairwise_hadd_skip_first' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2 ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -328,34 +368,40 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) { ; SSE2-LABEL: 'no_pairwise_reduction2double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction2double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction2double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r -; -; AVX-LABEL: 'no_pairwise_reduction2double' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r +; +; AVX1-LABEL: 'no_pairwise_reduction2double' +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r +; +; AVX2-LABEL: 'no_pairwise_reduction2double' +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SLM-LABEL: 'no_pairwise_reduction2double' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf @@ -366,44 +412,52 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1 define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE2-LABEL: 'no_pairwise_reduction4float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction4float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r -; -; AVX-LABEL: 'no_pairwise_reduction4float' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r +; +; AVX1-LABEL: 'no_pairwise_reduction4float' +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r +; +; AVX2-LABEL: 'no_pairwise_reduction4float' +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SLM-LABEL: 'no_pairwise_reduction4float' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf @@ -416,52 +470,52 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-LABEL: 'no_pairwise_reduction4double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction4double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; AVX1-LABEL: 'no_pairwise_reduction4double' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; AVX1-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; AVX2-LABEL: 'no_pairwise_reduction4double' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SLM-LABEL: 'no_pairwise_reduction4double' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf @@ -474,64 +528,64 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-LABEL: 'no_pairwise_reduction8float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction8float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; AVX1-LABEL: 'no_pairwise_reduction8float' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; AVX1-NEXT: Cost Model: Found costs of 4 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; AVX2-LABEL: 'no_pairwise_reduction8float' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SLM-LABEL: 'no_pairwise_reduction8float' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 @@ -545,17 +599,23 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { } define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { -; CHECK-LABEL: 'no_pairwise_reduction2i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SSE-LABEL: 'no_pairwise_reduction2i64' +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf +; SSE-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +; AVX-LABEL: 'no_pairwise_reduction2i64' +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf +; AVX-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; SLM-LABEL: 'no_pairwise_reduction2i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> %bin.rdx = add <2 x i64> %rdx, %rdx.shuf @@ -566,20 +626,20 @@ define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; CHECK-LABEL: 'no_pairwise_reduction4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; CHECK-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf +; CHECK-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; SLM-LABEL: 'no_pairwise_reduction4i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> %bin.rdx = add <4 x i32> %rdx, %rdx.shuf @@ -592,36 +652,36 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-LABEL: 'no_pairwise_reduction4i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SSE-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 +; SSE-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; AVX1-LABEL: 'no_pairwise_reduction4i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; AVX1-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; AVX2-LABEL: 'no_pairwise_reduction4i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; SLM-LABEL: 'no_pairwise_reduction4i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> %bin.rdx = add <4 x i64> %rdx, %rdx.shuf @@ -634,54 +694,54 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-LABEL: 'no_pairwise_reduction8i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; SSE42-LABEL: 'no_pairwise_reduction8i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; AVX-LABEL: 'no_pairwise_reduction8i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 +; AVX-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; SLM-LABEL: 'no_pairwise_reduction8i16' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 @@ -696,44 +756,44 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-LABEL: 'no_pairwise_reduction8i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SSE-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 +; SSE-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; AVX1-LABEL: 'no_pairwise_reduction8i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; AVX1-NEXT: Cost Model: Found costs of 4 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; AVX2-LABEL: 'no_pairwise_reduction8i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; SLM-LABEL: 'no_pairwise_reduction8i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 @@ -748,39 +808,46 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) { ; SSE2-LABEL: 'pairwise_reduction2double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction2double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction2double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r -; -; AVX-LABEL: 'pairwise_reduction2double' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r +; +; AVX1-LABEL: 'pairwise_reduction2double' +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r +; +; AVX2-LABEL: 'pairwise_reduction2double' +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SLM-LABEL: 'pairwise_reduction2double' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> @@ -792,54 +859,64 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) { define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE2-LABEL: 'pairwise_reduction4float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction4float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction4float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r -; -; AVX-LABEL: 'pairwise_reduction4float' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r +; +; AVX1-LABEL: 'pairwise_reduction4float' +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r +; +; AVX2-LABEL: 'pairwise_reduction4float' +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SLM-LABEL: 'pairwise_reduction4float' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -854,64 +931,64 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-LABEL: 'pairwise_reduction4double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction4double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction4double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; AVX1-LABEL: 'pairwise_reduction4double' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; AVX1-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; AVX2-LABEL: 'pairwise_reduction4double' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; ; SLM-LABEL: 'pairwise_reduction4double' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r ; %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -926,82 +1003,82 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-LABEL: 'pairwise_reduction8float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction8float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction8float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; AVX1-LABEL: 'pairwise_reduction8float' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; AVX1-NEXT: Cost Model: Found costs of 4 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of 4 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; AVX2-LABEL: 'pairwise_reduction8float' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; ; SLM-LABEL: 'pairwise_reduction8float' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r ; %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -1018,19 +1095,26 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { } define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { -; CHECK-LABEL: 'pairwise_reduction2i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SSE-LABEL: 'pairwise_reduction2i64' +; SSE-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +; AVX-LABEL: 'pairwise_reduction2i64' +; AVX-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction2i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> @@ -1042,24 +1126,24 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; CHECK-LABEL: 'pairwise_reduction4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; CHECK-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction4i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> @@ -1074,44 +1158,44 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-LABEL: 'pairwise_reduction4i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; AVX1-LABEL: 'pairwise_reduction4i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; AVX1-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; AVX2-LABEL: 'pairwise_reduction4i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction4i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -1126,69 +1210,69 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-LABEL: 'pairwise_reduction8i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SSE2-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; SSSE3-LABEL: 'pairwise_reduction8i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSSE3-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SSSE3-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; SSE42-LABEL: 'pairwise_reduction8i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE42-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SSE42-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; AVX-LABEL: 'pairwise_reduction8i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; AVX-NEXT: Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 +; AVX-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; ; SLM-LABEL: 'pairwise_reduction8i16' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> @@ -1206,56 +1290,56 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-LABEL: 'pairwise_reduction8i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SSE-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SSE-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; AVX1-LABEL: 'pairwise_reduction8i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; AVX1-NEXT: Cost Model: Found costs of 4 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of 4 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX1-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 +; AVX1-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; AVX2-LABEL: 'pairwise_reduction8i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; AVX2-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 +; AVX2-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction8i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SLM-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SLM-NEXT: Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32>