diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll
index c869d0e3032b9..9a717e7dbef73 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll
@@ -1,55 +1,71 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
 
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mcpu=slm | FileCheck %s --check-prefixes=SLM
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:4 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:8 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:16 SizeLat:16 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:8 Lat:5 SizeLat:9 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:18 Lat:9 SizeLat:21 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:4 SizeLat:5 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:9 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-LABEL: 'reduce_i64'
+; AVX512F-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:8 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:9 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i64'
+; AVX512BW-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i64'
+; AVX512DQ-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 1 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:8 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:9 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; SLM-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:6 Lat:7 SizeLat:7 for: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:8 Lat:11 SizeLat:11 for: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:12 Lat:19 SizeLat:19 for: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
@@ -61,44 +77,60 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:10 Lat:7 SizeLat:11 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:20 Lat:11 SizeLat:23 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i32'
+; AVX512F-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i32'
+; AVX512BW-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-LABEL: 'reduce_i32'
+; AVX512DQ-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i32'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SLM-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; SLM-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
@@ -110,67 +142,67 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:10 Lat:7 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:20 Lat:11 SizeLat:23 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i16'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SLM-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; SLM-NEXT:  Cost Model: Found costs of 3 for: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; SLM-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
@@ -183,74 +215,74 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:9 Lat:6 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:19 Lat:10 SizeLat:22 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:10 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i8'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SLM-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; SLM-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; SLM-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; SLM-NEXT:  Cost Model: Found costs of 3 for: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found costs of 6 for: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found costs of 10 for: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll
index 5da1109cabe58..21338a12381fc 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll
@@ -1,37 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
@@ -43,28 +43,28 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
@@ -76,31 +76,40 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:6 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:8 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:10 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:11 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:13 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
@@ -113,34 +122,44 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:11 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:13 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:14 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:16 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 11 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
@@ -154,70 +173,70 @@ define i32 @reduce_i8(i32 %arg) {
 
 define i32 @reduce_i1(i32 %arg) {
 ; SSE-LABEL: 'reduce_i1'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:10 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:3 SizeLat:4 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:8 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 11 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 13 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 14 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1   = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
   %V2   = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll b/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll
index 225d79b82ab77..122903d1e7a1c 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll
@@ -1,70 +1,70 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
 
 define void @reduce_f64(double %arg) {
 ; SSE2-LABEL: 'reduce_f64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f64'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:28 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:24 Lat:56 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:38 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:28 Lat:76 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:30 Lat:78 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
   %V2  = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
@@ -76,67 +76,67 @@ define void @reduce_f64(double %arg) {
 
 define void @reduce_f32(float %arg) {
 ; SSE2-LABEL: 'reduce_f32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f32'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:30 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:28 Lat:60 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:56 Lat:120 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:47 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:30 Lat:94 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:60 Lat:188 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:30 Lat:78 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:60 Lat:156 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:31 Lat:79 SizeLat:31 for: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:62 CodeSize:62 Lat:158 SizeLat:62 for: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
   %V2  = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
@@ -151,60 +151,60 @@ define void @reduce_f32(float %arg) {
 
 define void @reduce_f64_fast(double %arg) {
 ; SSE2-LABEL: 'reduce_f64_fast'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f64_fast'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f64_fast'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f64_fast'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:23 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f64_fast'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:8 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:6 Lat:18 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f64_fast'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:15 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f64_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:21 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef)
   %V2  = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef)
@@ -216,67 +216,67 @@ define void @reduce_f64_fast(double %arg) {
 
 define void @reduce_f32_fast(float %arg) {
 ; SSE2-LABEL: 'reduce_f32_fast'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f32_fast'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f32_fast'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f32_fast'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:25 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f32_fast'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:9 SizeLat:6 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:19 SizeLat:10 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f32_fast'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 4 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:6 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:16 SizeLat:10 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f32_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 4 for: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:28 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef)
   %V2  = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll
index bd8ba23e93297..922a18fe47a6f 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll
@@ -1,46 +1,62 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
 
 define i32 @reduce_f64(i32 %arg) {
-; SSE-LABEL: 'reduce_f64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_f64'
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE41-LABEL: 'reduce_f64'
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_f64'
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:11 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:17 Lat:17 SizeLat:21 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:33 Lat:33 SizeLat:41 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_f64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:11 Lat:21 SizeLat:22 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:17 Lat:35 SizeLat:42 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:11 Lat:23 SizeLat:18 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:17 Lat:37 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:8 Lat:6 SizeLat:8 for: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:12 Lat:9 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:15 Lat:12 SizeLat:15 for: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
   %V2  = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
@@ -51,41 +67,59 @@ define i32 @reduce_f64(i32 %arg) {
 }
 
 define i32 @reduce_f32(i32 %arg) {
-; SSE-LABEL: 'reduce_f32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_f32'
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE41-LABEL: 'reduce_f32'
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_f32'
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:22 Lat:22 SizeLat:27 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:38 Lat:38 SizeLat:47 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_f32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:21 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:15 Lat:28 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:21 Lat:42 SizeLat:48 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_f32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:12 Lat:24 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:15 Lat:31 SizeLat:24 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:21 Lat:45 SizeLat:36 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:8 Lat:8 SizeLat:8 for: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:12 Lat:10 SizeLat:12 for: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:16 Lat:13 SizeLat:16 for: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:19 Lat:17 SizeLat:19 for: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
   %V2  = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
@@ -100,28 +134,28 @@ define i32 @reduce_f32(i32 %arg) {
 
 define i32 @reduce_f64_fast(i32 %arg) {
 ; SSE-LABEL: 'reduce_f64_fast'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 9 for: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f64_fast'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 7 for: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f64_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:8 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef)
   %V2  = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
@@ -133,31 +167,31 @@ define i32 @reduce_f64_fast(i32 %arg) {
 
 define i32 @reduce_f32_fast(i32 %arg) {
 ; SSE-LABEL: 'reduce_f32_fast'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 11 for: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f32_fast'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 6 for: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 9 for: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f32_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:10 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:12 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call fast float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef)
   %V2  = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll
index c857104746094..23e11cd812f4d 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll
@@ -1,46 +1,62 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
 
 define i32 @reduce_f64(i32 %arg) {
-; SSE-LABEL: 'reduce_f64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_f64'
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE41-LABEL: 'reduce_f64'
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:13 Lat:13 SizeLat:13 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:25 Lat:25 SizeLat:25 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:49 Lat:49 SizeLat:49 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_f64'
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:11 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:17 Lat:17 SizeLat:21 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:33 Lat:33 SizeLat:41 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_f64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:11 Lat:21 SizeLat:22 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:17 Lat:35 SizeLat:42 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:11 Lat:23 SizeLat:18 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:17 Lat:37 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:8 Lat:6 SizeLat:8 for: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:12 Lat:9 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:15 Lat:12 SizeLat:15 for: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
   %V2  = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
@@ -51,41 +67,59 @@ define i32 @reduce_f64(i32 %arg) {
 }
 
 define i32 @reduce_f32(i32 %arg) {
-; SSE-LABEL: 'reduce_f32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'reduce_f32'
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE41-LABEL: 'reduce_f32'
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:7 Lat:7 SizeLat:7 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:14 Lat:14 SizeLat:14 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:20 Lat:20 SizeLat:20 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:32 Lat:32 SizeLat:32 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:56 Lat:56 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_f32'
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:22 Lat:22 SizeLat:27 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:38 Lat:38 SizeLat:47 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_f32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:7 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:14 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:21 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:15 Lat:28 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:21 Lat:42 SizeLat:48 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_f32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:8 SizeLat:6 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:16 SizeLat:12 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:12 Lat:24 SizeLat:18 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:15 Lat:31 SizeLat:24 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:21 Lat:45 SizeLat:36 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:8 Lat:8 SizeLat:8 for: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:12 Lat:10 SizeLat:12 for: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:16 Lat:13 SizeLat:16 for: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:19 Lat:17 SizeLat:19 for: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
   %V2  = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
@@ -100,28 +134,28 @@ define i32 @reduce_f32(i32 %arg) {
 
 define i32 @reduce_f64_fast(i32 %arg) {
 ; SSE-LABEL: 'reduce_f64_fast'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of 9 for: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f64_fast'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of 7 for: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f64_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:8 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef)
   %V2  = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
@@ -133,31 +167,31 @@ define i32 @reduce_f64_fast(i32 %arg) {
 
 define i32 @reduce_f32_fast(i32 %arg) {
 ; SSE-LABEL: 'reduce_f32_fast'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of 11 for: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_f32_fast'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 2 for: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 4 for: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 6 for: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 7 for: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of 9 for: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_f32_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:10 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:12 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call fast float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef)
   %V2  = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
index 14da7f5e539a2..06ff41e8f7101 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
@@ -1,70 +1,70 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
 
 define void @reduce_f64(double %arg) {
 ; SSE2-LABEL: 'reduce_f64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f64'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:22 SizeLat:6 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:44 SizeLat:12 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:24 Lat:88 SizeLat:24 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:30 Lat:78 SizeLat:30 for: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
   %V2  = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
@@ -76,67 +76,67 @@ define void @reduce_f64(double %arg) {
 
 define void @reduce_f32(float %arg) {
 ; SSE2-LABEL: 'reduce_f32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f32'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:46 SizeLat:14 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:28 Lat:92 SizeLat:28 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:56 Lat:184 SizeLat:56 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:47 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:30 Lat:94 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:60 Lat:188 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:47 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:30 Lat:94 SizeLat:30 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:60 Lat:188 SizeLat:60 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:15 Lat:39 SizeLat:15 for: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:31 Lat:79 SizeLat:31 for: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:62 CodeSize:62 Lat:158 SizeLat:62 for: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
   %V2  = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
@@ -151,60 +151,60 @@ define void @reduce_f32(float %arg) {
 
 define void @reduce_f64_fast(double %arg) {
 ; SSE2-LABEL: 'reduce_f64_fast'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f64_fast'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f64_fast'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f64_fast'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:11 SizeLat:3 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:21 SizeLat:5 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:41 SizeLat:9 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f64_fast'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:5 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:7 Lat:27 SizeLat:10 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f64_fast'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:27 SizeLat:10 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f64_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:2 for: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:17 SizeLat:6 for: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:21 SizeLat:7 for: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
   %V2  = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
@@ -216,67 +216,67 @@ define void @reduce_f64_fast(double %arg) {
 
 define void @reduce_f32_fast(float %arg) {
 ; SSE2-LABEL: 'reduce_f32_fast'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f32_fast'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE41-LABEL: 'reduce_f32_fast'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE41-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE41-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SSE42-LABEL: 'reduce_f32_fast'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:5 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:27 SizeLat:7 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:47 SizeLat:11 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX1-LABEL: 'reduce_f32_fast'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:7 Lat:23 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:9 Lat:33 SizeLat:12 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'reduce_f32_fast'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:33 SizeLat:12 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX512-LABEL: 'reduce_f32_fast'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:2 for: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:4 for: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:19 SizeLat:6 for: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:28 SizeLat:9 for: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V1  = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
   %V2  = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
index 93d32466136d7..cebc3e577297c 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -1,77 +1,77 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:22 Lat:22 SizeLat:22 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:42 Lat:42 SizeLat:42 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:58 CodeSize:82 Lat:82 SizeLat:82 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:22 Lat:22 SizeLat:22 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:30 CodeSize:42 Lat:42 SizeLat:42 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:58 CodeSize:82 Lat:82 SizeLat:82 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:22 Lat:22 SizeLat:22 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:42 Lat:42 SizeLat:42 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:50 CodeSize:82 Lat:82 SizeLat:82 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:12 SizeLat:12 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:23 Lat:23 SizeLat:23 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:42 Lat:38 SizeLat:43 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:80 Lat:68 SizeLat:83 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:12 SizeLat:10 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:23 SizeLat:19 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:27 Lat:33 SizeLat:32 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:43 Lat:53 SizeLat:58 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i64'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:10 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:25 SizeLat:19 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:28 Lat:36 SizeLat:33 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:45 SizeLat:41 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i64'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:10 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:25 SizeLat:19 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:28 Lat:36 SizeLat:33 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:45 SizeLat:41 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i64'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:19 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:5 Lat:35 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:51 SizeLat:13 for: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:8 Lat:66 SizeLat:16 for: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
@@ -83,52 +83,52 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:9 Lat:10 SizeLat:9 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:17 Lat:19 SizeLat:17 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:24 Lat:27 SizeLat:24 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:38 Lat:43 SizeLat:38 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:57 CodeSize:66 Lat:75 SizeLat:66 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:9 Lat:10 SizeLat:9 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:17 Lat:19 SizeLat:17 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:24 Lat:27 SizeLat:24 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:38 Lat:43 SizeLat:38 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:57 CodeSize:66 Lat:75 SizeLat:66 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:13 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:5 Lat:25 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:6 Lat:36 SizeLat:6 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:8 Lat:58 SizeLat:8 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:12 Lat:102 SizeLat:12 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:7 SizeLat:5 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:5 Lat:13 SizeLat:9 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:19 SizeLat:13 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:12 Lat:27 SizeLat:23 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:22 Lat:43 SizeLat:43 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:12 SizeLat:4 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:5 Lat:23 SizeLat:7 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:7 Lat:34 SizeLat:10 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:8 Lat:44 SizeLat:12 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:10 Lat:64 SizeLat:16 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:14 SizeLat:4 for: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:27 SizeLat:7 for: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:38 SizeLat:10 for: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:49 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:59 SizeLat:15 for: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
@@ -140,58 +140,58 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:13 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:34 SizeLat:10 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:54 SizeLat:14 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:8 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:14 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:20 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:26 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:14 Lat:34 SizeLat:15 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:24 Lat:50 SizeLat:27 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:13 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:19 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:25 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:10 Lat:30 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:12 Lat:40 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:29 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:11 Lat:35 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:12 Lat:36 SizeLat:13 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:29 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:11 Lat:35 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:12 Lat:40 SizeLat:13 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:23 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:29 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:11 Lat:35 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:12 Lat:36 SizeLat:13 for: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
@@ -204,64 +204,64 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:26 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:38 SizeLat:14 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:22 Lat:62 SizeLat:22 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:38 Lat:110 SizeLat:38 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:9 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:15 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:21 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:10 Lat:27 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:15 Lat:35 SizeLat:16 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:25 Lat:51 SizeLat:28 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:45 Lat:83 SizeLat:52 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:14 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:10 Lat:26 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:11 Lat:31 SizeLat:12 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:13 Lat:41 SizeLat:16 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:43 CodeSize:17 Lat:61 SizeLat:24 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:30 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:12 Lat:36 SizeLat:13 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:13 Lat:37 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:15 Lat:39 SizeLat:16 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:30 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:12 Lat:36 SizeLat:13 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:13 Lat:41 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:15 Lat:51 SizeLat:16 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:4 for: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:6 for: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:24 SizeLat:8 for: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:30 SizeLat:10 for: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:12 Lat:36 SizeLat:13 for: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:13 Lat:37 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:15 Lat:39 SizeLat:16 for: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
index 799a49fce26ba..4b82bb9c685e5 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
@@ -1,37 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
@@ -43,28 +43,28 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
@@ -76,31 +76,40 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:6 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:8 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:10 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:11 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:13 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
@@ -113,34 +122,44 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:11 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:13 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:14 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:16 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 11 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
@@ -154,70 +173,70 @@ define i32 @reduce_i8(i32 %arg) {
 
 define i32 @reduce_i1(i32 %arg) {
 ; SSE-LABEL: 'reduce_i1'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:6 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:10 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 2 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:3 SizeLat:4 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:5 SizeLat:8 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 11 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 13 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 14 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1   = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
   %V2   = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll
index e036a22ba2647..f11229e313d3a 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll
@@ -1,62 +1,62 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i64'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:6 Lat:16 SizeLat:8 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:10 Lat:30 SizeLat:14 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:18 Lat:58 SizeLat:26 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:6 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:7 Lat:17 SizeLat:11 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:13 Lat:26 SizeLat:23 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:25 Lat:44 SizeLat:47 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:24 SizeLat:12 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:38 SizeLat:18 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
@@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
@@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
@@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
@@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
 declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
 declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
 declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE41: {{.*}}
-; SSE42: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll
index c40f2fd2f2967..5ae31798de6c5 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll
@@ -1,62 +1,62 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i64'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:6 Lat:16 SizeLat:8 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:10 Lat:30 SizeLat:14 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:18 Lat:58 SizeLat:26 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:7 Lat:17 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:13 Lat:26 SizeLat:21 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:25 Lat:44 SizeLat:45 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:4 Lat:9 SizeLat:5 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:9 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:24 SizeLat:12 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:38 SizeLat:18 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
@@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:18 Lat:15 SizeLat:18 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:28 Lat:23 SizeLat:28 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:48 Lat:39 SizeLat:48 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
@@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
@@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:6 SizeLat:7 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:11 SizeLat:13 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:19 Lat:16 SizeLat:19 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:25 Lat:21 SizeLat:25 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:30 Lat:25 SizeLat:30 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:40 Lat:33 SizeLat:40 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:60 Lat:49 SizeLat:60 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
@@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
 declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
 declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
 declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE41: {{.*}}
-; SSE42: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll
index 5b0e0cd103843..3025192ddf35b 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll
@@ -1,62 +1,62 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i64'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:8 Lat:13 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:14 Lat:24 SizeLat:16 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:26 Lat:46 SizeLat:30 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:50 Lat:90 SizeLat:58 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:7 Lat:10 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:13 Lat:19 SizeLat:17 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:29 SizeLat:34 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:49 SizeLat:68 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:10 SizeLat:8 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:19 SizeLat:15 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:18 Lat:27 SizeLat:23 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:28 Lat:43 SizeLat:39 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
@@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
@@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
@@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
@@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
 declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
 declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
 declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE41: {{.*}}
-; SSE42: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll
index acd38421ba937..51d06a925f4c9 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll
@@ -1,62 +1,62 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:17 Lat:10 SizeLat:17 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:32 Lat:18 SizeLat:32 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:62 Lat:34 SizeLat:62 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:122 Lat:66 SizeLat:122 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i64'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:8 Lat:13 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:14 Lat:24 SizeLat:16 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:26 Lat:46 SizeLat:30 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:50 Lat:90 SizeLat:58 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:7 Lat:10 SizeLat:9 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:13 Lat:19 SizeLat:17 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:29 SizeLat:34 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:49 SizeLat:68 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:7 Lat:10 SizeLat:8 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:13 Lat:19 SizeLat:15 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:18 Lat:27 SizeLat:23 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:28 Lat:43 SizeLat:39 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:11 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:15 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
@@ -68,52 +68,52 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:10 Lat:7 SizeLat:10 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:19 Lat:13 SizeLat:19 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:27 Lat:18 SizeLat:27 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:43 Lat:28 SizeLat:43 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:75 Lat:48 SizeLat:75 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i32'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:12 Lat:13 SizeLat:13 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:22 Lat:25 SizeLat:25 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:10 for: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:11 for: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
@@ -125,76 +125,76 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 9 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:18 Lat:18 SizeLat:18 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:30 Lat:30 SizeLat:30 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i16'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 11 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:11 Lat:12 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:21 Lat:24 SizeLat:24 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:7 SizeLat:8 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 4 for: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:12 for: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:16 Lat:22 SizeLat:17 for: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
@@ -207,84 +207,84 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE4-LABEL: 'reduce_i8'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 7 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 9 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of 13 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; SSE4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:13 Lat:14 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:23 Lat:26 SizeLat:26 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:9 SizeLat:10 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:11 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 10 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 11 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 8 for: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:14 for: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:18 Lat:24 SizeLat:19 for: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
@@ -322,6 +322,3 @@ declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
 declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
 declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
 declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE41: {{.*}}
-; SSE42: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll
index 757db4a5a41be..c9cb6ef547e86 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll
@@ -1,37 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 
 define i32 @reduce_i64(i32 %arg) {
 ; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 4 for: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:11 for: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:8 for: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
@@ -43,28 +43,28 @@ define i32 @reduce_i64(i32 %arg) {
 
 define i32 @reduce_i32(i32 %arg) {
 ; SSE-LABEL: 'reduce_i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 6 for: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of 12 for: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:8 Lat:8 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:13 for: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:5 for: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:14 SizeLat:10 for: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
   %V4  = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
@@ -76,31 +76,40 @@ define i32 @reduce_i32(i32 %arg) {
 
 define i32 @reduce_i16(i32 %arg) {
 ; SSE-LABEL: 'reduce_i16'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 8 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of 14 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:6 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:8 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:10 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:11 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:13 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:10 Lat:10 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:15 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:7 for: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:16 SizeLat:12 for: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2  = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef)
   %V4  = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
@@ -113,34 +122,44 @@ define i32 @reduce_i16(i32 %arg) {
 
 define i32 @reduce_i8(i32 %arg) {
 ; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:11 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:13 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:14 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:16 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
-; AVX-LABEL: 'reduce_i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 11 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:12 Lat:12 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:14 SizeLat:17 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:7 for: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:9 for: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:11 Lat:15 SizeLat:11 for: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:13 Lat:17 SizeLat:13 for: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:14 Lat:18 SizeLat:14 for: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
@@ -154,92 +173,92 @@ define i32 @reduce_i8(i32 %arg) {
 
 define i32 @reduce_i1(i32 %arg) {
 ; SSE2-LABEL: 'reduce_i1'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:19 Lat:19 SizeLat:19 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:46 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:52 Lat:52 SizeLat:52 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSSE3-LABEL: 'reduce_i1'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SSE42-LABEL: 'reduce_i1'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 10 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 12 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of 16 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX1-LABEL: 'reduce_i1'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:46 CodeSize:46 Lat:46 SizeLat:51 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:47 CodeSize:47 Lat:47 SizeLat:53 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:49 Lat:49 SizeLat:57 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX2-LABEL: 'reduce_i1'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 3 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 5 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 7 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of 9 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:26 Lat:26 SizeLat:31 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:27 Lat:27 SizeLat:33 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:29 Lat:29 SizeLat:37 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512F-LABEL: 'reduce_i1'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 6 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 19 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 52 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 133 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of 134 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:136 CodeSize:136 Lat:135 SizeLat:135 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:140 CodeSize:140 Lat:136 SizeLat:136 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i1'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512BW-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 6 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 19 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 52 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 133 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 326 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 775 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of 776 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'reduce_i1'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512DQ-NEXT:  Cost Model: Found costs of 1 for: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 6 for: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 19 for: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 52 for: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 133 for: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of 134 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:136 CodeSize:136 Lat:135 SizeLat:135 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:140 CodeSize:140 Lat:136 SizeLat:136 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %V1   = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
   %V2   = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index 4ad0887a27884..0e0ad7e14f4eb 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -1,54 +1,62 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mcpu=slm | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -mcpu=slm | FileCheck %s --check-prefixes=SLM
 
 ; These are old tests for matching reduction costs from extract elements - something that has now been removed.
 
 define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SSE2-LABEL: 'reduction_cost_float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSSE3-LABEL: 'reduction_cost_float'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSE42-LABEL: 'reduction_cost_float'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
-;
-; AVX-LABEL: 'reduction_cost_float'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
+;
+; AVX1-LABEL: 'reduction_cost_float'
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
+;
+; AVX2-LABEL: 'reduction_cost_float'
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SLM-LABEL: 'reduction_cost_float'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
   %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
@@ -61,44 +69,44 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 
 define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
 ; SSE-LABEL: 'reduction_cost_int'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; AVX1-LABEL: 'reduction_cost_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; AVX2-LABEL: 'reduction_cost_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; SLM-LABEL: 'reduction_cost_int'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
   %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
   <8 x i32> <i32 4    , i32     5, i32     6, i32     7,
@@ -119,59 +127,70 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
 
 define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'pairwise_hadd'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SSSE3-LABEL: 'pairwise_hadd'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SSE42-LABEL: 'pairwise_hadd'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
-;
-; AVX-LABEL: 'pairwise_hadd'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
+;
+; AVX1-LABEL: 'pairwise_hadd'
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %r2 = fadd float %r, %f1
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
+;
+; AVX2-LABEL: 'pairwise_hadd'
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %r2 = fadd float %r, %f1
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SLM-LABEL: 'pairwise_hadd'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
   <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
@@ -191,59 +210,70 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 
 define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'pairwise_hadd_assoc'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SSSE3-LABEL: 'pairwise_hadd_assoc'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SSE42-LABEL: 'pairwise_hadd_assoc'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
-;
-; AVX-LABEL: 'pairwise_hadd_assoc'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
+;
+; AVX1-LABEL: 'pairwise_hadd_assoc'
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %r2 = fadd float %r, %f1
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
+;
+; AVX2-LABEL: 'pairwise_hadd_assoc'
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %r2 = fadd float %r, %f1
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SLM-LABEL: 'pairwise_hadd_assoc'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
   <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
@@ -263,54 +293,64 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 
 define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'pairwise_hadd_skip_first'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SSSE3-LABEL: 'pairwise_hadd_skip_first'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SSE42-LABEL: 'pairwise_hadd_skip_first'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
-;
-; AVX-LABEL: 'pairwise_hadd_skip_first'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
+;
+; AVX1-LABEL: 'pairwise_hadd_skip_first'
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %r2 = fadd float %r, %f1
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
+;
+; AVX2-LABEL: 'pairwise_hadd_skip_first'
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %r2 = fadd float %r, %f1
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
 ; SLM-LABEL: 'pairwise_hadd_skip_first'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %r2 = fadd float %r, %f1
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r2
 ;
   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
   <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
@@ -328,34 +368,40 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 
 define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
 ; SSE2-LABEL: 'no_pairwise_reduction2double'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction2double'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction2double'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
-;
-; AVX-LABEL: 'no_pairwise_reduction2double'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
+;
+; AVX1-LABEL: 'no_pairwise_reduction2double'
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
+;
+; AVX2-LABEL: 'no_pairwise_reduction2double'
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction2double'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
   %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
@@ -366,44 +412,52 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1
 
 define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'no_pairwise_reduction4float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction4float'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction4float'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
-;
-; AVX-LABEL: 'no_pairwise_reduction4float'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
+;
+; AVX1-LABEL: 'no_pairwise_reduction4float'
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
+;
+; AVX2-LABEL: 'no_pairwise_reduction4float'
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4float'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
   %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
@@ -416,52 +470,52 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 
 define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SSE2-LABEL: 'no_pairwise_reduction4double'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction4double'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction4double'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction4double'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction4double'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4double'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
   %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
@@ -474,64 +528,64 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 
 define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'no_pairwise_reduction8float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8float'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction8float'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction8float'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction8float'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction8float'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
   %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
   %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
@@ -545,17 +599,23 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 }
 
 define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
-; CHECK-LABEL: 'no_pairwise_reduction2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SSE-LABEL: 'no_pairwise_reduction2i64'
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx, i32 0
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
+;
+; AVX-LABEL: 'no_pairwise_reduction2i64'
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx, i32 0
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction2i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
   %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
@@ -566,20 +626,20 @@ define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
 
 define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; CHECK-LABEL: 'no_pairwise_reduction4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4i32'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
   %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
@@ -592,36 +652,36 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 
 define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; SSE-LABEL: 'no_pairwise_reduction4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction4i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction4i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
   %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
@@ -634,54 +694,54 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 
 define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-LABEL: 'no_pairwise_reduction8i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction8i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; AVX-LABEL: 'no_pairwise_reduction8i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction8i16'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
   %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
   %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
@@ -696,44 +756,44 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 
 define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SSE-LABEL: 'no_pairwise_reduction8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction8i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction8i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction8i32'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
   %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
   %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
@@ -748,39 +808,46 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 
 define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
 ; SSE2-LABEL: 'pairwise_reduction2double'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction2double'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction2double'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
-;
-; AVX-LABEL: 'pairwise_reduction2double'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
+;
+; AVX1-LABEL: 'pairwise_reduction2double'
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
+;
+; AVX2-LABEL: 'pairwise_reduction2double'
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SLM-LABEL: 'pairwise_reduction2double'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
   %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -792,54 +859,64 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
 
 define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'pairwise_reduction4float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction4float'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction4float'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
-;
-; AVX-LABEL: 'pairwise_reduction4float'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
+;
+; AVX1-LABEL: 'pairwise_reduction4float'
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
+;
+; AVX2-LABEL: 'pairwise_reduction4float'
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4float'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -854,64 +931,64 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 
 define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SSE2-LABEL: 'pairwise_reduction4double'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction4double'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction4double'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction4double'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction4double'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4double'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:2 for: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret double %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -926,82 +1003,82 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 
 define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'pairwise_reduction8float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8float'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8float'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction8float'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:5 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction8float'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8float'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float %r
 ;
   %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
   %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1018,19 +1095,26 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 }
 
 define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
-; CHECK-LABEL: 'pairwise_reduction2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SSE-LABEL: 'pairwise_reduction2i64'
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
+;
+; AVX-LABEL: 'pairwise_reduction2i64'
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx8, i32 0
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction2i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:2 SizeLat:2 for: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <2 x i64> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
   %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
   %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
@@ -1042,24 +1126,24 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
 
 define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; CHECK-LABEL: 'pairwise_reduction4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4i32'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -1074,44 +1158,44 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 
 define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; SSE-LABEL: 'pairwise_reduction4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction4i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction4i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:2 Lat:4 SizeLat:4 for: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -1126,69 +1210,69 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 
 define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-LABEL: 'pairwise_reduction8i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SSE2-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSSE3-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SSSE3-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSSE3-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE42-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SSE42-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; AVX-LABEL: 'pairwise_reduction8i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
+; AVX-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; AVX-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8i16'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r
 ;
   %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
   %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1206,56 +1290,56 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 
 define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SSE-LABEL: 'pairwise_reduction8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SSE-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SSE-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; SSE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction8i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 4 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX1-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
+; AVX1-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction8i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+; AVX2-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
+; AVX2-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8i32'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+; SLM-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 2 for: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+; SLM-NEXT:  Cost Model: Found costs of 0 for: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
+; SLM-NEXT:  Cost Model: Found costs of 1 for: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; SLM-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
 ;
   %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
   %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>