From f23e6b57e2f61c53cf464656778d71fb6e226ac1 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 6 May 2025 12:25:52 -0700 Subject: [PATCH] [AMDGPU] Change -cl-std to CL2.0 in builtins-amdgcn-gfx950.cl. NFC. We are universally using CL2.0 in our builtin tests. --- .../CodeGenOpenCL/builtins-amdgcn-gfx950.cl | 1589 +++++++++-------- 1 file changed, 879 insertions(+), 710 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 8251d6c213e3d..8f48f12ed73d4 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s // REQUIRES: amdgpu-registered-target #pragma OPENCL EXTENSION cl_khr_fp16 : enable @@ -21,11 +21,13 @@ typedef float __attribute__((ext_vector_type(32))) float32; // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 // CHECK-NEXT: ret void // @@ -38,35 +40,38 @@ void test_prng_b32(global uint* out, uint a) { // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// 
CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 // CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 // CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) // CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 // CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 // CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 // CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 -// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true) // CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 // CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 // CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 // CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 -// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 // CHECK-NEXT: ret void // @@ -81,35 +86,38 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) { // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr 
addrspace(5) [[SRC_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 // CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 // CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) // CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 // CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 // CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 // CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 -// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true) // CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 // CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 // CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 // CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 -// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 // CHECK-NEXT: ret void // @@ -127,43 +135,49 @@ void test_permlane32_swap(global uint2* out, uint old, uint src) { // CHECK-NEXT: [[SRC0F32_ADDR:%.*]] = alloca <16 x 
float>, align 64, addrspace(5) // CHECK-NEXT: [[SRC1F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8 -// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64 -// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64 -// CHECK-NEXT: store <16 x float> [[SRC0F32:%.*]], ptr addrspace(5) [[SRC0F32_ADDR]], align 64 -// CHECK-NEXT: store <16 x float> [[SRC1F32:%.*]], ptr addrspace(5) [[SRC1F32_ADDR]], align 64 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT6_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT6_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF32_ADDR]] to ptr +// CHECK-NEXT: [[SRCH32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH32_ADDR]] to ptr +// CHECK-NEXT: [[SRC0F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0F32_ADDR]] to ptr +// CHECK-NEXT: [[SRC1F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1F32_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr [[OUT6_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr [[SRCBF32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr [[SRCH32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store <16 x float> [[SRC0F32:%.*]], ptr [[SRC0F32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store <16 x float> [[SRC1F32:%.*]], ptr [[SRC1F32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr [[SRCBF32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], float [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 32 -// CHECK-NEXT: [[TMP4:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load <32 x half>, ptr [[SRCH32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP4]], float [[TMP5]]) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 32 -// CHECK-NEXT: [[TMP8:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load <32 x bfloat>, ptr 
[[SRCBF32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> [[TMP8]], float [[TMP9]]) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 32 -// CHECK-NEXT: [[TMP12:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 -// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load <32 x half>, ptr [[SRCH32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]]) -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32 -// CHECK-NEXT: [[TMP16:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64 -// CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load <16 x float>, ptr [[SRC0F32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr [[SRC1F32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> [[TMP16]], <16 x float> [[TMP17]], float [[TMP18]]) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 32 -// CHECK-NEXT: [[TMP21:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64 -// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64 -// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load <16 x float>, ptr [[SRC0F32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, ptr [[SRC1F32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP24:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> [[TMP21]], <16 x float> [[TMP22]], float [[TMP23]]) -// CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP24]], ptr addrspace(1) [[TMP25]], align 32 // CHECK-NEXT: ret void // @@ -183,16 +197,20 @@ void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, f // CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 
-// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32 -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -206,16 +224,20 @@ void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// 
CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32 -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -229,17 +251,21 @@ void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) // CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4 -// CHECK-NEXT: store <2 x i16> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4 -// CHECK-NEXT: store <2 x i16> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[FC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FC_ADDR]] to ptr +// CHECK-NEXT: [[V2SSA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSA_ADDR]] to ptr +// CHECK-NEXT: [[V2SSB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSB_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[FC:%.*]], ptr [[FC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[V2SSA:%.*]], ptr [[V2SSA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[V2SSB:%.*]], ptr [[V2SSB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[V2SSA_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat> -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[V2SSB_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x bfloat> -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[FC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[TMP1]], <2 x bfloat> [[TMP3]], float [[TMP4]], i1 false) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 // CHECK-NEXT: ret void // @@ -253,15 +279,19 @@ void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 // CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) // CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4 -// CHECK-NEXT: store <2 x bfloat> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4 -// CHECK-NEXT: store <2 x 
bfloat> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSA_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSB_ADDR]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[FC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FC_ADDR]] to ptr +// CHECK-NEXT: [[V2SSA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSA_ADDR]] to ptr +// CHECK-NEXT: [[V2SSB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSB_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[FC:%.*]], ptr [[FC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x bfloat> [[V2SSA:%.*]], ptr [[V2SSA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x bfloat> [[V2SSB:%.*]], ptr [[V2SSB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr [[V2SSA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[V2SSB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[FC_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.fdot2c.f32.bf16(<2 x bfloat> [[TMP0]], <2 x bfloat> [[TMP1]], float [[TMP2]], i1 false) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -274,64 +304,67 @@ void builtins_amdgcn_dl_dot2c(global float *out, float fC, bfloat2 v2ssA, bfloat // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false) -// CHECK-NEXT: [[TMP5:%.*]] = load 
ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false) -// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false) -// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4 -// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 
-// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true) -// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4 -// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4 -// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true) -// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4 -// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4 -// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true) -// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4 -// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4 -// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true) -// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4 // CHECK-NEXT: ret void // @@ -352,28 +385,31 @@ void test_cvt_scalef32_f16_fp8(global half2* out, uint src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP0]], float [[TMP1]], i32 0) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP4]], float [[TMP5]], i32 1) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP8]], float [[TMP9]], i32 2) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load float, 
ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP12]], float [[TMP13]], i32 3) -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4 // CHECK-NEXT: ret void // @@ -390,64 +426,67 @@ void test_cvt_scalef32_f32_fp8(global float* out, uint src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr 
addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false) -// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false) -// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4 -// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true) -// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4 -// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4 -// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP34:%.*]] = 
call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true) -// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4 -// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4 -// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true) -// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4 -// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4 -// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true) -// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4 // CHECK-NEXT: ret void // @@ -468,28 +507,31 @@ void test_cvt_scalef32_f16_bf8(global half2* out, uint src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr 
[[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP0]], float [[TMP1]], i32 0) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP4]], float [[TMP5]], i32 1) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP8]], float [[TMP9]], i32 2) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP12]], float [[TMP13]], i32 3) -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4 // CHECK-NEXT: ret void // @@ -507,25 +549,29 @@ void test_cvt_scalef32_f32_bf8(global float* out, uint src, float scale) // CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// 
CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 // CHECK-NEXT: ret void // @@ -541,25 +587,29 @@ void test_cvt_scalef32_pk_fp8_f32(global short2* out, float src0, float src1, fl // CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load 
ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 // CHECK-NEXT: ret void // @@ -575,18 +625,21 @@ void test_cvt_scalef32_pk_bf8_f32(global short2* out, float src0, float src1, fl // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr 
addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP0]], float [[TMP1]], i1 true) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP4]], float [[TMP5]], i1 false) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 // CHECK-NEXT: ret void // @@ -601,18 +654,21 @@ void test_cvt_scalef32_pk_f32_fp8(global float2* out, unsigned int src, float sc // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP0]], float [[TMP1]], i1 true) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr 
addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP4]], float [[TMP5]], i1 false) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 // CHECK-NEXT: ret void // @@ -627,22 +683,25 @@ void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float sc // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr 
[[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 // CHECK-NEXT: ret void // @@ -657,22 +716,25 @@ void test_cvt_scalef32_pk_fp8_f16(global short2* out, half2 src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr 
addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 // CHECK-NEXT: ret void // @@ -687,22 +749,25 @@ void test_cvt_scalef32_pk_fp8_bf16(global short2* out, bfloat2 src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 // CHECK-NEXT: ret void // @@ -717,22 +782,25 @@ void test_cvt_scalef32_pk_bf8_f16(global short2* out, half2 src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) // 
CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 // CHECK-NEXT: ret void // @@ -747,28 +815,31 @@ void test_cvt_scalef32_pk_bf8_bf16(global short2* out, bfloat2 src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 
-// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP0]], float [[TMP1]], i32 0) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP4]], float [[TMP5]], i32 1) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP8]], float [[TMP9]], i32 2) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP12]], float [[TMP13]], i32 3) -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x float> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 // CHECK-NEXT: ret void // @@ -786,41 +857,45 @@ void test_cvt_scalef32_pk_f32_fp4(global float2* out, uint src, float scale) // CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, 
align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, 
ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP15]], float [[TMP16]], float [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP22]], float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 3) -// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 // CHECK-NEXT: ret void // @@ -837,28 +912,31 @@ void test_cvt_scalef32_pk_fp4_f32(global unsigned int* out, float src0, float sr // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 
x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3) -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4 // CHECK-NEXT: ret void // @@ -875,28 +953,31 @@ void test_cvt_scalef32_pk_f16_fp4(global half2* out, uint src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr 
[[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3) -// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4 // CHECK-NEXT: ret void // @@ -913,18 +994,21 @@ void test_cvt_scalef32_pk_bf16_fp4(global bfloat2* out, uint src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: [[TMP1:%.*]] 
= load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> [[TMP0]], float [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <32 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 128 -// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> [[TMP4]], float [[TMP5]]) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <32 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 128 // CHECK-NEXT: ret void // @@ -939,18 +1023,21 @@ void test_cvt_scalef32_pk_f32_fp6(global float32* out, uint6 src, float scale) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = 
load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <32 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64 -// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP4]], float [[TMP5]]) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <32 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64 // CHECK-NEXT: ret void // @@ -965,18 +1052,21 @@ void test_cvt_scalef32_pk32_f16_fpbf6(global half32 *out, uint6 src, float scale // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> [[TMP0]], float [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <32 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64 -// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> [[TMP4]], float [[TMP5]]) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <32 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64 // CHECK-NEXT: ret void // @@ -991,18 +1081,21 @@ void test_cvt_scalef32_pk32_bf16_fpbf6(global bfloat32 *out, uint6 src, float sc 
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 // CHECK-NEXT: ret void // @@ -1017,18 +1110,21 @@ void test_cvt_scalef32_pk_f16_fp8(global half2* out, unsigned int src, float sca // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 
[[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 // CHECK-NEXT: ret void // @@ -1043,18 +1139,21 @@ void test_cvt_scalef32_pk_f16_bf8(global half2* out, unsigned int src, float sca // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> 
@llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 // CHECK-NEXT: ret void // @@ -1069,18 +1168,21 @@ void test_cvt_scalef32_pk_bf16_fp8(global bfloat2* out, unsigned int src, float // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false) -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 // CHECK-NEXT: ret void // @@ -1095,36 +1197,39 @@ void test_cvt_scalef32_pk_bf16_bf8(global bfloat2* out, unsigned int src, float // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) 
[[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i32 0) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i32 1) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP13]], <2 x half> [[TMP14]], float [[TMP15]], i32 2) -// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP19]], <2 x half> [[TMP20]], float [[TMP21]], i32 3) -// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4 // CHECK-NEXT: ret void // @@ -1142,37 +1247,41 @@ void test_cvt_scalef32_pk_fp4_f16(global unsigned int* out, half2 src, float sca // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i32 0) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: 
[[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i32 1) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP13]], <2 x bfloat> [[TMP14]], float [[TMP15]], i32 2) -// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP19]], <2 x bfloat> [[TMP20]], float [[TMP21]], i32 3) -// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4 // CHECK-NEXT: ret void // @@ -1190,41 +1299,45 @@ void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP8]], <2 x half> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 
@llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP15]], <2 x half> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP22]], <2 x half> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3) -// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 // CHECK-NEXT: ret void // @@ -1242,41 +1355,45 @@ void test_cvt_scalef32_sr_pk_fp4_f16(global unsigned *out, half2 src, uint seed, // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, 
ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP8]], <2 x bfloat> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP15]], <2 x bfloat> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = load <2 x bfloat>, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = 
load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP22]], <2 x bfloat> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3) -// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 // CHECK-NEXT: ret void // @@ -1294,41 +1411,45 @@ void test_cvt_scalef32_sr_pk_fp4_bf16(global unsigned *out, bfloat2 src, uint se // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x float>, align 8, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store <2 x float> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x float> [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP1]], <2 x float> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr 
[[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP8]], <2 x float> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP15]], <2 x float> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = load <2 x float>, ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP22]], <2 x float> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3) -// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 // CHECK-NEXT: ret void // @@ -1348,47 +1469,53 @@ void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned *out, float2 src, uint seed // CHECK-NEXT: [[SRCF32_ADDR:%.*]] = alloca <32 x float>, align 128, addrspace(5) // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8 -// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64 -// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 
64 -// CHECK-NEXT: store <32 x float> [[SRCF32:%.*]], ptr addrspace(5) [[SRCF32_ADDR]], align 128 -// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: store float [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[OUT6_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT6_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF32_ADDR]] to ptr +// CHECK-NEXT: [[SRCH32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH32_ADDR]] to ptr +// CHECK-NEXT: [[SRCF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF32_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr [[OUT6_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr [[SRCBF32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr [[SRCH32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store <32 x float> [[SRCF32:%.*]], ptr [[SRCF32_ADDR_ASCAST]], align 128 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr [[SRCBF32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]]) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 32 -// CHECK-NEXT: [[TMP5:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 -// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <32 x half>, ptr [[SRCH32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP8:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP5]], i32 [[TMP6]], float [[TMP7]]) -// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 32 -// CHECK-NEXT: [[TMP10:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128 -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load <32 x float>, ptr [[SRCF32_ADDR_ASCAST]], align 128 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr 
[[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP13:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> [[TMP10]], i32 [[TMP11]], float [[TMP12]]) -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 32 -// CHECK-NEXT: [[TMP15:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 -// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load <32 x bfloat>, ptr [[SRCBF32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP18:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> [[TMP15]], i32 [[TMP16]], float [[TMP17]]) -// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 32 -// CHECK-NEXT: [[TMP20:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 -// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load <32 x half>, ptr [[SRCH32_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP23:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP20]], i32 [[TMP21]], float [[TMP22]]) -// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 32 -// CHECK-NEXT: [[TMP25:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128 -// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 -// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load <32 x float>, ptr [[SRCF32_ADDR_ASCAST]], align 128 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[SRC2_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP28:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]]) -// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr [[OUT6_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <6 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 32 // CHECK-NEXT: ret void // @@ -1408,33 +1535,37 @@ void test_cvt_scalef32_sr_pk32(global uint6 *out6, bfloat32 srcbf32, half32 srch // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = 
alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, 
ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 // CHECK-NEXT: ret void // @@ -1451,33 +1582,37 @@ void test_cvt_scalef32_sr_bf8_bf16(global unsigned *out, __bf16 src, uint seed, // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), 
ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load half, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 // CHECK-NEXT: ret void // @@ -1494,33 +1629,37 @@ void test_cvt_scalef32_sr_bf8_f16(global unsigned *out, half src, uint seed, flo // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr 
[[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 // CHECK-NEXT: ret void // @@ -1537,33 +1676,37 @@ void test_cvt_scalef32_sr_bf8_f32(global unsigned *out, float src, uint seed, fl // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) // 
CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] 
= load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 // CHECK-NEXT: ret void // @@ -1580,33 +1723,37 @@ void test_cvt_scalef32_sr_fp8_bf16(global unsigned *out, __bf16 src, uint seed, // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr 
addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load half, ptr [[SRC_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 // CHECK-NEXT: ret void // @@ -1623,33 +1770,37 @@ void test_cvt_scalef32_sr_fp8_f16(global unsigned *out, half src, uint seed, flo // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// 
CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) -// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) -// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 // CHECK-NEXT: ret void // @@ -1666,15 +1817,19 @@ void test_cvt_scalef32_sr_fp8_f32(global 
unsigned *out, float src, uint seed, fl // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 -// CHECK-NEXT: store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4 -// CHECK-NEXT: store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1689,15 +1844,19 @@ void test_bitop3_b32(global uint* out, uint a, uint b, uint c) // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5) // CHECK-NEXT: [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5) // CHECK-NEXT: [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store i16 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2 -// CHECK-NEXT: store i16 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 2 -// CHECK-NEXT: store i16 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(5) [[B_ADDR]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[C_ADDR]], align 2 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i16 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i16 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = 
load i16, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[C_ADDR_ASCAST]], align 2 // CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2 // CHECK-NEXT: ret void // @@ -1711,22 +1870,25 @@ void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 
x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 // CHECK-NEXT: ret void // @@ -1741,22 +1903,25 @@ void test_cvt_sr_bf16_f32(global bfloat2 *out, float src, uint seed) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SEED:%.*]], ptr [[SEED_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false) -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true) -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 // CHECK-NEXT: ret void // @@ -1770,10 +1935,12 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed) // CHECK-NEXT: entry: // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr 
addrspace(5) [[DST_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 // CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0) // CHECK-NEXT: ret void // @@ -1785,10 +1952,12 @@ void test_global_load_lds_96(global void* src, local void *dst) { // CHECK-NEXT: entry: // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 // CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0) // CHECK-NEXT: ret void //
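
Reviewer note, not part of the patch: the regenerated checks above all follow one pattern, and a minimal OpenCL sketch (hypothetical function name, not from the test file) may help when reading them. Under -cl-std=CL2.0 the unqualified pointer type defaults to the generic address space, so for every private alloca (addrspace(5)) clang now emits an addrspacecast to a generic pointer and performs the loads and stores through that cast, which is exactly the [[*_ADDR_ASCAST]] lines added in each hunk; under CL1.2 the accesses stayed directly in addrspace(5).

    // Illustrative only; compiled with -cl-std=CL2.0 for amdgcn this
    // produces the alloca + addrspacecast pattern seen in the checks.
    void example(global uint *out, uint a) {
      uint tmp = a;   // parameter copy: alloca in addrspace(5), accessed
                      // through an addrspacecast-to-generic pointer
      *out = tmp;     // the global pointee itself stays in addrspace(1)
    }

The stores to the global pointee (addrspace(1)) are unchanged by the switch; only the accesses to function-local stack slots gain the cast, which is why the diff touches every [[*_ADDR]] load/store but leaves the intrinsic calls and final global stores intact.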