Commit 46686b7
[CIR][CIRGen][Builtin][X86] Lower AVX masked load intrinsics (#1763)
For these intrinsics there seems to be only one function whose emitted IR diverges: `_mm_load_sbh` loads a single 16-bit bfloat (`__bf16`) value from memory into the lowest element of a 128-bit bfloat vector (`__m128bh`), leaving the remaining lanes either unchanged or filled with a pass-through value. It is implemented as a masked load with only the first lane enabled ([source for intrinsics with similar behaviour](https://gist.github.com/leopck/86799fee6ceb9649d0ebe32c1c6e5b85)).

In the CIR lowering of `_mm_load_sbh`, we currently emit the mask operand of the intrinsic (`llvm.masked.load`) as an explicit constant vector:

```llvm
<8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>
```

whereas OG lowers it to:

```llvm
<8 x i1> bitcast (<1 x i8> splat (i8 1) to <8 x i1>)
```

I believe the two forms are semantically equal, so: is it acceptable for CIR and OG to diverge in this way for masked loads, or should we aim for parity in how the mask is represented, even if that reduces readability in CIR?
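To make the claimed equivalence concrete, here is a minimal sketch (plain standard C++, not from the patch) of how an 8-bit integer mask relates to the `<8 x i1>` lane mask: bit *i* of the byte enables lane *i*, so `splat (i8 1)` bitcast to `<8 x i1>` and the explicit `<i1 true, i1 false, ...>` constant describe the same set of lanes.

```cpp
// Minimal sketch: expanding an 8-bit lane mask into per-lane booleans,
// mirroring how a <1 x i8> value bitcast to <8 x i1> is interpreted.
#include <array>
#include <cstdint>

std::array<bool, 8> expandMask(uint8_t mask) {
  std::array<bool, 8> lanes{};
  for (int i = 0; i < 8; ++i)
    lanes[i] = (mask >> i) & 1; // lane i is enabled iff bit i is set
  return lanes;
}

// expandMask(1) yields {true, false, false, false, false, false, false, false},
// i.e. exactly the constant vector CIR emits for _mm_load_sbh.
```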
1 parent 2ccc77e commit 46686b7

File tree

8 files changed (+642, -3 lines)


clang/lib/CIR/CodeGen/CIRGenBuilder.h

Lines changed: 28 additions & 0 deletions
```diff
@@ -895,6 +895,34 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return CIRBaseBuilderTy::createStore(loc, flag, dst);
   }
 
+  /// Create a call to a Masked Load intrinsic.
+  /// \p loc - expression location
+  /// \p ty - vector type to load
+  /// \p ptr - base pointer for the load
+  /// \p alignment - alignment of the source location
+  /// \p mask - vector of booleans which indicates what vector lanes should
+  ///           be accessed in memory
+  /// \p passThru - pass-through value that is used to fill the masked-off
+  ///               lanes of the result
+  mlir::Value createMaskedLoad(mlir::Location loc, mlir::Type ty,
+                               mlir::Value ptr, llvm::Align alignment,
+                               mlir::Value mask, mlir::Value passThru) {
+
+    assert(mlir::isa<cir::VectorType>(ty) && "Type should be vector");
+    assert(mask && "Mask should not be all-ones (null)");
+
+    if (!passThru)
+      passThru = this->getConstant(loc, cir::PoisonAttr::get(ty));
+
+    mlir::Value ops[] = {ptr, this->getUInt32(int32_t(alignment.value()), loc),
+                         mask, passThru};
+
+    return create<cir::LLVMIntrinsicCallOp>(loc, getStringAttr("masked.load"),
+                                            ty, ops)
+        .getResult();
+  }
+
   /// Create a call to a masked store intrinsic.
   /// \p loc - expression location
   /// \p val - data to be stored
```
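As a hedged usage sketch of the new builder helper (the names `builder`, `loc`, `vecTy`, `basePtr`, and `maskVec` below are assumed to be in scope; this is not code from the commit): passing a null pass-through value makes the helper substitute a poison vector of the result type for the masked-off lanes.

```cpp
// Hypothetical caller of createMaskedLoad; everything except the helper
// itself is an assumed local. A null passThru defaults the masked-off
// lanes to poison, per the `if (!passThru)` branch above.
mlir::Value loaded = builder.createMaskedLoad(loc, vecTy, basePtr,
                                              llvm::Align(1), maskVec,
                                              /*passThru=*/nullptr);
```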

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 21 additions & 3 deletions
```diff
@@ -105,6 +105,19 @@ static mlir::Value emitX86MaskedStore(CIRGenFunction &cgf,
                             maskVec);
 }
 
+static mlir::Value emitX86MaskedLoad(CIRGenFunction &cgf,
+                                     ArrayRef<mlir::Value> ops,
+                                     llvm::Align alignment,
+                                     mlir::Location loc) {
+  mlir::Type ty = ops[1].getType();
+  mlir::Value ptr = ops[0];
+  mlir::Value maskVec =
+      getMaskVecValue(cgf, ops[2], cast<cir::VectorType>(ty).getSize(), loc);
+
+  return cgf.getBuilder().createMaskedLoad(loc, ty, ptr, alignment, maskVec,
+                                           ops[1]);
+}
+
 static mlir::Value emitX86SExtMask(CIRGenFunction &cgf, mlir::Value op,
                                    mlir::Type dstTy, mlir::Location loc) {
   unsigned numberOfElements = cast<cir::VectorType>(dstTy).getSize();
@@ -586,13 +599,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_loaddqudi128_mask:
   case X86::BI__builtin_ia32_loaddqudi256_mask:
   case X86::BI__builtin_ia32_loaddqudi512_mask:
-    llvm_unreachable("vfmaddsubph256_round_mask3 NYI");
+    return emitX86MaskedLoad(*this, Ops, llvm::Align(1),
+                             getLoc(E->getExprLoc()));
 
   case X86::BI__builtin_ia32_loadsbf16128_mask:
   case X86::BI__builtin_ia32_loadsh128_mask:
   case X86::BI__builtin_ia32_loadss128_mask:
   case X86::BI__builtin_ia32_loadsd128_mask:
-    llvm_unreachable("vfmaddsubph256_round_mask3 NYI");
+    return emitX86MaskedLoad(*this, Ops, llvm::Align(1),
+                             getLoc(E->getExprLoc()));
 
   case X86::BI__builtin_ia32_loadaps128_mask:
   case X86::BI__builtin_ia32_loadaps256_mask:
@@ -606,7 +621,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_movdqa64load128_mask:
   case X86::BI__builtin_ia32_movdqa64load256_mask:
   case X86::BI__builtin_ia32_movdqa64load512_mask:
-    llvm_unreachable("vfmaddsubph256_round_mask3 NYI");
+    return emitX86MaskedLoad(
+        *this, Ops,
+        getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign(),
+        getLoc(E->getExprLoc()));
 
   case X86::BI__builtin_ia32_expandloaddf128_mask:
   case X86::BI__builtin_ia32_expandloaddf256_mask:
```
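The helper relies on the masked-load builtins' fixed operand order. As a hedged illustration (the mapping below is inferred from clang's AVX-512 intrinsic headers, not spelled out in this commit), `_mm512_mask_loadu_epi64(__W, __U, __P)` expands to `__builtin_ia32_loaddqudi512_mask` with operands in `(pointer, pass-through, mask)` order:

```cpp
// Hedged sketch of the operand layout emitX86MaskedLoad assumes; compile
// with AVX-512 enabled (e.g. -mavx512f). Inferred from clang's intrinsic
// headers, not part of this commit.
#include <immintrin.h>

__m512i demo(__m512i passthru, __mmask8 mask, const long long *p) {
  // Lowers to __builtin_ia32_loaddqudi512_mask(p, passthru, mask), so in
  // emitX86MaskedLoad: ops[0] == p, ops[1] == passthru (which also fixes
  // the result vector type), ops[2] == mask.
  return _mm512_mask_loadu_epi64(passthru, mask, p);
}
```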

clang/test/CIR/CodeGen/X86/avx10_2bf16-builtins.c

Lines changed: 19 additions & 0 deletions
```diff
@@ -13,3 +13,22 @@ void test_mm_mask_store_sbh(void *__P, __mmask8 __U, __m128bh __A) {
   // LLVM: call void @llvm.masked.store.v8bf16.p0(<8 x bfloat> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}})
   _mm_mask_store_sbh(__P, __U, __A);
 }
+
+__m128bh test_mm_load_sbh(void const *A) {
+  // CIR-LABEL: _mm_load_sbh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.bf16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.bf16 x 8>) -> !cir.vector<!cir.bf16 x 8>
+
+  // LLVM-LABEL: @test_mm_load_sbh
+  // NOTE: OG represents the mask using a bitcast from splat (i8 1), see IR-differences #1767
+  // LLVM: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x bfloat> %{{.*}})
+  return _mm_load_sbh(A);
+}
+
+__m128bh test_mm_mask_load_sbh(__m128bh __A, __mmask8 __U, const void *__W) {
+  // CIR-LABEL: _mm_mask_load_sbh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.bf16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.bf16 x 8>) -> !cir.vector<!cir.bf16 x 8>
+
+  // LLVM-LABEL: @test_mm_mask_load_sbh
+  // LLVM: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}})
+  return _mm_mask_load_sbh(__A, __U, __W);
+}
```

clang/test/CIR/CodeGen/X86/avx512bw-builtins.c

Lines changed: 36 additions & 0 deletions
```diff
@@ -37,3 +37,39 @@ __m512i test_mm512_movm_epi16(__mmask32 __A) {
   // LLVM: %{{.*}} = sext <32 x i1> %{{.*}} to <32 x i16>
   return _mm512_movm_epi16(__A);
 }
+
+__m512i test_mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_loadu_epi8
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<{{!s8i|!u8i}} x 64>>, !u32i, !cir.vector<!cir.int<s, 1> x 64>, !cir.vector<{{!s8i|!u8i}} x 64>) -> !cir.vector<{{!s8i|!u8i}} x 64>
+
+  // LLVM-LABEL: @test_mm512_mask_loadu_epi8
+  // LLVM: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_mask_loadu_epi8(__W, __U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_loadu_epi16
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s16i x 32>>, !u32i, !cir.vector<!cir.int<s, 1> x 32>, !cir.vector<!s16i x 32>) -> !cir.vector<!s16i x 32>
+
+  // LLVM-LABEL: @test_mm512_mask_loadu_epi16
+  // LLVM: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_mask_loadu_epi16(__W, __U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) {
+  // CIR-LABEL: _mm512_maskz_loadu_epi16
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s16i x 32>>, !u32i, !cir.vector<!cir.int<s, 1> x 32>, !cir.vector<!s16i x 32>) -> !cir.vector<!s16i x 32>
+
+  // LLVM-LABEL: @test_mm512_maskz_loadu_epi16
+  // LLVM: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_maskz_loadu_epi16(__U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) {
+  // CIR-LABEL: _mm512_maskz_loadu_epi8
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<{{!s8i|!u8i}} x 64>>, !u32i, !cir.vector<!cir.int<s, 1> x 64>, !cir.vector<{{!s8i|!u8i}} x 64>) -> !cir.vector<{{!s8i|!u8i}} x 64>
+
+  // LLVM-LABEL: @test_mm512_maskz_loadu_epi8
+  // LLVM: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_maskz_loadu_epi8(__U, __P);
+}
```

clang/test/CIR/CodeGen/X86/avx512f-builtins.c

Lines changed: 168 additions & 0 deletions
```diff
@@ -82,3 +82,171 @@ void test_mm512_mask_store_ps(void *p, __m512 a, __mmask16 m){
   // LLVM: @llvm.masked.store.v16f32.p0(<16 x float> %{{.*}}, ptr %{{.*}}, i32 64, <16 x i1> %{{.*}})
   _mm512_mask_store_ps(p, m, a);
 }
+
+__m512 test_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_ps
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.float>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!cir.float x 16>) -> !cir.vector<!cir.float x 16>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_ps
+  // LLVM: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_loadu_ps (__W,__U, __P);
+}
+
+__m512 test_mm512_maskz_load_ps(__mmask16 __U, void *__P)
+{
+
+  // CIR-LABEL: _mm512_maskz_load_ps
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 16>>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!cir.float x 16>) -> !cir.vector<!cir.float x 16>
+
+  // LLVM-LABEL: test_mm512_maskz_load_ps
+  // LLVM: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_maskz_load_ps(__U, __P);
+}
+
+__m512d test_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_pd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.double>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.double x 8>) -> !cir.vector<!cir.double x 8>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_pd
+  // LLVM: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_loadu_pd (__W,__U, __P);
+}
+
+__m512d test_mm512_maskz_load_pd(__mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_maskz_load_pd
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.double x 8>) -> !cir.vector<!cir.double x 8>
+
+  // LLVM-LABEL: test_mm512_maskz_load_pd
+  // LLVM: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_maskz_load_pd(__U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_epi32
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s32i>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!s32i x 16>) -> !cir.vector<!s32i x 16>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_epi32
+  // LLVM: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_loadu_epi32 (__W,__U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi32 (__mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_maskz_loadu_epi32
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s32i>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!s32i x 16>) -> !cir.vector<!s32i x 16>
+
+  // LLVM-LABEL: test_mm512_maskz_loadu_epi32
+  // LLVM: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_maskz_loadu_epi32 (__U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_epi64
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s64i>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_loadu_epi64 (__W,__U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi64 (__mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_maskz_loadu_epi64
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s64i>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_maskz_loadu_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_maskz_loadu_epi64 (__U, __P);
+}
+
+__m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W)
+{
+  // CIR-LABEL: _mm_mask_load_ss
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 4>>, !u32i, !cir.vector<!cir.int<s, 1> x 4>, !cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
+
+  // LLVM-LABEL: test_mm_mask_load_ss
+  // LLVM: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_mask_load_ss(__A, __U, __W);
+}
+
+__m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W)
+{
+  // CIR-LABEL: _mm_maskz_load_ss
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 4>>, !u32i, !cir.vector<!cir.int<s, 1> x 4>, !cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
+
+  // LLVM-LABEL: test_mm_maskz_load_ss
+  // LLVM: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_maskz_load_ss (__U, __W);
+}
+
+__m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W)
+{
+  // CIR-LABEL: _mm_mask_load_sd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 2>>, !u32i, !cir.vector<!cir.int<s, 1> x 2>, !cir.vector<!cir.double x 2>) -> !cir.vector<!cir.double x 2>
+
+  // LLVM-LABEL: test_mm_mask_load_sd
+  // LLVM: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_mask_load_sd (__A, __U, __W);
+}
+
+__m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W)
+{
+  // CIR-LABEL: _mm_maskz_load_sd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 2>>, !u32i, !cir.vector<!cir.int<s, 1> x 2>, !cir.vector<!cir.double x 2>) -> !cir.vector<!cir.double x 2>
+
+  // LLVM-LABEL: test_mm_maskz_load_sd
+  // LLVM: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_load_sd (__U, __W);
+}
+
+__m512 test_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_load_ps
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 16>>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!cir.float x 16>) -> !cir.vector<!cir.float x 16>
+
+  // LLVM-LABEL: test_mm512_mask_load_ps
+  // LLVM: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_load_ps (__W,__U, __P);
+}
+
+__m512d test_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_load_pd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.double x 8>) -> !cir.vector<!cir.double x 8>
+
+  // LLVM-LABEL: test_mm512_mask_load_pd
+  // LLVM: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_load_pd (__W,__U, __P);
+}
+
+__m512i test_mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_load_epi32
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s32i x 16>>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!s32i x 16>) -> !cir.vector<!s32i x 16>
+
+  // LLVM-LABEL: test_mm512_mask_load_epi32
+  // LLVM: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_load_epi32(__W, __U, __P);
+}
+
+__m512i test_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_load_epi64
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s64i x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_mask_load_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_load_epi64(__W, __U, __P);
+}
+
+__m512i test_mm512_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm512_maskz_load_epi64
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s64i x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_maskz_load_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_maskz_load_epi64(__U, __P);
+}
```

clang/test/CIR/CodeGen/X86/avx512fp16-builtins.c

Lines changed: 18 additions & 0 deletions
```diff
@@ -14,3 +14,21 @@ void test_mm_mask_store_sh(void *__P, __mmask8 __U, __m128h __A) {
   // LLVM: call void @llvm.masked.store.v8f16.p0(<8 x half> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}})
   _mm_mask_store_sh(__P, __U, __A);
 }
+
+__m128h test_mm_mask_load_sh(__m128h __A, __mmask8 __U, const void *__W) {
+  // CIR-LABEL: _mm_mask_load_sh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.f16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.f16 x 8>) -> !cir.vector<!cir.f16 x 8>
+
+  // LLVM-LABEL: @test_mm_mask_load_sh
+  // LLVM: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}})
+  return _mm_mask_load_sh(__A, __U, __W);
+}
+
+__m128h test_mm_maskz_load_sh(__mmask8 __U, const void *__W) {
+  // CIR-LABEL: _mm_maskz_load_sh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.f16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.f16 x 8>) -> !cir.vector<!cir.f16 x 8>
+
+  // LLVM-LABEL: @test_mm_maskz_load_sh
+  // LLVM: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}})
+  return _mm_maskz_load_sh(__U, __W);
+}
```
