diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 77f19343653db..66869f71fbf5d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -540,32 +540,32 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   }
 
   // Now see if we can emit a target-specific builtin.
-  if (mlir::Value v = emitTargetBuiltinExpr(builtinID, e, returnValue)) {
-    switch (evalKind) {
-    case cir::TEK_Scalar:
-      if (mlir::isa<cir::VoidType>(v.getType()))
-        return RValue::get(nullptr);
-      return RValue::get(v);
-    case cir::TEK_Aggregate:
-      cgm.errorNYI(e->getSourceRange(), "aggregate return value from builtin");
-      return getUndefRValue(e->getType());
-    case cir::TEK_Complex:
-      llvm_unreachable("No current target builtin returns complex");
-    }
-    llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
+  RValue value = emitTargetBuiltinExpr(builtinID, e, returnValue);
+
+  if (value.isScalar()) {
+    if (!value.getValue() ||
+        mlir::isa<cir::VoidType>(value.getValue().getType()))
+      return RValue::getIgnored();
+
+    return value;
   }
 
-  cgm.errorNYI(e->getSourceRange(),
-               std::string("unimplemented builtin call: ") +
-                   getContext().BuiltinInfo.getName(builtinID));
-  return getUndefRValue(e->getType());
+  if (value.isAggregate()) {
+    cgm.errorNYI(e->getSourceRange(), "aggregate return value from builtin");
+    return getUndefRValue(e->getType());
+  }
+
+  if (value.isComplex()) {
+    llvm_unreachable("No current target builtin returns complex");
+  }
+
+  llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
 }
 
-static mlir::Value emitTargetArchBuiltinExpr(CIRGenFunction *cgf,
-                                             unsigned builtinID,
-                                             const CallExpr *e,
-                                             ReturnValueSlot &returnValue,
-                                             llvm::Triple::ArchType arch) {
+static std::optional<mlir::Value>
+emitTargetArchBuiltinExpr(CIRGenFunction *cgf, unsigned builtinID,
+                          const CallExpr *e, ReturnValueSlot &returnValue,
+                          llvm::Triple::ArchType arch) {
   // When compiling in HipStdPar mode we have to be conservative in rejecting
   // target specific features in the FE, and defer the possible error to the
   // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is
@@ -616,18 +616,28 @@ static mlir::Value emitTargetArchBuiltinExpr(CIRGenFunction *cgf,
   }
 }
 
-mlir::Value
-CIRGenFunction::emitTargetBuiltinExpr(unsigned builtinID, const CallExpr *e,
-                                      ReturnValueSlot &returnValue) {
+RValue CIRGenFunction::emitTargetBuiltinExpr(unsigned builtinID,
+                                             const CallExpr *e,
+                                             ReturnValueSlot &returnValue) {
+  std::optional<mlir::Value> valueOpt;
   if (getContext().BuiltinInfo.isAuxBuiltinID(builtinID)) {
     assert(getContext().getAuxTargetInfo() && "Missing aux target info");
-    return emitTargetArchBuiltinExpr(
+    valueOpt = emitTargetArchBuiltinExpr(
         this, getContext().BuiltinInfo.getAuxBuiltinID(builtinID), e,
         returnValue, getContext().getAuxTargetInfo()->getTriple().getArch());
+  } else {
+    valueOpt = emitTargetArchBuiltinExpr(this, builtinID, e, returnValue,
+                                         getTarget().getTriple().getArch());
+  }
+
+  if (!valueOpt) {
+    cgm.errorNYI(e->getSourceRange(),
+                 std::string("unimplemented builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return getUndefRValue(e->getType());
   }
 
-  return emitTargetArchBuiltinExpr(this, builtinID, e, returnValue,
-                                   getTarget().getTriple().getArch());
+  return RValue::get(*valueOpt);
 }
 
 mlir::Value CIRGenFunction::emitScalarOrConstFoldImmArg(
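The refactor above changes the contract between the generic and target-specific emitters. Below is a minimal, standalone sketch of the resulting three-state protocol; it is illustrative only (`Value`, `Outcome`, `emitArchBuiltin`, and `classify` are stand-in names, not part of the patch): the per-architecture emitter returns `std::optional<mlir::Value>`, where `std::nullopt` means the builtin is not implemented yet (the caller reports `errorNYI`), an engaged-but-null value means the builtin was emitted and produces no result, and anything else is the scalar result.

```cpp
// Illustrative sketch only: mirrors the nullopt / null-value / value
// distinction used by emitTargetBuiltinExpr above, with a stand-in Value type.
#include <cassert>
#include <optional>

struct Value {                     // stand-in for mlir::Value
  void *impl = nullptr;
  explicit operator bool() const { return impl != nullptr; }
};

enum class Outcome { NotImplemented, EmittedNoResult, EmittedValue };

// Plays the role of emitTargetArchBuiltinExpr for three hypothetical builtins.
std::optional<Value> emitArchBuiltin(int builtinID) {
  static int token;                // something for a real result to point at
  switch (builtinID) {
  case 0:
    return std::nullopt;           // unknown builtin: caller emits errorNYI
  case 1:
    return Value{};                // e.g. a prefetch: side effect, no result
  default:
    return Value{&token};          // builtin that yields a scalar value
  }
}

Outcome classify(std::optional<Value> v) {
  if (!v)
    return Outcome::NotImplemented;
  return *v ? Outcome::EmittedValue : Outcome::EmittedNoResult;
}

int main() {
  assert(classify(emitArchBuiltin(0)) == Outcome::NotImplemented);
  assert(classify(emitArchBuiltin(1)) == Outcome::EmittedNoResult);
  assert(classify(emitArchBuiltin(2)) == Outcome::EmittedValue);
  return 0;
}
```

In the old path a null `mlir::Value` meant "not handled", so an emitter could not signal "handled, but no result" by returning a null value; with the optional, `emitPrefetch` can simply `return {}` and `emitBuiltinExpr` maps that to `RValue::getIgnored()`.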
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index ee6900141647f..ad5742aef174a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -20,6 +20,11 @@
 using namespace clang;
 using namespace clang::CIRGen;
 
+/// Get integer from a mlir::Value that is an int constant or a constant op.
+static int64_t getIntValueFromConstOp(mlir::Value val) {
+  return val.getDefiningOp<cir::ConstantOp>().getIntValue().getSExtValue();
+}
+
 template <typename... Operands>
 static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e,
                                        const std::string &str,
@@ -33,6 +38,32 @@ static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e,
       .getResult();
 }
 
+static mlir::Value emitPrefetch(CIRGenFunction &cgf, unsigned builtinID,
+                                const CallExpr *e,
+                                const SmallVector<mlir::Value> &ops) {
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  mlir::Location location = cgf.getLoc(e->getExprLoc());
+  mlir::Type voidTy = builder.getVoidTy();
+  mlir::Value address = builder.createPtrBitcast(ops[0], voidTy);
+  bool isWrite{};
+  int locality{};
+
+  assert((builtinID == X86::BI_mm_prefetch || builtinID == X86::BI_m_prefetchw ||
+          builtinID == X86::BI_m_prefetch) && "Expected prefetch builtin");
+
+  if (builtinID == X86::BI_mm_prefetch) {
+    int hint = getIntValueFromConstOp(ops[1]);
+    isWrite = (hint >> 2) & 0x1;
+    locality = hint & 0x3;
+  } else {
+    isWrite = (builtinID == X86::BI_m_prefetchw);
+    locality = 0x3;
+  }
+
+  cir::PrefetchOp::create(builder, location, address, locality, isWrite);
+  return {};
+}
+
 // OG has unordered comparison as a form of optimization in addition to
 // ordered comparison, while CIR doesn't.
 //
@@ -68,8 +99,8 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
-mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
-                                               const CallExpr *expr) {
+std::optional<mlir::Value>
+CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
     cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_is");
     return {};
@@ -120,6 +151,9 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI_mm_sfence:
     return emitIntrinsicCallOp(*this, expr, "x86.sse.sfence", voidTy);
   case X86::BI_mm_prefetch:
+  case X86::BI_m_prefetch:
+  case X86::BI_m_prefetchw:
+    return emitPrefetch(*this, builtinID, expr, ops);
   case X86::BI__rdtsc:
   case X86::BI__builtin_ia32_rdtscp:
   case X86::BI__builtin_ia32_lzcnt_u16:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index b22bf2d87fc10..db1eac4116d0d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1759,9 +1759,8 @@ class CIRGenFunction : public CIRGenTypeCache {
                                      bool buildingTopLevelCase);
   mlir::LogicalResult emitSwitchStmt(const clang::SwitchStmt &s);
 
-  mlir::Value emitTargetBuiltinExpr(unsigned builtinID,
-                                    const clang::CallExpr *e,
-                                    ReturnValueSlot &returnValue);
+  RValue emitTargetBuiltinExpr(unsigned builtinID, const clang::CallExpr *e,
+                               ReturnValueSlot &returnValue);
 
   /// Given a value and its clang type, returns the value casted to its memory
   /// representation.
@@ -1801,7 +1800,8 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s);
 
-  mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e);
+  std::optional<mlir::Value> emitX86BuiltinExpr(unsigned builtinID,
+                                                const CallExpr *e);
 
   /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is
   /// nonnull, if \p LHS is marked _Nonnull.
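Before moving on to the tests, here is a standalone illustration of the hint decoding done in `emitPrefetch` above. It is illustrative only: `Hint`, `Prefetch`, and `decodeHint` are local stand-ins, and the `HINT_*` values are assumed to match the `_MM_HINT_*` macros from clang's `xmmintrin.h`. Bit 2 of the `_mm_prefetch` hint selects a write ("exclusive") prefetch and the low two bits give the temporal-locality level, which is why hint 7 lowers to `write locality(3)` while hints 0 and 3 lower to `read locality(0)` and `read locality(3)` in the tests below.

```cpp
// Standalone illustration of the _mm_prefetch hint encoding handled by
// emitPrefetch above; the HINT_* values mirror xmmintrin.h's _MM_HINT_* macros.
#include <cassert>

enum Hint { HINT_NTA = 0, HINT_T2 = 1, HINT_T1 = 2, HINT_T0 = 3,
            HINT_ET1 = 6, HINT_ET0 = 7 };

struct Prefetch { bool isWrite; int locality; };

Prefetch decodeHint(int hint) {
  return {
      static_cast<bool>((hint >> 2) & 0x1), // bit 2: exclusive/write prefetch
      hint & 0x3                            // bits 0-1: temporal locality level
  };
}

int main() {
  assert(!decodeHint(HINT_T0).isWrite && decodeHint(HINT_T0).locality == 3);
  assert(!decodeHint(HINT_NTA).isWrite && decodeHint(HINT_NTA).locality == 0);
  assert(decodeHint(HINT_ET0).isWrite && decodeHint(HINT_ET0).locality == 3);
  return 0;
}
```

`_m_prefetch` and `_m_prefetchw` take no hint argument, so the emitter hard-codes locality 3 and derives the write flag from the builtin ID alone.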
diff --git a/clang/test/CIR/CodeGen/X86/prefetchw-builtin.c b/clang/test/CIR/CodeGen/X86/prefetchw-builtin.c
new file mode 100644
index 0000000000000..7d7ce348b8d88
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/prefetchw-builtin.c
@@ -0,0 +1,36 @@
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+
+#include <x86intrin.h>
+
+void test_m_prefetch_w(void *p) {
+  // CIR-LABEL: test_m_prefetch_w
+  // LLVM-LABEL: test_m_prefetch_w
+  // OGCG-LABEL: test_m_prefetch_w
+  return _m_prefetchw(p);
+  // CIR: cir.prefetch write locality(3) %{{.*}} : !cir.ptr<!void>
+  // LLVM: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 3, i32 1)
+  // OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 3, i32 1)
+}
+
+void test_m_prefetch(void *p) {
+  // CIR-LABEL: test_m_prefetch
+  // LLVM-LABEL: test_m_prefetch
+  // OGCG-LABEL: test_m_prefetch
+  return _m_prefetch(p);
+  // CIR: cir.prefetch read locality(3) %{{.*}} : !cir.ptr<!void>
+  // LLVM: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1)
+  // OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1)
+}
diff --git a/clang/test/CIR/CodeGen/X86/sse-builtins.c b/clang/test/CIR/CodeGen/X86/sse-builtins.c
index 3a61018741958..e8203265f89f3 100644
--- a/clang/test/CIR/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/sse-builtins.c
@@ -26,3 +26,33 @@ void test_mm_sfence(void) {
   // LLVM: call void @llvm.x86.sse.sfence()
   // OGCG: call void @llvm.x86.sse.sfence()
 }
+
+void test_mm_prefetch(char const* p) {
+  // CIR-LABEL: test_mm_prefetch
+  // LLVM-LABEL: test_mm_prefetch
+  // OGCG-LABEL: test_mm_prefetch
+  _mm_prefetch(p, 0);
+  // CIR: cir.prefetch read locality(0) %{{.*}} : !cir.ptr<!void>
+  // LLVM: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 0, i32 1)
+  // OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 0, i32 1)
+}
+
+void test_mm_prefetch_local(char const* p) {
+  // CIR-LABEL: test_mm_prefetch_local
+  // LLVM-LABEL: test_mm_prefetch_local
+  // OGCG-LABEL: test_mm_prefetch_local
+  _mm_prefetch(p, 3);
+  // CIR: cir.prefetch read locality(3) %{{.*}} : !cir.ptr<!void>
+  // LLVM: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1)
+  // OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1)
+}
+
+void test_mm_prefetch_write(char const* p) {
+  // CIR-LABEL: test_mm_prefetch_write
+  // LLVM-LABEL: test_mm_prefetch_write
+  // OGCG-LABEL: test_mm_prefetch_write
+  _mm_prefetch(p, 7);
+  // CIR: cir.prefetch write locality(3) %{{.*}} : !cir.ptr<!void>
+  // LLVM: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 3, i32 1)
+  // OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 3, i32 1)
+}
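One closing note on the expected LLVM output: the operands of `llvm.prefetch.p0` are the address, a read/write flag (0 = read, 1 = write), a locality level (0 through 3), and a cache-type flag (1 = data), which is exactly what the CHECK lines above encode. The snippet below is a purely illustrative usage sketch, not part of the patch, spelling the same hints with the named `_MM_HINT_*` constants rather than the raw 0, 3, and 7 used in the tests.

```cpp
// Illustrative user code only (x86 target); the hint macros and the _m_*
// intrinsics come from xmmintrin.h / prfchwintrin.h via x86intrin.h.
#include <x86intrin.h>

void warm_lines(const char *src, char *dst) {
  _mm_prefetch(src, _MM_HINT_T0);       // read prefetch, locality 3
  _mm_prefetch(src, _MM_HINT_NTA);      // read prefetch, locality 0 (non-temporal)
  _mm_prefetch(dst, _MM_HINT_ET0);      // write prefetch, locality 3
  _m_prefetch(const_cast<char *>(src)); // read prefetch, locality 3
  _m_prefetchw(dst);                    // write prefetch, locality 3
}
```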