diff --git a/clang/lib/CIR/CodeGen/ABIInfo.h b/clang/lib/CIR/CodeGen/ABIInfo.h
index 4d03db38cabb9..ef9ade0a4a3ac 100644
--- a/clang/lib/CIR/CodeGen/ABIInfo.h
+++ b/clang/lib/CIR/CodeGen/ABIInfo.h
@@ -9,6 +9,9 @@
 #ifndef LLVM_CLANG_LIB_CIR_ABIINFO_H
 #define LLVM_CLANG_LIB_CIR_ABIINFO_H

+#include "clang/Basic/LangOptions.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+
 namespace clang::CIRGen {

 class CIRGenFunctionInfo;
@@ -23,6 +26,12 @@ class ABIInfo {
   ABIInfo(CIRGenTypes &cgt) : cgt(cgt) {}

   virtual ~ABIInfo();
+
+  /// Returns the optimal vector memory type based on the given vector type.
+  /// For example, on certain targets, a vector with 3 elements might be
+  /// promoted to one with 4 elements to improve performance.
+  virtual cir::VectorType
+  getOptimalVectorMemoryType(cir::VectorType ty, const LangOptions &opt) const;
 };

 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index fa68ad931ba74..e40fa87adeec9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -314,21 +314,36 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr,
                                        bool isInit, bool isNontemporal) {
   assert(!cir::MissingFeatures::opLoadStoreThreadLocal());

+  mlir::Type srcTy = addr.getElementType();
   if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) {
-    // Boolean vectors use `iN` as storage type.
-    if (clangVecTy->isExtVectorBoolType())
-      cgm.errorNYI(addr.getPointer().getLoc(),
-                   "emitStoreOfScalar ExtVectorBoolType");
+    if (auto vecTy = dyn_cast<cir::VectorType>(srcTy)) {
+      cir::VectorType newVecTy =
+          cgm.getTargetCIRGenInfo().getABIInfo().getOptimalVectorMemoryType(
+              vecTy, getLangOpts());
+
+      if (!clangVecTy->isPackedVectorBoolType(getContext()) &&
+          vecTy != newVecTy) {
+
+        const unsigned oldNumElements = vecTy.getSize();
+        const unsigned newNumElements = newVecTy.getSize();
+        SmallVector<mlir::Attribute> indices;
+        indices.reserve(newNumElements);
+        for (unsigned i = 0; i < newNumElements; ++i) {
+          int64_t idxValue = i < oldNumElements ? (int64_t)i : -1;
+          indices.push_back(cir::IntAttr::get(builder.getSInt64Ty(), idxValue));
+        }

-    // Handle vectors of size 3 like size 4 for better performance.
-    const mlir::Type elementType = addr.getElementType();
-    const auto vecTy = cast<cir::VectorType>(elementType);
+        cir::ConstantOp poison = builder.getConstant(
+            value.getLoc(), cir::PoisonAttr::get(value.getType()));
+        value =
+            cir::VecShuffleOp::create(builder, value.getLoc(), newVecTy, value,
+                                      poison, builder.getArrayAttr(indices));
+        srcTy = newVecTy;
+      }

-    // TODO(CIR): Use `ABIInfo::getOptimalVectorMemoryType` once it upstreamed
-    assert(!cir::MissingFeatures::cirgenABIInfo());
-    if (vecTy.getSize() == 3 && !getLangOpts().PreserveVec3Type)
-      cgm.errorNYI(addr.getPointer().getLoc(),
-                   "emitStoreOfScalar Vec3 & PreserveVec3Type disabled");
+      if (addr.getElementType() != srcTy)
+        addr = addr.withElementType(builder, srcTy);
+    }
   }

   value = emitToMemory(value, ty);
@@ -565,13 +580,31 @@ mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile,
       return nullptr;
     }

-    const auto vecTy = cast<cir::VectorType>(eltTy);
+    // Handle vector types whose memory type is wider than the value type,
+    // e.g. a 3-element vector that is loaded and stored as a 4-element one.
+    auto vecTy = cast<cir::VectorType>(eltTy);
+    cir::VectorType newVecTy =
+        cgm.getTargetCIRGenInfo().getABIInfo().getOptimalVectorMemoryType(
+            vecTy, getLangOpts());
+
+    if (vecTy != newVecTy) {
+      Address cast = addr.withElementType(builder, newVecTy);
+      mlir::Value value = builder.createLoad(cgm.getLoc(loc), cast, isVolatile);
+
+      unsigned oldNumElements = vecTy.getSize();
+      SmallVector<mlir::Attribute> indices;
+      indices.reserve(oldNumElements);
+      for (unsigned i = 0; i < oldNumElements; i++) {
+        indices.push_back(cir::IntAttr::get(builder.getSInt64Ty(), i));
+      }
+
+      cir::ConstantOp poison = builder.getConstant(
+          value.getLoc(), cir::PoisonAttr::get(value.getType()));
+      value = cir::VecShuffleOp::create(builder, cgm.getLoc(loc), vecTy, value,
+                                        poison, builder.getArrayAttr(indices));

-    // Handle vectors of size 3 like size 4 for better performance.
-    assert(!cir::MissingFeatures::cirgenABIInfo());
-    if (vecTy.getSize() == 3 && !getLangOpts().PreserveVec3Type)
-      cgm.errorNYI(addr.getPointer().getLoc(),
-                   "emitLoadOfScalar Vec3 & PreserveVec3Type disabled");
+      return value;
+    }
   }

   assert(!cir::MissingFeatures::opLoadStoreTbaa());
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp b/clang/lib/CIR/CodeGen/TargetInfo.cpp
index 62a8c59abe604..252d9e9ad48bc 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp
@@ -60,6 +60,15 @@ clang::CIRGen::createX8664TargetCIRGenInfo(CIRGenTypes &cgt) {

 ABIInfo::~ABIInfo() noexcept = default;

+cir::VectorType
+ABIInfo::getOptimalVectorMemoryType(cir::VectorType ty,
+                                    const LangOptions &opt) const {
+  // Handle vectors of size 3 like size 4 for better performance.
+  if (ty.getSize() == 3 && !opt.PreserveVec3Type)
+    return cir::VectorType::get(ty.getElementType(), 4);
+  return ty;
+}
+
 bool TargetCIRGenInfo::isNoProtoCallVariadic(
     const FunctionNoProtoType *fnType) const {
   // The following conventions are known to require this to be false:
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 58ef500446aa7..f4aab4fc6c5be 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -321,6 +321,12 @@ static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType,
         "zero expects struct, array, vector, or complex type");
   }

+  if (isa<cir::PoisonAttr>(attrType)) {
+    if (!mlir::isa<cir::VoidType>(opType))
+      return success();
+    return op->emitOpError("poison expects non-void type");
+  }
+
   if (mlir::isa<cir::BoolAttr>(attrType)) {
     if (!mlir::isa<cir::BoolType>(opType))
       return op->emitOpError("result type (")
@@ -2123,23 +2129,25 @@ OpFoldResult cir::VecShuffleOp::fold(FoldAdaptor adaptor) {
   if (!vec1Attr || !vec2Attr)
     return {};

-  mlir::Type vec1ElemTy =
-      mlir::cast<cir::VectorType>(vec1Attr.getType()).getElementType();
+  mlir::ArrayAttr indicesElts = adaptor.getIndices();
+  auto indicesEltsRange = indicesElts.getAsRange<cir::IntAttr>();
+
+  // In MLIR, a DenseElementsAttr cannot contain a poison element, so we
+  // cannot fold the shuffle op to a ConstVector if any index is -1.
+  if (std::find_if(indicesEltsRange.begin(), indicesEltsRange.end(),
+                   [](cir::IntAttr idx) { return idx.getSInt() == -1; }) !=
+      indicesEltsRange.end()) {
+    return {};
+  }

   mlir::ArrayAttr vec1Elts = vec1Attr.getElts();
   mlir::ArrayAttr vec2Elts = vec2Attr.getElts();
-  mlir::ArrayAttr indicesElts = adaptor.getIndices();

   SmallVector<mlir::Attribute, 16> elements;
   elements.reserve(indicesElts.size());

   uint64_t vec1Size = vec1Elts.size();
-  for (const auto &idxAttr : indicesElts.getAsRange<cir::IntAttr>()) {
-    if (idxAttr.getSInt() == -1) {
-      elements.push_back(cir::UndefAttr::get(vec1ElemTy));
-      continue;
-    }
-
+  for (const auto &idxAttr : indicesEltsRange) {
     uint64_t idxValue = idxAttr.getUInt();
     elements.push_back(idxValue < vec1Size ? vec1Elts[idxValue]
                                            : vec2Elts[idxValue - vec1Size]);
diff --git a/clang/test/CIR/CodeGen/vector-with-size-3.cpp b/clang/test/CIR/CodeGen/vector-with-size-3.cpp
new file mode 100644
index 0000000000000..2108d3cefb8c5
--- /dev/null
+++ b/clang/test/CIR/CodeGen/vector-with-size-3.cpp
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+typedef int vi3 __attribute__((ext_vector_type(3)));
+
+void store_load() {
+  vi3 a;
+  vi3 b = a;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<3 x !s32i>, !cir.ptr<!cir.vector<3 x !s32i>>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<3 x !s32i>, !cir.ptr<!cir.vector<3 x !s32i>>, ["b", init]
+// CIR: %[[A_V4:.*]] = cir.cast(bitcast, %[[A_ADDR]] : !cir.ptr<!cir.vector<3 x !s32i>>), !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[TMP_A_V4:.*]] = cir.load{{.*}} %[[A_V4]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[POISON:.*]] = cir.const #cir.poison : !cir.vector<4 x !s32i>
+// CIR: %[[SHUFFLE_V4:.*]] = cir.vec.shuffle(%[[TMP_A_V4]], %[[POISON]] : !cir.vector<4 x !s32i>) [#cir.int<0> : !s64i, #cir.int<1> : !s64i, #cir.int<2> : !s64i] : !cir.vector<3 x !s32i>
+// CIR: %[[POISON:.*]] = cir.const #cir.poison : !cir.vector<3 x !s32i>
+// CIR: %[[SHUFFLE_V3:.*]] = cir.vec.shuffle(%[[SHUFFLE_V4]], %[[POISON]] : !cir.vector<3 x !s32i>) [#cir.int<0> : !s64i, #cir.int<1> : !s64i, #cir.int<2> : !s64i, #cir.int<-1> : !s64i] : !cir.vector<4 x !s32i>
+// CIR: %[[TMP_B_V4:.*]] = cir.cast(bitcast, %[[B_ADDR]] : !cir.ptr<!cir.vector<3 x !s32i>>), !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: cir.store{{.*}} %[[SHUFFLE_V3]], %[[TMP_B_V4]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+
+// LLVM: %[[A_ADDR:.*]] = alloca <3 x i32>, i64 1, align 16
+// LLVM: %[[B_ADDR:.*]] = alloca <3 x i32>, i64 1, align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: %[[SHUFFLE_V4:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// LLVM: %[[SHUFFLE_V3:.*]] = shufflevector <3 x i32> %[[SHUFFLE_V4]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// LLVM: store <4 x i32> %[[SHUFFLE_V3]], ptr %[[B_ADDR]], align 16
+
+// OGCG: %[[A_ADDR:.*]] = alloca <3 x i32>, align 16
+// OGCG: %[[B_ADDR:.*]] = alloca <3 x i32>, align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: %[[SHUFFLE_V4:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// OGCG: %[[SHUFFLE_V3:.*]] = shufflevector <3 x i32> %[[SHUFFLE_V4]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// OGCG: store <4 x i32> %[[SHUFFLE_V3]], ptr %[[B_ADDR]], align 16
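
For illustration only (not part of the patch): the store path widens an
N-element value to the wider memory type with a mask whose tail lanes are -1
(poison), and the load path shuffles the leading N lanes back out of the wider
loaded value. A minimal standalone C++ sketch of that mask construction, with
hypothetical helper names:

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Store path: widen an oldN-lane value to newN lanes; lanes past oldN are
  // -1, i.e. poison.
  static std::vector<int64_t> widenMask(unsigned oldN, unsigned newN) {
    std::vector<int64_t> mask;
    mask.reserve(newN);
    for (unsigned i = 0; i < newN; ++i)
      mask.push_back(i < oldN ? static_cast<int64_t>(i) : -1);
    return mask;
  }

  // Load path: shuffle the leading oldN lanes back out of the widened value.
  static std::vector<int64_t> truncateMask(unsigned oldN) {
    std::vector<int64_t> mask;
    mask.reserve(oldN);
    for (unsigned i = 0; i < oldN; ++i)
      mask.push_back(i);
    return mask;
  }

  int main() {
    assert((widenMask(3, 4) == std::vector<int64_t>{0, 1, 2, -1}));
    assert((truncateMask(3) == std::vector<int64_t>{0, 1, 2}));
  }

These are exactly the masks checked in the test above: [0, 1, 2, -1] for the
v3-to-v4 store and [0, 1, 2] for the v4-to-v3 load.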
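Similarly, a standalone sketch of the fold guard in VecShuffleOp::fold (the
canFoldShuffle helper is hypothetical): since a DenseElementsAttr cannot
represent a poison element, folding must be skipped as soon as any shuffle
index is -1, not when some index differs from -1:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Foldable only when every lane names a real source element; a -1 lane is
  // poison and has no constant representation.
  static bool canFoldShuffle(const std::vector<int64_t> &indices) {
    return std::none_of(indices.begin(), indices.end(),
                        [](int64_t idx) { return idx == -1; });
  }

  int main() {
    assert(canFoldShuffle({0, 1, 2}));   // all lanes defined: fold
    assert(!canFoldShuffle({0, 1, -1})); // poison lane: bail out
  }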